]> code.delx.au - gnu-emacs/blob - lisp/cedet/semantic/lex.el
cc7a9ca8885810519eabcf2f62c1165b91b19bea
[gnu-emacs] / lisp / cedet / semantic / lex.el
1 ;;; semantic/lex.el --- Lexical Analyzer builder
2
3 ;; Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
4 ;; 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
5
6 ;; Author: Eric M. Ludlam <zappo@gnu.org>
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24 ;;
25 ;; This file handles the creation of lexical analyzers for different
26 ;; languages in Emacs Lisp. The purpose of a lexical analyzer is to
27 ;; convert a buffer into a list of lexical tokens. Each token
28 ;; contains the token class (such as 'number, 'symbol, 'IF, etc) and
29 ;; the location in the buffer it was found. Optionally, a token also
30 ;; contains a string representing what is at the designated buffer
31 ;; location.
32 ;;
33 ;; Tokens are pushed onto a token stream, which is basically a list of
34 ;; all the lexical tokens from the analyzed region. The token stream
35 ;; is then handed to the grammar which parsers the file.
36 ;;
37 ;;; How it works
38 ;;
39 ;; Each analyzer specifies a condition and forms. These conditions
40 ;; and forms are assembled into a function by `define-lex' that does
41 ;; the lexical analysis.
42 ;;
43 ;; In the lexical analyzer created with `define-lex', each condition
44 ;; is tested for a given point. When the condition is true, the forms
45 ;; run.
46 ;;
47 ;; The forms can push a lexical token onto the token stream. The
48 ;; analyzer forms also must move the current analyzer point. If the
49 ;; analyzer point is moved without pushing a token, then the matched
50 ;; syntax is effectively ignored, or skipped.
51 ;;
52 ;; Thus, starting at the beginning of a region to be analyzed, each
53 ;; condition is tested. One will match, and a lexical token might be
54 ;; pushed, and the point is moved to the end of the lexical token
55 ;; identified. At the new position, the process occurs again until
56 ;; the end of the specified region is reached.
57 ;;
58 ;;; How to use semantic-lex
59 ;;
60 ;; To create a lexer for a language, use the `define-lex' macro.
61 ;;
62 ;; The `define-lex' macro accepts a list of lexical analyzers. Each
63 ;; analyzer is created with `define-lex-analyzer', or one of the
64 ;; derivative macros. A single analyzer defines a regular expression
65 ;; to match text in a buffer, and a short segment of code to create
66 ;; one lexical token.
67 ;;
68 ;; Each analyzer has a NAME, DOC, a CONDITION, and possibly some
69 ;; FORMS. The NAME is the name used in `define-lex'. The DOC
70 ;; describes what the analyzer should do.
71 ;;
72 ;; The CONDITION evaluates the text at the current point in the
73 ;; current buffer. If CONDITION is true, then the FORMS will be
74 ;; executed.
75 ;;
76 ;; The purpose of the FORMS is to push new lexical tokens onto the
77 ;; list of tokens for the current buffer, and to move point after the
78 ;; matched text.
79 ;;
80 ;; Some macros for creating one analyzer are:
81 ;;
82 ;; define-lex-analyzer - A generic analyzer associating any style of
83 ;; condition to forms.
84 ;; define-lex-regex-analyzer - Matches a regular expression.
85 ;; define-lex-simple-regex-analyzer - Matches a regular expressions,
86 ;; and pushes the match.
87 ;; define-lex-block-analyzer - Matches list syntax, and defines
88 ;; handles open/close delimiters.
89 ;;
90 ;; These macros are used by the grammar compiler when lexical
91 ;; information is specified in a grammar:
92 ;; define-lex- * -type-analyzer - Matches syntax specified in
93 ;; a grammar, and pushes one token for it. The * would
94 ;; be `sexp' for things like lists or strings, and
95 ;; `string' for things that need to match some special
96 ;; string, such as "\\." where a literal match is needed.
97 ;;
98 ;;; Lexical Tables
99 ;;
100 ;; There are tables of different symbols managed in semantic-lex.el.
101 ;; They are:
102 ;;
103 ;; Lexical keyword table - A Table of symbols declared in a grammar
104 ;; file with the %keyword declaration.
105 ;; Keywords are used by `semantic-lex-symbol-or-keyword'
106 ;; to create lexical tokens based on the keyword.
107 ;;
108 ;; Lexical type table - A table of symbols declared in a grammar
109 ;; file with the %type declaration.
110 ;; The grammar compiler uses the type table to create new
111 ;; lexical analyzers. These analyzers are then used to when
112 ;; a new lexical analyzer is made for a language.
113 ;;
114 ;;; Lexical Types
115 ;;
116 ;; A lexical type defines a kind of lexical analyzer that will be
117 ;; automatically generated from a grammar file based on some
118 ;; predetermined attributes. For now these two attributes are
119 ;; recognized :
120 ;;
121 ;; * matchdatatype : define the kind of lexical analyzer. That is :
122 ;;
123 ;; - regexp : define a regexp analyzer (see
124 ;; `define-lex-regex-type-analyzer')
125 ;;
126 ;; - string : define a string analyzer (see
127 ;; `define-lex-string-type-analyzer')
128 ;;
129 ;; - block : define a block type analyzer (see
130 ;; `define-lex-block-type-analyzer')
131 ;;
132 ;; - sexp : define a sexp analyzer (see
133 ;; `define-lex-sexp-type-analyzer')
134 ;;
135 ;; - keyword : define a keyword analyzer (see
136 ;; `define-lex-keyword-type-analyzer')
137 ;;
138 ;; * syntax : define the syntax that matches a syntactic
139 ;; expression. When syntax is matched the corresponding type
140 ;; analyzer is entered and the resulting match data will be
141 ;; interpreted based on the kind of analyzer (see matchdatatype
142 ;; above).
143 ;;
144 ;; The following lexical types are predefined :
145 ;;
146 ;; +-------------+---------------+--------------------------------+
147 ;; | type | matchdatatype | syntax |
148 ;; +-------------+---------------+--------------------------------+
149 ;; | punctuation | string | "\\(\\s.\\|\\s$\\|\\s'\\)+" |
150 ;; | keyword | keyword | "\\(\\sw\\|\\s_\\)+" |
151 ;; | symbol | regexp | "\\(\\sw\\|\\s_\\)+" |
152 ;; | string | sexp | "\\s\"" |
153 ;; | number | regexp | semantic-lex-number-expression |
154 ;; | block | block | "\\s(\\|\\s)" |
155 ;; +-------------+---------------+--------------------------------+
156 ;;
157 ;; In a grammar you must use a %type expression to automatically generate
158 ;; the corresponding analyzers of that type.
159 ;;
160 ;; Here is an example to auto-generate punctuation analyzers
161 ;; with 'matchdatatype and 'syntax predefined (see table above)
162 ;;
163 ;; %type <punctuation> ;; will auto-generate this kind of analyzers
164 ;;
165 ;; It is equivalent to write :
166 ;;
167 ;; %type <punctuation> syntax "\\(\\s.\\|\\s$\\|\\s'\\)+" matchdatatype string
168 ;;
169 ;; ;; Some punctuations based on the type defines above
170 ;;
171 ;; %token <punctuation> NOT "!"
172 ;; %token <punctuation> NOTEQ "!="
173 ;; %token <punctuation> MOD "%"
174 ;; %token <punctuation> MODEQ "%="
175 ;;
176
177 ;;; On the Semantic 1.x lexer
178 ;;
179 ;; In semantic 1.x, the lexical analyzer was an all purpose routine.
180 ;; To boost efficiency, the analyzer is now a series of routines that
181 ;; are constructed at build time into a single routine. This will
182 ;; eliminate unneeded if statements to speed the lexer.
183
184 (require 'semantic/fw)
185
186 ;;; Code:
187
188 ;;; Semantic 2.x lexical analysis
189 ;;
190 (defun semantic-lex-map-symbols (fun table &optional property)
191 "Call function FUN on every symbol in TABLE.
192 If optional PROPERTY is non-nil, call FUN only on every symbol which
193 as a PROPERTY value. FUN receives a symbol as argument."
194 (if (arrayp table)
195 (mapatoms
196 #'(lambda (symbol)
197 (if (or (null property) (get symbol property))
198 (funcall fun symbol)))
199 table)))
200
201 ;;; Lexical keyword table handling.
202 ;;
203 ;; These keywords are keywords defined for using in a grammar with the
204 ;; %keyword declaration, and are not keywords used in Emacs Lisp.
205
206 (defvar semantic-flex-keywords-obarray nil
207 "Buffer local keyword obarray for the lexical analyzer.
208 These keywords are matched explicitly, and converted into special symbols.")
209 (make-variable-buffer-local 'semantic-flex-keywords-obarray)
210
211 (defmacro semantic-lex-keyword-invalid (name)
212 "Signal that NAME is an invalid keyword name."
213 `(signal 'wrong-type-argument '(semantic-lex-keyword-p ,name)))
214
215 (defsubst semantic-lex-keyword-symbol (name)
216 "Return keyword symbol with NAME or nil if not found."
217 (and (arrayp semantic-flex-keywords-obarray)
218 (stringp name)
219 (intern-soft name semantic-flex-keywords-obarray)))
220
221 (defsubst semantic-lex-keyword-p (name)
222 "Return non-nil if a keyword with NAME exists in the keyword table.
223 Return nil otherwise."
224 (and (setq name (semantic-lex-keyword-symbol name))
225 (symbol-value name)))
226
227 (defsubst semantic-lex-keyword-set (name value)
228 "Set value of keyword with NAME to VALUE and return VALUE."
229 (set (intern name semantic-flex-keywords-obarray) value))
230
231 (defsubst semantic-lex-keyword-value (name)
232 "Return value of keyword with NAME.
233 Signal an error if a keyword with NAME does not exist."
234 (let ((keyword (semantic-lex-keyword-symbol name)))
235 (if keyword
236 (symbol-value keyword)
237 (semantic-lex-keyword-invalid name))))
238
239 (defsubst semantic-lex-keyword-put (name property value)
240 "For keyword with NAME, set its PROPERTY to VALUE."
241 (let ((keyword (semantic-lex-keyword-symbol name)))
242 (if keyword
243 (put keyword property value)
244 (semantic-lex-keyword-invalid name))))
245
246 (defsubst semantic-lex-keyword-get (name property)
247 "For keyword with NAME, return its PROPERTY value."
248 (let ((keyword (semantic-lex-keyword-symbol name)))
249 (if keyword
250 (get keyword property)
251 (semantic-lex-keyword-invalid name))))
252
253 (defun semantic-lex-make-keyword-table (specs &optional propspecs)
254 "Convert keyword SPECS into an obarray and return it.
255 SPECS must be a list of (NAME . TOKSYM) elements, where:
256
257 NAME is the name of the keyword symbol to define.
258 TOKSYM is the lexical token symbol of that keyword.
259
260 If optional argument PROPSPECS is non nil, then interpret it, and
261 apply those properties.
262 PROPSPECS must be a list of (NAME PROPERTY VALUE) elements."
263 ;; Create the symbol hash table
264 (let ((semantic-flex-keywords-obarray (make-vector 13 0))
265 spec)
266 ;; fill it with stuff
267 (while specs
268 (setq spec (car specs)
269 specs (cdr specs))
270 (semantic-lex-keyword-set (car spec) (cdr spec)))
271 ;; Apply all properties
272 (while propspecs
273 (setq spec (car propspecs)
274 propspecs (cdr propspecs))
275 (semantic-lex-keyword-put (car spec) (nth 1 spec) (nth 2 spec)))
276 semantic-flex-keywords-obarray))
277
278 (defsubst semantic-lex-map-keywords (fun &optional property)
279 "Call function FUN on every lexical keyword.
280 If optional PROPERTY is non-nil, call FUN only on every keyword which
281 as a PROPERTY value. FUN receives a lexical keyword as argument."
282 (semantic-lex-map-symbols
283 fun semantic-flex-keywords-obarray property))
284
285 (defun semantic-lex-keywords (&optional property)
286 "Return a list of lexical keywords.
287 If optional PROPERTY is non-nil, return only keywords which have a
288 PROPERTY set."
289 (let (keywords)
290 (semantic-lex-map-keywords
291 #'(lambda (symbol) (setq keywords (cons symbol keywords)))
292 property)
293 keywords))
294
295 ;;; Inline functions:
296
297 (defvar semantic-lex-unterminated-syntax-end-function)
298 (defvar semantic-lex-analysis-bounds)
299 (defvar semantic-lex-end-point)
300
301 (defsubst semantic-lex-token-bounds (token)
302 "Fetch the start and end locations of the lexical token TOKEN.
303 Return a pair (START . END)."
304 (if (not (numberp (car (cdr token))))
305 (cdr (cdr token))
306 (cdr token)))
307
308 (defsubst semantic-lex-token-start (token)
309 "Fetch the start position of the lexical token TOKEN.
310 See also the function `semantic-lex-token'."
311 (car (semantic-lex-token-bounds token)))
312
313 (defsubst semantic-lex-token-end (token)
314 "Fetch the end position of the lexical token TOKEN.
315 See also the function `semantic-lex-token'."
316 (cdr (semantic-lex-token-bounds token)))
317
318 (defsubst semantic-lex-unterminated-syntax-detected (syntax)
319 "Inside a lexical analyzer, use this when unterminated syntax was found.
320 Argument SYNTAX indicates the type of syntax that is unterminated.
321 The job of this function is to move (point) to a new logical location
322 so that analysis can continue, if possible."
323 (goto-char
324 (funcall semantic-lex-unterminated-syntax-end-function
325 syntax
326 (car semantic-lex-analysis-bounds)
327 (cdr semantic-lex-analysis-bounds)
328 ))
329 (setq semantic-lex-end-point (point)))
330 \f
331 ;;; Type table handling.
332 ;;
333 ;; The lexical type table manages types that occur in a grammar file
334 ;; with the %type declaration. Types represent different syntaxes.
335 ;; See code for `semantic-lex-preset-default-types' for the classic
336 ;; types of syntax.
337 (defvar semantic-lex-types-obarray nil
338 "Buffer local types obarray for the lexical analyzer.")
339 (make-variable-buffer-local 'semantic-lex-types-obarray)
340
341 (defmacro semantic-lex-type-invalid (type)
342 "Signal that TYPE is an invalid lexical type name."
343 `(signal 'wrong-type-argument '(semantic-lex-type-p ,type)))
344
345 (defsubst semantic-lex-type-symbol (type)
346 "Return symbol with TYPE or nil if not found."
347 (and (arrayp semantic-lex-types-obarray)
348 (stringp type)
349 (intern-soft type semantic-lex-types-obarray)))
350
351 (defsubst semantic-lex-type-p (type)
352 "Return non-nil if a symbol with TYPE name exists."
353 (and (setq type (semantic-lex-type-symbol type))
354 (symbol-value type)))
355
356 (defsubst semantic-lex-type-set (type value)
357 "Set value of symbol with TYPE name to VALUE and return VALUE."
358 (set (intern type semantic-lex-types-obarray) value))
359
360 (defsubst semantic-lex-type-value (type &optional noerror)
361 "Return value of symbol with TYPE name.
362 If optional argument NOERROR is non-nil return nil if a symbol with
363 TYPE name does not exist. Otherwise signal an error."
364 (let ((sym (semantic-lex-type-symbol type)))
365 (if sym
366 (symbol-value sym)
367 (unless noerror
368 (semantic-lex-type-invalid type)))))
369
370 (defsubst semantic-lex-type-put (type property value &optional add)
371 "For symbol with TYPE name, set its PROPERTY to VALUE.
372 If optional argument ADD is non-nil, create a new symbol with TYPE
373 name if it does not already exist. Otherwise signal an error."
374 (let ((sym (semantic-lex-type-symbol type)))
375 (unless sym
376 (or add (semantic-lex-type-invalid type))
377 (semantic-lex-type-set type nil)
378 (setq sym (semantic-lex-type-symbol type)))
379 (put sym property value)))
380
381 (defsubst semantic-lex-type-get (type property &optional noerror)
382 "For symbol with TYPE name, return its PROPERTY value.
383 If optional argument NOERROR is non-nil return nil if a symbol with
384 TYPE name does not exist. Otherwise signal an error."
385 (let ((sym (semantic-lex-type-symbol type)))
386 (if sym
387 (get sym property)
388 (unless noerror
389 (semantic-lex-type-invalid type)))))
390
391 (defun semantic-lex-preset-default-types ()
392 "Install useful default properties for well known types."
393 (semantic-lex-type-put "punctuation" 'matchdatatype 'string t)
394 (semantic-lex-type-put "punctuation" 'syntax "\\(\\s.\\|\\s$\\|\\s'\\)+")
395 (semantic-lex-type-put "keyword" 'matchdatatype 'keyword t)
396 (semantic-lex-type-put "keyword" 'syntax "\\(\\sw\\|\\s_\\)+")
397 (semantic-lex-type-put "symbol" 'matchdatatype 'regexp t)
398 (semantic-lex-type-put "symbol" 'syntax "\\(\\sw\\|\\s_\\)+")
399 (semantic-lex-type-put "string" 'matchdatatype 'sexp t)
400 (semantic-lex-type-put "string" 'syntax "\\s\"")
401 (semantic-lex-type-put "number" 'matchdatatype 'regexp t)
402 (semantic-lex-type-put "number" 'syntax 'semantic-lex-number-expression)
403 (semantic-lex-type-put "block" 'matchdatatype 'block t)
404 (semantic-lex-type-put "block" 'syntax "\\s(\\|\\s)")
405 )
406
407 (defun semantic-lex-make-type-table (specs &optional propspecs)
408 "Convert type SPECS into an obarray and return it.
409 SPECS must be a list of (TYPE . TOKENS) elements, where:
410
411 TYPE is the name of the type symbol to define.
412 TOKENS is an list of (TOKSYM . MATCHER) elements, where:
413
414 TOKSYM is any lexical token symbol.
415 MATCHER is a string or regexp a text must match to be a such
416 lexical token.
417
418 If optional argument PROPSPECS is non nil, then interpret it, and
419 apply those properties.
420 PROPSPECS must be a list of (TYPE PROPERTY VALUE)."
421 ;; Create the symbol hash table
422 (let* ((semantic-lex-types-obarray (make-vector 13 0))
423 spec type tokens token alist default)
424 ;; fill it with stuff
425 (while specs
426 (setq spec (car specs)
427 specs (cdr specs)
428 type (car spec)
429 tokens (cdr spec)
430 default nil
431 alist nil)
432 (while tokens
433 (setq token (car tokens)
434 tokens (cdr tokens))
435 (if (cdr token)
436 (setq alist (cons token alist))
437 (setq token (car token))
438 (if default
439 (message
440 "*Warning* default value of <%s> tokens changed to %S, was %S"
441 type default token))
442 (setq default token)))
443 ;; Ensure the default matching spec is the first one.
444 (semantic-lex-type-set type (cons default (nreverse alist))))
445 ;; Install useful default types & properties
446 (semantic-lex-preset-default-types)
447 ;; Apply all properties
448 (while propspecs
449 (setq spec (car propspecs)
450 propspecs (cdr propspecs))
451 ;; Create the type if necessary.
452 (semantic-lex-type-put (car spec) (nth 1 spec) (nth 2 spec) t))
453 semantic-lex-types-obarray))
454
455 (defsubst semantic-lex-map-types (fun &optional property)
456 "Call function FUN on every lexical type.
457 If optional PROPERTY is non-nil, call FUN only on every type symbol
458 which as a PROPERTY value. FUN receives a type symbol as argument."
459 (semantic-lex-map-symbols
460 fun semantic-lex-types-obarray property))
461
462 (defun semantic-lex-types (&optional property)
463 "Return a list of lexical type symbols.
464 If optional PROPERTY is non-nil, return only type symbols which have
465 PROPERTY set."
466 (let (types)
467 (semantic-lex-map-types
468 #'(lambda (symbol) (setq types (cons symbol types)))
469 property)
470 types))
471 \f
472 ;;; Lexical Analyzer framework settings
473 ;;
474
475 (defvar semantic-lex-analyzer 'semantic-flex
476 "The lexical analyzer used for a given buffer.
477 See `semantic-lex' for documentation.
478 For compatibility with Semantic 1.x it defaults to `semantic-flex'.")
479 (make-variable-buffer-local 'semantic-lex-analyzer)
480
481 (defvar semantic-lex-tokens
482 '(
483 (bol)
484 (charquote)
485 (close-paren)
486 (comment)
487 (newline)
488 (open-paren)
489 (punctuation)
490 (semantic-list)
491 (string)
492 (symbol)
493 (whitespace)
494 )
495 "An alist of of semantic token types.
496 As of December 2001 (semantic 1.4beta13), this variable is not used in
497 any code. The only use is to refer to the doc-string from elsewhere.
498
499 The key to this alist is the symbol representing token type that
500 \\[semantic-flex] returns. These are
501
502 - bol: Empty string matching a beginning of line.
503 This token is produced with
504 `semantic-lex-beginning-of-line'.
505
506 - charquote: String sequences that match `\\s\\+' regexp.
507 This token is produced with `semantic-lex-charquote'.
508
509 - close-paren: Characters that match `\\s)' regexp.
510 These are typically `)', `}', `]', etc.
511 This token is produced with
512 `semantic-lex-close-paren'.
513
514 - comment: A comment chunk. These token types are not
515 produced by default.
516 This token is produced with `semantic-lex-comments'.
517 Comments are ignored with `semantic-lex-ignore-comments'.
518 Comments are treated as whitespace with
519 `semantic-lex-comments-as-whitespace'.
520
521 - newline Characters matching `\\s-*\\(\n\\|\\s>\\)' regexp.
522 This token is produced with `semantic-lex-newline'.
523
524 - open-paren: Characters that match `\\s(' regexp.
525 These are typically `(', `{', `[', etc.
526 If `semantic-lex-paren-or-list' is used,
527 then `open-paren' is not usually generated unless
528 the `depth' argument to \\[semantic-lex] is
529 greater than 0.
530 This token is always produced if the analyzer
531 `semantic-lex-open-paren' is used.
532
533 - punctuation: Characters matching `{\\(\\s.\\|\\s$\\|\\s'\\)'
534 regexp.
535 This token is produced with `semantic-lex-punctuation'.
536 Always specify this analyzer after the comment
537 analyzer.
538
539 - semantic-list: String delimited by matching parenthesis, braces,
540 etc. that the lexer skipped over, because the
541 `depth' parameter to \\[semantic-flex] was not high
542 enough.
543 This token is produced with `semantic-lex-paren-or-list'.
544
545 - string: Quoted strings, i.e., string sequences that start
546 and end with characters matching `\\s\"'
547 regexp. The lexer relies on @code{forward-sexp} to
548 find the matching end.
549 This token is produced with `semantic-lex-string'.
550
551 - symbol: String sequences that match `\\(\\sw\\|\\s_\\)+'
552 regexp.
553 This token is produced with
554 `semantic-lex-symbol-or-keyword'. Always add this analyzer
555 after `semantic-lex-number', or other analyzers that
556 match its regular expression.
557
558 - whitespace: Characters that match `\\s-+' regexp.
559 This token is produced with `semantic-lex-whitespace'.")
560
561 (defvar semantic-lex-syntax-modifications nil
562 "Changes to the syntax table for this buffer.
563 These changes are active only while the buffer is being flexed.
564 This is a list where each element has the form:
565 (CHAR CLASS)
566 CHAR is the char passed to `modify-syntax-entry',
567 and CLASS is the string also passed to `modify-syntax-entry' to define
568 what syntax class CHAR has.")
569 (make-variable-buffer-local 'semantic-lex-syntax-modifications)
570
571 (defvar semantic-lex-syntax-table nil
572 "Syntax table used by lexical analysis.
573 See also `semantic-lex-syntax-modifications'.")
574 (make-variable-buffer-local 'semantic-lex-syntax-table)
575
576 (defvar semantic-lex-comment-regex nil
577 "Regular expression for identifying comment start during lexical analysis.
578 This may be automatically set when semantic initializes in a mode, but
579 may need to be overridden for some special languages.")
580 (make-variable-buffer-local 'semantic-lex-comment-regex)
581
582 (defvar semantic-lex-number-expression
583 ;; This expression was written by David Ponce for Java, and copied
584 ;; here for C and any other similar language.
585 (eval-when-compile
586 (concat "\\("
587 "\\<[0-9]+[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
588 "\\|"
589 "\\<[0-9]+[.][eE][-+]?[0-9]+[fFdD]?\\>"
590 "\\|"
591 "\\<[0-9]+[.][fFdD]\\>"
592 "\\|"
593 "\\<[0-9]+[.]"
594 "\\|"
595 "[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
596 "\\|"
597 "\\<[0-9]+[eE][-+]?[0-9]+[fFdD]?\\>"
598 "\\|"
599 "\\<0[xX][0-9a-fA-F]+[lL]?\\>"
600 "\\|"
601 "\\<[0-9]+[lLfFdD]?\\>"
602 "\\)"
603 ))
604 "Regular expression for matching a number.
605 If this value is nil, no number extraction is done during lex.
606 This expression tries to match C and Java like numbers.
607
608 DECIMAL_LITERAL:
609 [1-9][0-9]*
610 ;
611 HEX_LITERAL:
612 0[xX][0-9a-fA-F]+
613 ;
614 OCTAL_LITERAL:
615 0[0-7]*
616 ;
617 INTEGER_LITERAL:
618 <DECIMAL_LITERAL>[lL]?
619 | <HEX_LITERAL>[lL]?
620 | <OCTAL_LITERAL>[lL]?
621 ;
622 EXPONENT:
623 [eE][+-]?[09]+
624 ;
625 FLOATING_POINT_LITERAL:
626 [0-9]+[.][0-9]*<EXPONENT>?[fFdD]?
627 | [.][0-9]+<EXPONENT>?[fFdD]?
628 | [0-9]+<EXPONENT>[fFdD]?
629 | [0-9]+<EXPONENT>?[fFdD]
630 ;")
631 (make-variable-buffer-local 'semantic-lex-number-expression)
632
633 (defvar semantic-lex-depth 0
634 "Default lexing depth.
635 This specifies how many lists to create tokens in.")
636 (make-variable-buffer-local 'semantic-lex-depth)
637
638 (defvar semantic-lex-unterminated-syntax-end-function
639 (lambda (syntax syntax-start lex-end) lex-end)
640 "Function called when unterminated syntax is encountered.
641 This should be set to one function. That function should take three
642 parameters. The SYNTAX, or type of syntax which is unterminated.
643 SYNTAX-START where the broken syntax begins.
644 LEX-END is where the lexical analysis was asked to end.
645 This function can be used for languages that can intelligently fix up
646 broken syntax, or the exit lexical analysis via `throw' or `signal'
647 when finding unterminated syntax.")
648
649 ;;; Interactive testing commands
650
651 (declare-function semantic-elapsed-time "semantic")
652
653 (defun semantic-lex-test (arg)
654 "Test the semantic lexer in the current buffer.
655 If universal argument ARG, then try the whole buffer."
656 (interactive "P")
657 (require 'semantic)
658 (let* ((start (current-time))
659 (result (semantic-lex
660 (if arg (point-min) (point))
661 (point-max)))
662 (end (current-time)))
663 (message "Elapsed Time: %.2f seconds."
664 (semantic-elapsed-time start end))
665 (pop-to-buffer "*Lexer Output*")
666 (require 'pp)
667 (erase-buffer)
668 (insert (pp-to-string result))
669 (goto-char (point-min))
670 ))
671
672 (defvar semantic-lex-debug nil
673 "When non-nil, debug the local lexical analyzer.")
674
675 (defun semantic-lex-debug (arg)
676 "Debug the semantic lexer in the current buffer.
677 Argument ARG specifies of the analyze the whole buffer, or start at point.
678 While engaged, each token identified by the lexer will be highlighted
679 in the target buffer A description of the current token will be
680 displayed in the minibuffer. Press SPC to move to the next lexical token."
681 (interactive "P")
682 (require 'semantic/debug)
683 (let ((semantic-lex-debug t))
684 (semantic-lex-test arg)))
685
686 (defun semantic-lex-highlight-token (token)
687 "Highlight the lexical TOKEN.
688 TOKEN is a lexical token with a START And END position.
689 Return the overlay."
690 (let ((o (semantic-make-overlay (semantic-lex-token-start token)
691 (semantic-lex-token-end token))))
692 (semantic-overlay-put o 'face 'highlight)
693 o))
694
695 (defsubst semantic-lex-debug-break (token)
696 "Break during lexical analysis at TOKEN."
697 (when semantic-lex-debug
698 (let ((o nil))
699 (unwind-protect
700 (progn
701 (when token
702 (setq o (semantic-lex-highlight-token token)))
703 (semantic-read-event
704 (format "%S :: SPC - continue" token))
705 )
706 (when o
707 (semantic-overlay-delete o))))))
708
709 ;;; Lexical analyzer creation
710 ;;
711 ;; Code for creating a lex function from lists of analyzers.
712 ;;
713 ;; A lexical analyzer is created from a list of individual analyzers.
714 ;; Each individual analyzer specifies a single match, and code that
715 ;; goes with it.
716 ;;
717 ;; Creation of an analyzer assembles these analyzers into a new function
718 ;; with the behaviors of all the individual analyzers.
719 ;;
720 (defmacro semantic-lex-one-token (analyzers)
721 "Calculate one token from the current buffer at point.
722 Uses locally bound variables from `define-lex'.
723 Argument ANALYZERS is the list of analyzers being used."
724 (cons 'cond (mapcar #'symbol-value analyzers)))
725
726 (defvar semantic-lex-end-point nil
727 "The end point as tracked through lexical functions.")
728
729 (defvar semantic-lex-current-depth nil
730 "The current depth as tracked through lexical functions.")
731
732 (defvar semantic-lex-maximum-depth nil
733 "The maximum depth of parenthesis as tracked through lexical functions.")
734
735 (defvar semantic-lex-token-stream nil
736 "The current token stream we are collecting.")
737
738 (defvar semantic-lex-analysis-bounds nil
739 "The bounds of the current analysis.")
740
741 (defvar semantic-lex-block-streams nil
742 "Streams of tokens inside collapsed blocks.
743 This is an alist of (ANCHOR . STREAM) elements where ANCHOR is the
744 start position of the block, and STREAM is the list of tokens in that
745 block.")
746
747 (defvar semantic-lex-reset-hooks nil
748 "Abnormal hook used by major-modes to reset lexical analyzers.
749 Hook functions are called with START and END values for the
750 current lexical pass. Should be set with `add-hook', specifying
751 a LOCAL option.")
752
753 ;; Stack of nested blocks.
754 (defvar semantic-lex-block-stack nil)
755 ;;(defvar semantic-lex-timeout 5
756 ;; "*Number of sections of lexing before giving up.")
757
758 (defmacro define-lex (name doc &rest analyzers)
759 "Create a new lexical analyzer with NAME.
760 DOC is a documentation string describing this analyzer.
761 ANALYZERS are small code snippets of analyzers to use when
762 building the new NAMED analyzer. Only use analyzers which
763 are written to be used in `define-lex'.
764 Each analyzer should be an analyzer created with `define-lex-analyzer'.
765 Note: The order in which analyzers are listed is important.
766 If two analyzers can match the same text, it is important to order the
767 analyzers so that the one you want to match first occurs first. For
768 example, it is good to put a number analyzer in front of a symbol
769 analyzer which might mistake a number for as a symbol."
770 `(defun ,name (start end &optional depth length)
771 ,(concat doc "\nSee `semantic-lex' for more information.")
772 ;; Make sure the state of block parsing starts over.
773 (setq semantic-lex-block-streams nil)
774 ;; Allow specialty reset items.
775 (run-hook-with-args 'semantic-lex-reset-hooks start end)
776 ;; Lexing state.
777 (let* (;(starttime (current-time))
778 (starting-position (point))
779 (semantic-lex-token-stream nil)
780 (semantic-lex-block-stack nil)
781 (tmp-start start)
782 (semantic-lex-end-point start)
783 (semantic-lex-current-depth 0)
784 ;; Use the default depth when not specified.
785 (semantic-lex-maximum-depth
786 (or depth semantic-lex-depth))
787 ;; Bounds needed for unterminated syntax
788 (semantic-lex-analysis-bounds (cons start end))
789 ;; This entry prevents text properties from
790 ;; confusing our lexical analysis. See Emacs 22 (CVS)
791 ;; version of C++ mode with template hack text properties.
792 (parse-sexp-lookup-properties nil)
793 )
794 ;; Maybe REMOVE THIS LATER.
795 ;; Trying to find incremental parser bug.
796 (when (> end (point-max))
797 (error ,(format "%s: end (%%d) > point-max (%%d)" name)
798 end (point-max)))
799 (with-syntax-table semantic-lex-syntax-table
800 (goto-char start)
801 (while (and (< (point) end)
802 (or (not length)
803 (<= (length semantic-lex-token-stream) length)))
804 (semantic-lex-one-token ,analyzers)
805 (when (eq semantic-lex-end-point tmp-start)
806 (error ,(format "%s: endless loop at %%d, after %%S" name)
807 tmp-start (car semantic-lex-token-stream)))
808 (setq tmp-start semantic-lex-end-point)
809 (goto-char semantic-lex-end-point)
810 ;;(when (> (semantic-elapsed-time starttime (current-time))
811 ;; semantic-lex-timeout)
812 ;; (error "Timeout during lex at char %d" (point)))
813 (semantic-throw-on-input 'lex)
814 (semantic-lex-debug-break (car semantic-lex-token-stream))
815 ))
816 ;; Check that there is no unterminated block.
817 (when semantic-lex-block-stack
818 (let* ((last (pop semantic-lex-block-stack))
819 (blk last))
820 (while blk
821 (message
822 ,(format "%s: `%%s' block from %%S is unterminated" name)
823 (car blk) (cadr blk))
824 (setq blk (pop semantic-lex-block-stack)))
825 (semantic-lex-unterminated-syntax-detected (car last))))
826 ;; Return to where we started.
827 ;; Do not wrap in protective stuff so that if there is an error
828 ;; thrown, the user knows where.
829 (goto-char starting-position)
830 ;; Return the token stream
831 (nreverse semantic-lex-token-stream))))
832 \f
833 ;;; Collapsed block tokens delimited by any tokens.
834 ;;
835 (defun semantic-lex-start-block (syntax)
836 "Mark the last read token as the beginning of a SYNTAX block."
837 (if (or (not semantic-lex-maximum-depth)
838 (< semantic-lex-current-depth semantic-lex-maximum-depth))
839 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
840 (push (list syntax (car semantic-lex-token-stream))
841 semantic-lex-block-stack)))
842
843 (defun semantic-lex-end-block (syntax)
844 "Process the end of a previously marked SYNTAX block.
845 That is, collapse the tokens inside that block, including the
846 beginning and end of block tokens, into a high level block token of
847 class SYNTAX.
848 The token at beginning of block is the one marked by a previous call
849 to `semantic-lex-start-block'. The current token is the end of block.
850 The collapsed tokens are saved in `semantic-lex-block-streams'."
851 (if (null semantic-lex-block-stack)
852 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
853 (let* ((stream semantic-lex-token-stream)
854 (blk (pop semantic-lex-block-stack))
855 (bstream (cdr blk))
856 (first (car bstream))
857 (last (pop stream)) ;; The current token mark the EOBLK
858 tok)
859 (if (not (eq (car blk) syntax))
860 ;; SYNTAX doesn't match the syntax of the current block in
861 ;; the stack. So we encountered the end of the SYNTAX block
862 ;; before the end of the current one in the stack which is
863 ;; signaled unterminated.
864 (semantic-lex-unterminated-syntax-detected (car blk))
865 ;; Move tokens found inside the block from the main stream
866 ;; into a separate block stream.
867 (while (and stream (not (eq (setq tok (pop stream)) first)))
868 (push tok bstream))
869 ;; The token marked as beginning of block was not encountered.
870 ;; This should not happen!
871 (or (eq tok first)
872 (error "Token %S not found at beginning of block `%s'"
873 first syntax))
874 ;; Save the block stream for future reuse, to avoid to redo
875 ;; the lexical analysis of the block content!
876 ;; Anchor the block stream with its start position, so we can
877 ;; use: (cdr (assq start semantic-lex-block-streams)) to
878 ;; quickly retrieve the lexical stream associated to a block.
879 (setcar blk (semantic-lex-token-start first))
880 (setcdr blk (nreverse bstream))
881 (push blk semantic-lex-block-streams)
882 ;; In the main stream, replace the tokens inside the block by
883 ;; a high level block token of class SYNTAX.
884 (setq semantic-lex-token-stream stream)
885 (semantic-lex-push-token
886 (semantic-lex-token
887 syntax (car blk) (semantic-lex-token-end last)))
888 ))))
889 \f
890 ;;; Lexical token API
891 ;;
892 ;; Functions for accessing parts of a token. Use these functions
893 ;; instead of accessing the list structure directly because the
894 ;; contents of the lexical may change.
895 ;;
896 (defmacro semantic-lex-token (symbol start end &optional str)
897 "Create a lexical token.
898 SYMBOL is a symbol representing the class of syntax found.
899 START and END define the bounds of the token in the current buffer.
900 Optional STR is the string for the token only if the bounds in
901 the buffer do not cover the string they represent. (As from
902 macro expansion.)"
903 ;; This if statement checks the existence of a STR argument at
904 ;; compile time, where STR is some symbol or constant. If the
905 ;; variable STr (runtime) is nil, this will make an incorrect decision.
906 ;;
907 ;; It is like this to maintain the original speed of the compiled
908 ;; code.
909 (if str
910 `(cons ,symbol (cons ,str (cons ,start ,end)))
911 `(cons ,symbol (cons ,start ,end))))
912
913 (defun semantic-lex-token-p (thing)
914 "Return non-nil if THING is a semantic lex token.
915 This is an exhaustively robust check."
916 (and (consp thing)
917 (symbolp (car thing))
918 (or (and (numberp (nth 1 thing))
919 (numberp (nthcdr 2 thing)))
920 (and (stringp (nth 1 thing))
921 (numberp (nth 2 thing))
922 (numberp (nthcdr 3 thing)))
923 ))
924 )
925
926 (defun semantic-lex-token-with-text-p (thing)
927 "Return non-nil if THING is a semantic lex token.
928 This is an exhaustively robust check."
929 (and (consp thing)
930 (symbolp (car thing))
931 (= (length thing) 4)
932 (stringp (nth 1 thing))
933 (numberp (nth 2 thing))
934 (numberp (nth 3 thing)))
935 )
936
937 (defun semantic-lex-token-without-text-p (thing)
938 "Return non-nil if THING is a semantic lex token.
939 This is an exhaustively robust check."
940 (and (consp thing)
941 (symbolp (car thing))
942 (= (length thing) 3)
943 (numberp (nth 1 thing))
944 (numberp (nth 2 thing)))
945 )
946
947 (eval-and-compile
948
949 (defun semantic-lex-expand-block-specs (specs)
950 "Expand block specifications SPECS into a Lisp form.
951 SPECS is a list of (BLOCK BEGIN END) elements where BLOCK, BEGIN, and
952 END are token class symbols that indicate to produce one collapsed
953 BLOCK token from tokens found between BEGIN and END ones.
954 BLOCK must be a non-nil symbol, and at least one of the BEGIN or END
955 symbols must be non-nil too.
956 When BEGIN is non-nil, generate a call to `semantic-lex-start-block'
957 when a BEGIN token class is encountered.
958 When END is non-nil, generate a call to `semantic-lex-end-block' when
959 an END token class is encountered."
960 (let ((class (make-symbol "class"))
961 (form nil))
962 (dolist (spec specs)
963 (when (car spec)
964 (when (nth 1 spec)
965 (push `((eq ',(nth 1 spec) ,class)
966 (semantic-lex-start-block ',(car spec)))
967 form))
968 (when (nth 2 spec)
969 (push `((eq ',(nth 2 spec) ,class)
970 (semantic-lex-end-block ',(car spec)))
971 form))))
972 (when form
973 `((let ((,class (semantic-lex-token-class
974 (car semantic-lex-token-stream))))
975 (cond ,@(nreverse form))))
976 )))
977 )
978
979 (defmacro semantic-lex-push-token (token &rest blockspecs)
980 "Push TOKEN in the lexical analyzer token stream.
981 Return the lexical analysis current end point.
982 If optional arguments BLOCKSPECS is non-nil, it specifies to process
983 collapsed block tokens. See `semantic-lex-expand-block-specs' for
984 more details.
985 This macro should only be called within the bounds of
986 `define-lex-analyzer'. It changes the values of the lexical analyzer
987 variables `token-stream' and `semantic-lex-end-point'. If you need to
988 move `semantic-lex-end-point' somewhere else, just modify this
989 variable after calling `semantic-lex-push-token'."
990 `(progn
991 (push ,token semantic-lex-token-stream)
992 ,@(semantic-lex-expand-block-specs blockspecs)
993 (setq semantic-lex-end-point
994 (semantic-lex-token-end (car semantic-lex-token-stream)))
995 ))
996
997 (defsubst semantic-lex-token-class (token)
998 "Fetch the class of the lexical token TOKEN.
999 See also the function `semantic-lex-token'."
1000 (car token))
1001
1002 (defsubst semantic-lex-token-text (token)
1003 "Fetch the text associated with the lexical token TOKEN.
1004 See also the function `semantic-lex-token'."
1005 (if (stringp (car (cdr token)))
1006 (car (cdr token))
1007 (buffer-substring-no-properties
1008 (semantic-lex-token-start token)
1009 (semantic-lex-token-end token))))
1010
1011 (defun semantic-lex-init ()
1012 "Initialize any lexical state for this buffer."
1013 (unless semantic-lex-comment-regex
1014 (setq semantic-lex-comment-regex
1015 (if comment-start-skip
1016 (concat "\\(\\s<\\|" comment-start-skip "\\)")
1017 "\\(\\s<\\)")))
1018 ;; Setup the lexer syntax-table
1019 (setq semantic-lex-syntax-table (copy-syntax-table (syntax-table)))
1020 (dolist (mod semantic-lex-syntax-modifications)
1021 (modify-syntax-entry
1022 (car mod) (nth 1 mod) semantic-lex-syntax-table)))
1023
1024 ;;;###autoload
1025 (define-overloadable-function semantic-lex (start end &optional depth length)
1026 "Lexically analyze text in the current buffer between START and END.
1027 Optional argument DEPTH indicates at what level to scan over entire
1028 lists. The last argument, LENGTH specifies that `semantic-lex'
1029 should only return LENGTH tokens. The return value is a token stream.
1030 Each element is a list, such of the form
1031 (symbol start-expression . end-expression)
1032 where SYMBOL denotes the token type.
1033 See `semantic-lex-tokens' variable for details on token types. END
1034 does not mark the end of the text scanned, only the end of the
1035 beginning of text scanned. Thus, if a string extends past END, the
1036 end of the return token will be larger than END. To truly restrict
1037 scanning, use `narrow-to-region'."
1038 (funcall semantic-lex-analyzer start end depth length))
1039
1040 (defsubst semantic-lex-buffer (&optional depth)
1041 "Lex the current buffer.
1042 Optional argument DEPTH is the depth to scan into lists."
1043 (semantic-lex (point-min) (point-max) depth))
1044
1045 (defsubst semantic-lex-list (semlist depth)
1046 "Lex the body of SEMLIST to DEPTH."
1047 (semantic-lex (semantic-lex-token-start semlist)
1048 (semantic-lex-token-end semlist)
1049 depth))
1050 \f
1051 ;;; Analyzer creation macros
1052 ;;
1053 ;; An individual analyzer is a condition and code that goes with it.
1054 ;;
1055 ;; Created analyzers become variables with the code associated with them
1056 ;; as the symbol value. These analyzers are assembled into a lexer
1057 ;; to create new lexical analyzers.
1058
1059 (defcustom semantic-lex-debug-analyzers nil
1060 "Non nil means to debug analyzers with syntax protection.
1061 Only in effect if `debug-on-error' is also non-nil."
1062 :group 'semantic
1063 :type 'boolean)
1064
1065 (defmacro semantic-lex-unterminated-syntax-protection (syntax &rest forms)
1066 "For SYNTAX, execute FORMS with protection for unterminated syntax.
1067 If FORMS throws an error, treat this as a syntax problem, and
1068 execute the unterminated syntax code. FORMS should return a position.
1069 Irregardless of an error, the cursor should be moved to the end of
1070 the desired syntax, and a position returned.
1071 If `debug-on-error' is set, errors are not caught, so that you can
1072 debug them.
1073 Avoid using a large FORMS since it is duplicated."
1074 `(if (and debug-on-error semantic-lex-debug-analyzers)
1075 (progn ,@forms)
1076 (condition-case nil
1077 (progn ,@forms)
1078 (error
1079 (semantic-lex-unterminated-syntax-detected ,syntax)))))
1080 (put 'semantic-lex-unterminated-syntax-protection
1081 'lisp-indent-function 1)
1082
1083 (defmacro define-lex-analyzer (name doc condition &rest forms)
1084 "Create a single lexical analyzer NAME with DOC.
1085 When an analyzer is called, the current buffer and point are
1086 positioned in a buffer at the location to be analyzed.
1087 CONDITION is an expression which returns t if FORMS should be run.
1088 Within the bounds of CONDITION and FORMS, the use of backquote
1089 can be used to evaluate expressions at compile time.
1090 While forms are running, the following variables will be locally bound:
1091 `semantic-lex-analysis-bounds' - The bounds of the current analysis.
1092 of the form (START . END)
1093 `semantic-lex-maximum-depth' - The maximum depth of semantic-list
1094 for the current analysis.
1095 `semantic-lex-current-depth' - The current depth of `semantic-list' that has
1096 been descended.
1097 `semantic-lex-end-point' - End Point after match.
1098 Analyzers should set this to a buffer location if their
1099 match string does not represent the end of the matched text.
1100 `semantic-lex-token-stream' - The token list being collected.
1101 Add new lexical tokens to this list.
1102 Proper action in FORMS is to move the value of `semantic-lex-end-point' to
1103 after the location of the analyzed entry, and to add any discovered tokens
1104 at the beginning of `semantic-lex-token-stream'.
1105 This can be done by using `semantic-lex-push-token'."
1106 `(eval-and-compile
1107 (defvar ,name nil ,doc)
1108 (defun ,name nil)
1109 ;; Do this part separately so that re-evaluation rebuilds this code.
1110 (setq ,name '(,condition ,@forms))
1111 ;; Build a single lexical analyzer function, so the doc for
1112 ;; function help is automatically provided, and perhaps the
1113 ;; function could be useful for testing and debugging one
1114 ;; analyzer.
1115 (fset ',name (lambda () ,doc
1116 (let ((semantic-lex-token-stream nil)
1117 (semantic-lex-end-point (point))
1118 (semantic-lex-analysis-bounds
1119 (cons (point) (point-max)))
1120 (semantic-lex-current-depth 0)
1121 (semantic-lex-maximum-depth
1122 semantic-lex-depth)
1123 )
1124 (when ,condition ,@forms)
1125 semantic-lex-token-stream)))
1126 ))
1127
1128 (defmacro define-lex-regex-analyzer (name doc regexp &rest forms)
1129 "Create a lexical analyzer with NAME and DOC that will match REGEXP.
1130 FORMS are evaluated upon a successful match.
1131 See `define-lex-analyzer' for more about analyzers."
1132 `(define-lex-analyzer ,name
1133 ,doc
1134 (looking-at ,regexp)
1135 ,@forms
1136 ))
1137
1138 (defmacro define-lex-simple-regex-analyzer (name doc regexp toksym
1139 &optional index
1140 &rest forms)
1141 "Create a lexical analyzer with NAME and DOC that match REGEXP.
1142 TOKSYM is the symbol to use when creating a semantic lexical token.
1143 INDEX is the index into the match that defines the bounds of the token.
1144 Index should be a plain integer, and not specified in the macro as an
1145 expression.
1146 FORMS are evaluated upon a successful match BEFORE the new token is
1147 created. It is valid to ignore FORMS.
1148 See `define-lex-analyzer' for more about analyzers."
1149 `(define-lex-analyzer ,name
1150 ,doc
1151 (looking-at ,regexp)
1152 ,@forms
1153 (semantic-lex-push-token
1154 (semantic-lex-token ,toksym
1155 (match-beginning ,(or index 0))
1156 (match-end ,(or index 0))))
1157 ))
1158
1159 (defmacro define-lex-block-analyzer (name doc spec1 &rest specs)
1160 "Create a lexical analyzer NAME for paired delimiters blocks.
1161 It detects a paired delimiters block or the corresponding open or
1162 close delimiter depending on the value of the variable
1163 `semantic-lex-current-depth'. DOC is the documentation string of the lexical
1164 analyzer. SPEC1 and SPECS specify the token symbols and open, close
1165 delimiters used. Each SPEC has the form:
1166
1167 \(BLOCK-SYM (OPEN-DELIM OPEN-SYM) (CLOSE-DELIM CLOSE-SYM))
1168
1169 where BLOCK-SYM is the symbol returned in a block token. OPEN-DELIM
1170 and CLOSE-DELIM are respectively the open and close delimiters
1171 identifying a block. OPEN-SYM and CLOSE-SYM are respectively the
1172 symbols returned in open and close tokens."
1173 (let ((specs (cons spec1 specs))
1174 spec open olist clist)
1175 (while specs
1176 (setq spec (car specs)
1177 specs (cdr specs)
1178 open (nth 1 spec)
1179 ;; build alist ((OPEN-DELIM OPEN-SYM BLOCK-SYM) ...)
1180 olist (cons (list (car open) (cadr open) (car spec)) olist)
1181 ;; build alist ((CLOSE-DELIM CLOSE-SYM) ...)
1182 clist (cons (nth 2 spec) clist)))
1183 `(define-lex-analyzer ,name
1184 ,doc
1185 (and
1186 (looking-at "\\(\\s(\\|\\s)\\)")
1187 (let ((text (match-string 0)) match)
1188 (cond
1189 ((setq match (assoc text ',olist))
1190 (if (or (not semantic-lex-maximum-depth)
1191 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1192 (progn
1193 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1194 (semantic-lex-push-token
1195 (semantic-lex-token
1196 (nth 1 match)
1197 (match-beginning 0) (match-end 0))))
1198 (semantic-lex-push-token
1199 (semantic-lex-token
1200 (nth 2 match)
1201 (match-beginning 0)
1202 (save-excursion
1203 (semantic-lex-unterminated-syntax-protection (nth 2 match)
1204 (forward-list 1)
1205 (point)))
1206 ))
1207 ))
1208 ((setq match (assoc text ',clist))
1209 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
1210 (semantic-lex-push-token
1211 (semantic-lex-token
1212 (nth 1 match)
1213 (match-beginning 0) (match-end 0)))))))
1214 )))
1215 \f
1216 ;;; Analyzers
1217 ;;
1218 ;; Pre-defined common analyzers.
1219 ;;
1220 (define-lex-analyzer semantic-lex-default-action
1221 "The default action when no other lexical actions match text.
1222 This action will just throw an error."
1223 t
1224 (error "Unmatched Text during Lexical Analysis"))
1225
1226 (define-lex-analyzer semantic-lex-beginning-of-line
1227 "Detect and create a beginning of line token (BOL)."
1228 (and (bolp)
1229 ;; Just insert a (bol N . N) token in the token stream,
1230 ;; without moving the point. N is the point at the
1231 ;; beginning of line.
1232 (semantic-lex-push-token (semantic-lex-token 'bol (point) (point)))
1233 nil) ;; CONTINUE
1234 ;; We identify and add the BOL token onto the stream, but since
1235 ;; semantic-lex-end-point doesn't move, we always fail CONDITION, and have no
1236 ;; FORMS body.
1237 nil)
1238
1239 (define-lex-simple-regex-analyzer semantic-lex-newline
1240 "Detect and create newline tokens."
1241 "\\s-*\\(\n\\|\\s>\\)" 'newline 1)
1242
1243 (define-lex-regex-analyzer semantic-lex-newline-as-whitespace
1244 "Detect and create newline tokens.
1245 Use this ONLY if newlines are not whitespace characters (such as when
1246 they are comment end characters) AND when you want whitespace tokens."
1247 "\\s-*\\(\n\\|\\s>\\)"
1248 ;; Language wants whitespaces. Create a token for it.
1249 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
1250 'whitespace)
1251 ;; Merge whitespace tokens together if they are adjacent. Two
1252 ;; whitespace tokens may be sperated by a comment which is not in
1253 ;; the token stream.
1254 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1255 (match-end 0))
1256 (semantic-lex-push-token
1257 (semantic-lex-token
1258 'whitespace (match-beginning 0) (match-end 0)))))
1259
1260 (define-lex-regex-analyzer semantic-lex-ignore-newline
1261 "Detect and ignore newline tokens.
1262 Use this ONLY if newlines are not whitespace characters (such as when
1263 they are comment end characters)."
1264 "\\s-*\\(\n\\|\\s>\\)"
1265 (setq semantic-lex-end-point (match-end 0)))
1266
1267 (define-lex-regex-analyzer semantic-lex-whitespace
1268 "Detect and create whitespace tokens."
1269 ;; catch whitespace when needed
1270 "\\s-+"
1271 ;; Language wants whitespaces. Create a token for it.
1272 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
1273 'whitespace)
1274 ;; Merge whitespace tokens together if they are adjacent. Two
1275 ;; whitespace tokens may be sperated by a comment which is not in
1276 ;; the token stream.
1277 (progn
1278 (setq semantic-lex-end-point (match-end 0))
1279 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1280 semantic-lex-end-point))
1281 (semantic-lex-push-token
1282 (semantic-lex-token
1283 'whitespace (match-beginning 0) (match-end 0)))))
1284
1285 (define-lex-regex-analyzer semantic-lex-ignore-whitespace
1286 "Detect and skip over whitespace tokens."
1287 ;; catch whitespace when needed
1288 "\\s-+"
1289 ;; Skip over the detected whitespace, do not create a token for it.
1290 (setq semantic-lex-end-point (match-end 0)))
1291
1292 (define-lex-simple-regex-analyzer semantic-lex-number
1293 "Detect and create number tokens.
1294 See `semantic-lex-number-expression' for details on matching numbers,
1295 and number formats."
1296 semantic-lex-number-expression 'number)
1297
1298 (define-lex-regex-analyzer semantic-lex-symbol-or-keyword
1299 "Detect and create symbol and keyword tokens."
1300 "\\(\\sw\\|\\s_\\)+"
1301 (semantic-lex-push-token
1302 (semantic-lex-token
1303 (or (semantic-lex-keyword-p (match-string 0)) 'symbol)
1304 (match-beginning 0) (match-end 0))))
1305
1306 (define-lex-simple-regex-analyzer semantic-lex-charquote
1307 "Detect and create charquote tokens."
1308 ;; Character quoting characters (ie, \n as newline)
1309 "\\s\\+" 'charquote)
1310
1311 (define-lex-simple-regex-analyzer semantic-lex-punctuation
1312 "Detect and create punctuation tokens."
1313 "\\(\\s.\\|\\s$\\|\\s'\\)" 'punctuation)
1314
1315 (define-lex-analyzer semantic-lex-punctuation-type
1316 "Detect and create a punctuation type token.
1317 Recognized punctuations are defined in the current table of lexical
1318 types, as the value of the `punctuation' token type."
1319 (and (looking-at "\\(\\s.\\|\\s$\\|\\s'\\)+")
1320 (let* ((key (match-string 0))
1321 (pos (match-beginning 0))
1322 (end (match-end 0))
1323 (len (- end pos))
1324 (lst (semantic-lex-type-value "punctuation" t))
1325 (def (car lst)) ;; default lexical symbol or nil
1326 (lst (cdr lst)) ;; alist of (LEX-SYM . PUNCT-STRING)
1327 (elt nil))
1328 (if lst
1329 ;; Starting with the longest one, search if the
1330 ;; punctuation string is defined for this language.
1331 (while (and (> len 0) (not (setq elt (rassoc key lst))))
1332 (setq len (1- len)
1333 key (substring key 0 len))))
1334 (if elt ;; Return the punctuation token found
1335 (semantic-lex-push-token
1336 (semantic-lex-token (car elt) pos (+ pos len)))
1337 (if def ;; Return a default generic token
1338 (semantic-lex-push-token
1339 (semantic-lex-token def pos end))
1340 ;; Nothing match
1341 )))))
1342
1343 (define-lex-regex-analyzer semantic-lex-paren-or-list
1344 "Detect open parenthesis.
1345 Return either a paren token or a semantic list token depending on
1346 `semantic-lex-current-depth'."
1347 "\\s("
1348 (if (or (not semantic-lex-maximum-depth)
1349 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1350 (progn
1351 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1352 (semantic-lex-push-token
1353 (semantic-lex-token
1354 'open-paren (match-beginning 0) (match-end 0))))
1355 (semantic-lex-push-token
1356 (semantic-lex-token
1357 'semantic-list (match-beginning 0)
1358 (save-excursion
1359 (semantic-lex-unterminated-syntax-protection 'semantic-list
1360 (forward-list 1)
1361 (point))
1362 )))
1363 ))
1364
1365 (define-lex-simple-regex-analyzer semantic-lex-open-paren
1366 "Detect and create an open parenthisis token."
1367 "\\s(" 'open-paren 0 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)))
1368
1369 (define-lex-simple-regex-analyzer semantic-lex-close-paren
1370 "Detect and create a close paren token."
1371 "\\s)" 'close-paren 0 (setq semantic-lex-current-depth (1- semantic-lex-current-depth)))
1372
1373 (define-lex-regex-analyzer semantic-lex-string
1374 "Detect and create a string token."
1375 "\\s\""
1376 ;; Zing to the end of this string.
1377 (semantic-lex-push-token
1378 (semantic-lex-token
1379 'string (point)
1380 (save-excursion
1381 (semantic-lex-unterminated-syntax-protection 'string
1382 (forward-sexp 1)
1383 (point))
1384 ))))
1385
1386 (define-lex-regex-analyzer semantic-lex-comments
1387 "Detect and create a comment token."
1388 semantic-lex-comment-regex
1389 (save-excursion
1390 (forward-comment 1)
1391 ;; Generate newline token if enabled
1392 (if (bolp) (backward-char 1))
1393 (setq semantic-lex-end-point (point))
1394 ;; Language wants comments or want them as whitespaces,
1395 ;; link them together.
1396 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'comment)
1397 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1398 semantic-lex-end-point)
1399 (semantic-lex-push-token
1400 (semantic-lex-token
1401 'comment (match-beginning 0) semantic-lex-end-point)))))
1402
1403 (define-lex-regex-analyzer semantic-lex-comments-as-whitespace
1404 "Detect comments and create a whitespace token."
1405 semantic-lex-comment-regex
1406 (save-excursion
1407 (forward-comment 1)
1408 ;; Generate newline token if enabled
1409 (if (bolp) (backward-char 1))
1410 (setq semantic-lex-end-point (point))
1411 ;; Language wants comments or want them as whitespaces,
1412 ;; link them together.
1413 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'whitespace)
1414 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1415 semantic-lex-end-point)
1416 (semantic-lex-push-token
1417 (semantic-lex-token
1418 'whitespace (match-beginning 0) semantic-lex-end-point)))))
1419
1420 (define-lex-regex-analyzer semantic-lex-ignore-comments
1421 "Detect and create a comment token."
1422 semantic-lex-comment-regex
1423 (let ((comment-start-point (point)))
1424 (forward-comment 1)
1425 (if (eq (point) comment-start-point)
1426 ;; In this case our start-skip string failed
1427 ;; to work properly. Lets try and move over
1428 ;; whatever white space we matched to begin
1429 ;; with.
1430 (skip-syntax-forward "-.'"
1431 (save-excursion
1432 (end-of-line)
1433 (point)))
1434 ;; We may need to back up so newlines or whitespace is generated.
1435 (if (bolp)
1436 (backward-char 1)))
1437 (if (eq (point) comment-start-point)
1438 (error "Strange comment syntax prevents lexical analysis"))
1439 (setq semantic-lex-end-point (point))))
1440 \f
1441 ;;; Comment lexer
1442 ;;
1443 ;; Predefined lexers that could be used instead of creating new
1444 ;; analyers.
1445
1446 (define-lex semantic-comment-lexer
1447 "A simple lexical analyzer that handles comments.
1448 This lexer will only return comment tokens. It is the default lexer
1449 used by `semantic-find-doc-snarf-comment' to snarf up the comment at
1450 point."
1451 semantic-lex-ignore-whitespace
1452 semantic-lex-ignore-newline
1453 semantic-lex-comments
1454 semantic-lex-default-action)
1455
1456 ;;; Test Lexer
1457 ;;
1458 (define-lex semantic-simple-lexer
1459 "A simple lexical analyzer that handles simple buffers.
1460 This lexer ignores comments and whitespace, and will return
1461 syntax as specified by the syntax table."
1462 semantic-lex-ignore-whitespace
1463 semantic-lex-ignore-newline
1464 semantic-lex-number
1465 semantic-lex-symbol-or-keyword
1466 semantic-lex-charquote
1467 semantic-lex-paren-or-list
1468 semantic-lex-close-paren
1469 semantic-lex-string
1470 semantic-lex-ignore-comments
1471 semantic-lex-punctuation
1472 semantic-lex-default-action)
1473 \f
1474 ;;; Analyzers generated from grammar.
1475 ;;
1476 ;; Some analyzers are hand written. Analyzers created with these
1477 ;; functions are generated from the grammar files.
1478
1479 (defmacro define-lex-keyword-type-analyzer (name doc syntax)
1480 "Define a keyword type analyzer NAME with DOC string.
1481 SYNTAX is the regexp that matches a keyword syntactic expression."
1482 (let ((key (make-symbol "key")))
1483 `(define-lex-analyzer ,name
1484 ,doc
1485 (and (looking-at ,syntax)
1486 (let ((,key (semantic-lex-keyword-p (match-string 0))))
1487 (when ,key
1488 (semantic-lex-push-token
1489 (semantic-lex-token
1490 ,key (match-beginning 0) (match-end 0)))))))
1491 ))
1492
1493 (defmacro define-lex-sexp-type-analyzer (name doc syntax token)
1494 "Define a sexp type analyzer NAME with DOC string.
1495 SYNTAX is the regexp that matches the beginning of the s-expression.
1496 TOKEN is the lexical token returned when SYNTAX matches."
1497 `(define-lex-regex-analyzer ,name
1498 ,doc
1499 ,syntax
1500 (semantic-lex-push-token
1501 (semantic-lex-token
1502 ,token (point)
1503 (save-excursion
1504 (semantic-lex-unterminated-syntax-protection ,token
1505 (forward-sexp 1)
1506 (point))))))
1507 )
1508
1509 (defmacro define-lex-regex-type-analyzer (name doc syntax matches default)
1510 "Define a regexp type analyzer NAME with DOC string.
1511 SYNTAX is the regexp that matches a syntactic expression.
1512 MATCHES is an alist of lexical elements used to refine the syntactic
1513 expression.
1514 DEFAULT is the default lexical token returned when no MATCHES."
1515 (if matches
1516 (let* ((val (make-symbol "val"))
1517 (lst (make-symbol "lst"))
1518 (elt (make-symbol "elt"))
1519 (pos (make-symbol "pos"))
1520 (end (make-symbol "end")))
1521 `(define-lex-analyzer ,name
1522 ,doc
1523 (and (looking-at ,syntax)
1524 (let* ((,val (match-string 0))
1525 (,pos (match-beginning 0))
1526 (,end (match-end 0))
1527 (,lst ,matches)
1528 ,elt)
1529 (while (and ,lst (not ,elt))
1530 (if (string-match (cdar ,lst) ,val)
1531 (setq ,elt (caar ,lst))
1532 (setq ,lst (cdr ,lst))))
1533 (semantic-lex-push-token
1534 (semantic-lex-token (or ,elt ,default) ,pos ,end))))
1535 ))
1536 `(define-lex-simple-regex-analyzer ,name
1537 ,doc
1538 ,syntax ,default)
1539 ))
1540
1541 (defmacro define-lex-string-type-analyzer (name doc syntax matches default)
1542 "Define a string type analyzer NAME with DOC string.
1543 SYNTAX is the regexp that matches a syntactic expression.
1544 MATCHES is an alist of lexical elements used to refine the syntactic
1545 expression.
1546 DEFAULT is the default lexical token returned when no MATCHES."
1547 (if matches
1548 (let* ((val (make-symbol "val"))
1549 (lst (make-symbol "lst"))
1550 (elt (make-symbol "elt"))
1551 (pos (make-symbol "pos"))
1552 (end (make-symbol "end"))
1553 (len (make-symbol "len")))
1554 `(define-lex-analyzer ,name
1555 ,doc
1556 (and (looking-at ,syntax)
1557 (let* ((,val (match-string 0))
1558 (,pos (match-beginning 0))
1559 (,end (match-end 0))
1560 (,len (- ,end ,pos))
1561 (,lst ,matches)
1562 ,elt)
1563 ;; Starting with the longest one, search if a lexical
1564 ;; value match a token defined for this language.
1565 (while (and (> ,len 0) (not (setq ,elt (rassoc ,val ,lst))))
1566 (setq ,len (1- ,len)
1567 ,val (substring ,val 0 ,len)))
1568 (when ,elt ;; Adjust token end position.
1569 (setq ,elt (car ,elt)
1570 ,end (+ ,pos ,len)))
1571 (semantic-lex-push-token
1572 (semantic-lex-token (or ,elt ,default) ,pos ,end))))
1573 ))
1574 `(define-lex-simple-regex-analyzer ,name
1575 ,doc
1576 ,syntax ,default)
1577 ))
1578
1579 (defmacro define-lex-block-type-analyzer (name doc syntax matches)
1580 "Define a block type analyzer NAME with DOC string.
1581
1582 SYNTAX is the regexp that matches block delimiters, typically the
1583 open (`\\\\s(') and close (`\\\\s)') parenthesis syntax classes.
1584
1585 MATCHES is a pair (OPEN-SPECS . CLOSE-SPECS) that defines blocks.
1586
1587 OPEN-SPECS is a list of (OPEN-DELIM OPEN-TOKEN BLOCK-TOKEN) elements
1588 where:
1589
1590 OPEN-DELIM is a string: the block open delimiter character.
1591
1592 OPEN-TOKEN is the lexical token class associated to the OPEN-DELIM
1593 delimiter.
1594
1595 BLOCK-TOKEN is the lexical token class associated to the block
1596 that starts at the OPEN-DELIM delimiter.
1597
1598 CLOSE-SPECS is a list of (CLOSE-DELIM CLOSE-TOKEN) elements where:
1599
1600 CLOSE-DELIM is a string: the block end delimiter character.
1601
1602 CLOSE-TOKEN is the lexical token class associated to the
1603 CLOSE-DELIM delimiter.
1604
1605 Each element in OPEN-SPECS must have a corresponding element in
1606 CLOSE-SPECS.
1607
1608 The lexer will return a BLOCK-TOKEN token when the value of
1609 `semantic-lex-current-depth' is greater than or equal to the maximum
1610 depth of parenthesis tracking (see also the function `semantic-lex').
1611 Otherwise it will return OPEN-TOKEN and CLOSE-TOKEN tokens.
1612
1613 TO DO: Put the following in the developer's guide and just put a
1614 reference here.
1615
1616 In the grammar:
1617
1618 The value of a block token must be a string that contains a readable
1619 sexp of the form:
1620
1621 \"(OPEN-TOKEN CLOSE-TOKEN)\"
1622
1623 OPEN-TOKEN and CLOSE-TOKEN represent the block delimiters, and must be
1624 lexical tokens of respectively `open-paren' and `close-paren' types.
1625 Their value is the corresponding delimiter character as a string.
1626
1627 Here is a small example to analyze a parenthesis block:
1628
1629 %token <block> PAREN_BLOCK \"(LPAREN RPAREN)\"
1630 %token <open-paren> LPAREN \"(\"
1631 %token <close-paren> RPAREN \")\"
1632
1633 When the lexer encounters the open-paren delimiter \"(\":
1634
1635 - If the maximum depth of parenthesis tracking is not reached (that
1636 is, current depth < max depth), it returns a (LPAREN start . end)
1637 token, then continue analysis inside the block. Later, when the
1638 corresponding close-paren delimiter \")\" will be encountered, it
1639 will return a (RPAREN start . end) token.
1640
1641 - If the maximum depth of parenthesis tracking is reached (current
1642 depth >= max depth), it returns the whole parenthesis block as
1643 a (PAREN_BLOCK start . end) token."
1644 (let* ((val (make-symbol "val"))
1645 (lst (make-symbol "lst"))
1646 (elt (make-symbol "elt")))
1647 `(define-lex-analyzer ,name
1648 ,doc
1649 (and
1650 (looking-at ,syntax) ;; "\\(\\s(\\|\\s)\\)"
1651 (let ((,val (match-string 0))
1652 (,lst ,matches)
1653 ,elt)
1654 (cond
1655 ((setq ,elt (assoc ,val (car ,lst)))
1656 (if (or (not semantic-lex-maximum-depth)
1657 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1658 (progn
1659 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1660 (semantic-lex-push-token
1661 (semantic-lex-token
1662 (nth 1 ,elt)
1663 (match-beginning 0) (match-end 0))))
1664 (semantic-lex-push-token
1665 (semantic-lex-token
1666 (nth 2 ,elt)
1667 (match-beginning 0)
1668 (save-excursion
1669 (semantic-lex-unterminated-syntax-protection (nth 2 ,elt)
1670 (forward-list 1)
1671 (point)))))))
1672 ((setq ,elt (assoc ,val (cdr ,lst)))
1673 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
1674 (semantic-lex-push-token
1675 (semantic-lex-token
1676 (nth 1 ,elt)
1677 (match-beginning 0) (match-end 0))))
1678 ))))
1679 ))
1680 \f
1681 ;;; Lexical Safety
1682 ;;
1683 ;; The semantic lexers, unlike other lexers, can throw errors on
1684 ;; unbalanced syntax. Since editing is all about changeging test
1685 ;; we need to provide a convenient way to protect against syntactic
1686 ;; inequalities.
1687
1688 (defmacro semantic-lex-catch-errors (symbol &rest forms)
1689 "Using SYMBOL, execute FORMS catching lexical errors.
1690 If FORMS results in a call to the parser that throws a lexical error,
1691 the error will be caught here without the buffer's cache being thrown
1692 out of date.
1693 If there is an error, the syntax that failed is returned.
1694 If there is no error, then the last value of FORMS is returned."
1695 (let ((ret (make-symbol "ret"))
1696 (syntax (make-symbol "syntax"))
1697 (start (make-symbol "start"))
1698 (end (make-symbol "end")))
1699 `(let* ((semantic-lex-unterminated-syntax-end-function
1700 (lambda (,syntax ,start ,end)
1701 (throw ',symbol ,syntax)))
1702 ;; Delete the below when semantic-flex is fully retired.
1703 (semantic-flex-unterminated-syntax-end-function
1704 semantic-lex-unterminated-syntax-end-function)
1705 (,ret (catch ',symbol
1706 (save-excursion
1707 ,@forms
1708 nil))))
1709 ;; Great Sadness. Assume that FORMS execute within the
1710 ;; confines of the current buffer only! Mark this thing
1711 ;; unparseable iff the special symbol was thrown. This
1712 ;; will prevent future calls from parsing, but will allow
1713 ;; then to still return the cache.
1714 (when ,ret
1715 ;; Leave this message off. If an APP using this fcn wants
1716 ;; a message, they can do it themselves. This cleans up
1717 ;; problems with the idle scheduler obscuring useful data.
1718 ;;(message "Buffer not currently parsable (%S)." ,ret)
1719 (semantic-parse-tree-unparseable))
1720 ,ret)))
1721 (put 'semantic-lex-catch-errors 'lisp-indent-function 1)
1722
1723 \f
1724 ;;; Interfacing with edebug
1725 ;;
1726 (add-hook
1727 'edebug-setup-hook
1728 #'(lambda ()
1729
1730 (def-edebug-spec define-lex
1731 (&define name stringp (&rest symbolp))
1732 )
1733 (def-edebug-spec define-lex-analyzer
1734 (&define name stringp form def-body)
1735 )
1736 (def-edebug-spec define-lex-regex-analyzer
1737 (&define name stringp form def-body)
1738 )
1739 (def-edebug-spec define-lex-simple-regex-analyzer
1740 (&define name stringp form symbolp [ &optional form ] def-body)
1741 )
1742 (def-edebug-spec define-lex-block-analyzer
1743 (&define name stringp form (&rest form))
1744 )
1745 (def-edebug-spec semantic-lex-catch-errors
1746 (symbolp def-body)
1747 )
1748
1749 ))
1750 \f
1751 ;;; Compatibility with Semantic 1.x lexical analysis
1752 ;;
1753 ;; NOTE: DELETE THIS SOMEDAY SOON
1754
1755 (semantic-alias-obsolete 'semantic-flex-start 'semantic-lex-token-start "23.2")
1756 (semantic-alias-obsolete 'semantic-flex-end 'semantic-lex-token-end "23.2")
1757 (semantic-alias-obsolete 'semantic-flex-text 'semantic-lex-token-text "23.2")
1758 (semantic-alias-obsolete 'semantic-flex-make-keyword-table 'semantic-lex-make-keyword-table "23.2")
1759 (semantic-alias-obsolete 'semantic-flex-keyword-p 'semantic-lex-keyword-p "23.2")
1760 (semantic-alias-obsolete 'semantic-flex-keyword-put 'semantic-lex-keyword-put "23.2")
1761 (semantic-alias-obsolete 'semantic-flex-keyword-get 'semantic-lex-keyword-get "23.2")
1762 (semantic-alias-obsolete 'semantic-flex-map-keywords 'semantic-lex-map-keywords "23.2")
1763 (semantic-alias-obsolete 'semantic-flex-keywords 'semantic-lex-keywords "23.2")
1764 (semantic-alias-obsolete 'semantic-flex-buffer 'semantic-lex-buffer "23.2")
1765 (semantic-alias-obsolete 'semantic-flex-list 'semantic-lex-list "23.2")
1766
1767 ;; This simple scanner uses the syntax table to generate a stream of
1768 ;; simple tokens of the form:
1769 ;;
1770 ;; (SYMBOL START . END)
1771 ;;
1772 ;; Where symbol is the type of thing it is. START and END mark that
1773 ;; objects boundary.
1774
1775 (defvar semantic-flex-tokens semantic-lex-tokens
1776 "An alist of of semantic token types.
1777 See variable `semantic-lex-tokens'.")
1778
1779 (defvar semantic-flex-unterminated-syntax-end-function
1780 (lambda (syntax syntax-start flex-end) flex-end)
1781 "Function called when unterminated syntax is encountered.
1782 This should be set to one function. That function should take three
1783 parameters. The SYNTAX, or type of syntax which is unterminated.
1784 SYNTAX-START where the broken syntax begins.
1785 FLEX-END is where the lexical analysis was asked to end.
1786 This function can be used for languages that can intelligently fix up
1787 broken syntax, or the exit lexical analysis via `throw' or `signal'
1788 when finding unterminated syntax.")
1789
1790 (defvar semantic-flex-extensions nil
1791 "Buffer local extensions to the lexical analyzer.
1792 This should contain an alist with a key of a regex and a data element of
1793 a function. The function should both move point, and return a lexical
1794 token of the form:
1795 ( TYPE START . END)
1796 nil is also a valid return value.
1797 TYPE can be any type of symbol, as long as it doesn't occur as a
1798 nonterminal in the language definition.")
1799 (make-variable-buffer-local 'semantic-flex-extensions)
1800
1801 (defvar semantic-flex-syntax-modifications nil
1802 "Changes to the syntax table for this buffer.
1803 These changes are active only while the buffer is being flexed.
1804 This is a list where each element has the form:
1805 (CHAR CLASS)
1806 CHAR is the char passed to `modify-syntax-entry',
1807 and CLASS is the string also passed to `modify-syntax-entry' to define
1808 what syntax class CHAR has.")
1809 (make-variable-buffer-local 'semantic-flex-syntax-modifications)
1810
1811 (defvar semantic-ignore-comments t
1812 "Default comment handling.
1813 t means to strip comments when flexing. Nil means to keep comments
1814 as part of the token stream.")
1815 (make-variable-buffer-local 'semantic-ignore-comments)
1816
1817 (defvar semantic-flex-enable-newlines nil
1818 "When flexing, report 'newlines as syntactic elements.
1819 Useful for languages where the newline is a special case terminator.
1820 Only set this on a per mode basis, not globally.")
1821 (make-variable-buffer-local 'semantic-flex-enable-newlines)
1822
1823 (defvar semantic-flex-enable-whitespace nil
1824 "When flexing, report 'whitespace as syntactic elements.
1825 Useful for languages where the syntax is whitespace dependent.
1826 Only set this on a per mode basis, not globally.")
1827 (make-variable-buffer-local 'semantic-flex-enable-whitespace)
1828
1829 (defvar semantic-flex-enable-bol nil
1830 "When flexing, report beginning of lines as syntactic elements.
1831 Useful for languages like python which are indentation sensitive.
1832 Only set this on a per mode basis, not globally.")
1833 (make-variable-buffer-local 'semantic-flex-enable-bol)
1834
1835 (defvar semantic-number-expression semantic-lex-number-expression
1836 "See variable `semantic-lex-number-expression'.")
1837 (make-variable-buffer-local 'semantic-number-expression)
1838
1839 (defvar semantic-flex-depth 0
1840 "Default flexing depth.
1841 This specifies how many lists to create tokens in.")
1842 (make-variable-buffer-local 'semantic-flex-depth)
1843
1844 (defun semantic-flex (start end &optional depth length)
1845 "Using the syntax table, do something roughly equivalent to flex.
1846 Semantically check between START and END. Optional argument DEPTH
1847 indicates at what level to scan over entire lists.
1848 The return value is a token stream. Each element is a list, such of
1849 the form (symbol start-expression . end-expression) where SYMBOL
1850 denotes the token type.
1851 See `semantic-flex-tokens' variable for details on token types.
1852 END does not mark the end of the text scanned, only the end of the
1853 beginning of text scanned. Thus, if a string extends past END, the
1854 end of the return token will be larger than END. To truly restrict
1855 scanning, use `narrow-to-region'.
1856 The last argument, LENGTH specifies that `semantic-flex' should only
1857 return LENGTH tokens."
1858 (message "`semantic-flex' is an obsolete function. Use `define-lex' to create lexers.")
1859 (if (not semantic-flex-keywords-obarray)
1860 (setq semantic-flex-keywords-obarray [ nil ]))
1861 (let ((ts nil)
1862 (pos (point))
1863 (ep nil)
1864 (curdepth 0)
1865 (cs (if comment-start-skip
1866 (concat "\\(\\s<\\|" comment-start-skip "\\)")
1867 (concat "\\(\\s<\\)")))
1868 (newsyntax (copy-syntax-table (syntax-table)))
1869 (mods semantic-flex-syntax-modifications)
1870 ;; Use the default depth if it is not specified.
1871 (depth (or depth semantic-flex-depth)))
1872 ;; Update the syntax table
1873 (while mods
1874 (modify-syntax-entry (car (car mods)) (car (cdr (car mods))) newsyntax)
1875 (setq mods (cdr mods)))
1876 (with-syntax-table newsyntax
1877 (goto-char start)
1878 (while (and (< (point) end) (or (not length) (<= (length ts) length)))
1879 (cond
1880 ;; catch beginning of lines when needed.
1881 ;; Must be done before catching any other tokens!
1882 ((and semantic-flex-enable-bol
1883 (bolp)
1884 ;; Just insert a (bol N . N) token in the token stream,
1885 ;; without moving the point. N is the point at the
1886 ;; beginning of line.
1887 (setq ts (cons (cons 'bol (cons (point) (point))) ts))
1888 nil)) ;; CONTINUE
1889 ;; special extensions, includes whitespace, nl, etc.
1890 ((and semantic-flex-extensions
1891 (let ((fe semantic-flex-extensions)
1892 (r nil))
1893 (while fe
1894 (if (looking-at (car (car fe)))
1895 (setq ts (cons (funcall (cdr (car fe))) ts)
1896 r t
1897 fe nil
1898 ep (point)))
1899 (setq fe (cdr fe)))
1900 (if (and r (not (car ts))) (setq ts (cdr ts)))
1901 r)))
1902 ;; catch newlines when needed
1903 ((looking-at "\\s-*\\(\n\\|\\s>\\)")
1904 (if semantic-flex-enable-newlines
1905 (setq ep (match-end 1)
1906 ts (cons (cons 'newline
1907 (cons (match-beginning 1) ep))
1908 ts))))
1909 ;; catch whitespace when needed
1910 ((looking-at "\\s-+")
1911 (if semantic-flex-enable-whitespace
1912 ;; Language wants whitespaces, link them together.
1913 (if (eq (car (car ts)) 'whitespace)
1914 (setcdr (cdr (car ts)) (match-end 0))
1915 (setq ts (cons (cons 'whitespace
1916 (cons (match-beginning 0)
1917 (match-end 0)))
1918 ts)))))
1919 ;; numbers
1920 ((and semantic-number-expression
1921 (looking-at semantic-number-expression))
1922 (setq ts (cons (cons 'number
1923 (cons (match-beginning 0)
1924 (match-end 0)))
1925 ts)))
1926 ;; symbols
1927 ((looking-at "\\(\\sw\\|\\s_\\)+")
1928 (setq ts (cons (cons
1929 ;; Get info on if this is a keyword or not
1930 (or (semantic-lex-keyword-p (match-string 0))
1931 'symbol)
1932 (cons (match-beginning 0) (match-end 0)))
1933 ts)))
1934 ;; Character quoting characters (ie, \n as newline)
1935 ((looking-at "\\s\\+")
1936 (setq ts (cons (cons 'charquote
1937 (cons (match-beginning 0) (match-end 0)))
1938 ts)))
1939 ;; Open parens, or semantic-lists.
1940 ((looking-at "\\s(")
1941 (if (or (not depth) (< curdepth depth))
1942 (progn
1943 (setq curdepth (1+ curdepth))
1944 (setq ts (cons (cons 'open-paren
1945 (cons (match-beginning 0) (match-end 0)))
1946 ts)))
1947 (setq ts (cons
1948 (cons 'semantic-list
1949 (cons (match-beginning 0)
1950 (save-excursion
1951 (condition-case nil
1952 (forward-list 1)
1953 ;; This case makes flex robust
1954 ;; to broken lists.
1955 (error
1956 (goto-char
1957 (funcall
1958 semantic-flex-unterminated-syntax-end-function
1959 'semantic-list
1960 start end))))
1961 (setq ep (point)))))
1962 ts))))
1963 ;; Close parens
1964 ((looking-at "\\s)")
1965 (setq ts (cons (cons 'close-paren
1966 (cons (match-beginning 0) (match-end 0)))
1967 ts))
1968 (setq curdepth (1- curdepth)))
1969 ;; String initiators
1970 ((looking-at "\\s\"")
1971 ;; Zing to the end of this string.
1972 (setq ts (cons (cons 'string
1973 (cons (match-beginning 0)
1974 (save-excursion
1975 (condition-case nil
1976 (forward-sexp 1)
1977 ;; This case makes flex
1978 ;; robust to broken strings.
1979 (error
1980 (goto-char
1981 (funcall
1982 semantic-flex-unterminated-syntax-end-function
1983 'string
1984 start end))))
1985 (setq ep (point)))))
1986 ts)))
1987 ;; comments
1988 ((looking-at cs)
1989 (if (and semantic-ignore-comments
1990 (not semantic-flex-enable-whitespace))
1991 ;; If the language doesn't deal with comments nor
1992 ;; whitespaces, ignore them here.
1993 (let ((comment-start-point (point)))
1994 (forward-comment 1)
1995 (if (eq (point) comment-start-point)
1996 ;; In this case our start-skip string failed
1997 ;; to work properly. Lets try and move over
1998 ;; whatever white space we matched to begin
1999 ;; with.
2000 (skip-syntax-forward "-.'"
2001 (save-excursion
2002 (end-of-line)
2003 (point)))
2004 ;;(forward-comment 1)
2005 ;; Generate newline token if enabled
2006 (if (and semantic-flex-enable-newlines
2007 (bolp))
2008 (backward-char 1)))
2009 (if (eq (point) comment-start-point)
2010 (error "Strange comment syntax prevents lexical analysis"))
2011 (setq ep (point)))
2012 (let ((tk (if semantic-ignore-comments 'whitespace 'comment)))
2013 (save-excursion
2014 (forward-comment 1)
2015 ;; Generate newline token if enabled
2016 (if (and semantic-flex-enable-newlines
2017 (bolp))
2018 (backward-char 1))
2019 (setq ep (point)))
2020 ;; Language wants comments or want them as whitespaces,
2021 ;; link them together.
2022 (if (eq (car (car ts)) tk)
2023 (setcdr (cdr (car ts)) ep)
2024 (setq ts (cons (cons tk (cons (match-beginning 0) ep))
2025 ts))))))
2026 ;; punctuation
2027 ((looking-at "\\(\\s.\\|\\s$\\|\\s'\\)")
2028 (setq ts (cons (cons 'punctuation
2029 (cons (match-beginning 0) (match-end 0)))
2030 ts)))
2031 ;; unknown token
2032 (t
2033 (error "What is that?")))
2034 (goto-char (or ep (match-end 0)))
2035 (setq ep nil)))
2036 ;; maybe catch the last beginning of line when needed
2037 (and semantic-flex-enable-bol
2038 (= (point) end)
2039 (bolp)
2040 (setq ts (cons (cons 'bol (cons (point) (point))) ts)))
2041 (goto-char pos)
2042 ;;(message "Flexing muscles...done")
2043 (nreverse ts)))
2044
2045 (provide 'semantic/lex)
2046
2047 ;; Local variables:
2048 ;; generated-autoload-file: "loaddefs.el"
2049 ;; generated-autoload-load-name: "semantic/lex"
2050 ;; End:
2051
2052 ;; arch-tag: a47664fc-48d9-4b36-921f-cab0ea8cdf92
2053 ;;; semantic/lex.el ends here