code.delx.au - gnu-emacs-elpa/blob - packages/lex/lex.el

   1 ;;; lex.el --- Lexical analyser construction  -*- lexical-binding:t -*-
   2
   3 ;; Copyright (C) 2008,2013,2014  Free Software Foundation, Inc.
   4
   5 ;; Author: Stefan Monnier <monnier@iro.umontreal.ca>
   6 ;; Keywords:
   7 ;; Version: 1.1
   8
   9 ;; This program is free software; you can redistribute it and/or modify
  10 ;; it under the terms of the GNU General Public License as published by
  11 ;; the Free Software Foundation, either version 3 of the License, or
  12 ;; (at your option) any later version.
  13
  14 ;; This program is distributed in the hope that it will be useful,
  15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 ;; GNU General Public License for more details.
  18
  19 ;; You should have received a copy of the GNU General Public License
  20 ;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 ;;; Commentary:
  23
  24 ;; Format of regexps is the same as used for `rx' and `sregex'.
  25 ;; Additions:
  26 ;; - (ere RE) specify regexps using the ERE syntax.
  27 ;; - (inter REs...) (aka `&') make a regexp that only matches
  28 ;;   if all its branches match.  E.g. (inter (ere ".*a.*") (ere ".*b.*"))
  29 ;;   match any string that contain both an "a" and a "b", in any order.
  30 ;; - (case-fold REs...) and (case-sensitive REs...) make a regexp that
  31 ;;   is case sensitive or not, regardless of case-fold-search.
  32
  33 ;; Input format of lexers:
  34 ;;
  35 ;; ALIST of the form ((RE . VAL) ...)
  36
  37 ;; Format of compiled DFA lexers:
  38 ;;
  39 ;; nil                     ; The trivial lexer that fails
  40 ;; (CHAR . LEXER)
  41 ;; (table . CHAR-TABLE)
  42 ;; (stop VAL . LEXER)      ; Match the empty string at point or LEXER.
  43 ;; (check (PREDICATE . ARG) SUCCESS-LEXER . FAILURE-LEXER)
  44
  45 ;; Intermediate NFA nodes may additionally look like:
  46 ;; (or LEXERs...)
  47 ;; (orelse LEXERs...)
  48 ;; (and LEXERs...)
  49 ;; (join CONT . EXIT)
  50 ;; Note: we call those things "NFA"s but they're not really NFAs.
  51
  52 ;;; Bugs:
  53
  54 ;; - `inter' doesn't work right.  Matching `join' to the corresponding `and'
  55 ;;   is done incorrectly in some cases.
  56 ;; - since `negate' uses intersections, it doesn't work right either.
  57 ;; - "(\<)*" leads to a DFA that gets stuck in a cycle.
  58
  59 ;;; Todo:
  60
  61 ;; - dfa "no-fail" simplifier
  62 ;; - dfa minimization
  63 ;; - dfa compaction (different representation)
  64 ;; - submatches
  65 ;; - backrefs?
  66 ;; - search rather than just match
  67 ;; - extensions:
  68 ;;   - repeated submatches
  69 ;;   - negation
  70 ;;   - lookbehind and lookahead
  71 ;;   - match(&search?) backward
  72 ;;   - agrep
  73
  74 ;;; Notes
  75
  76
  77
  78 ;; Search
  79 ;; ------
  80
  81 ;; To turn a match into a search, the basic idea is to use ".*RE" to get
  82 ;; a search-DFA as opposed to the match-DFA generated from "RE".
  83
  84 ;; Search in Plan9's regexp library is done as follows: match ".*RE" until
  85 ;; reaching the first match and then continue with only "RE".  The first
  86 ;; ".*RE" match corresponds to a search success for the leftmost shortest
  87 ;; match.  If we want the longest match, we need to continue.  But if we
  88 ;; continue with ".*RE" then we have no idea when to stop, so we should only
  89 ;; continue with "RE".
  90 ;; Downside: we may still match things after the "leftmost longest" match,
  91 ;; but hopefully will stop soon after.  I.e. we may look at chars past the
  92 ;; end of the leftmost longest match, but hopefully not too many.
  93
  94 ;; Alternatives:
  95 ;; - Like emacs/src/regexp.c, we can just start a match at every buffer
  96 ;;   position.  Advantage: no need for submatch info in order to find
  97 ;;   (match-beginning 0), no need for a separate search-DFA.
  98 ;;   Downsize: O(N^2) rather than O(N).  But it's no worse than what we live
  99 ;;   with for decades in src/regexp.c.
 100 ;;
 101 ;; - After the shortest-search, stop the search and do a longest-match
 102 ;;   starting at position (match-beginning 0).  The good thing is that we
 103 ;;   will not look at any char further than needed.  Also we don't need to
 104 ;;   figure out how to switch from ".*RE" to "RE" in the middle of the search.
 105 ;;   The downside is that we end up looking twice at the chars common to the
 106 ;;   shortest and longest matches.  Also this doesn't work: the shortest
 107 ;;   match may not be the leftmost match, so we can't just start the match
 108 ;;   at (match-beginning 0).
 109 ;;
 110 ;; - Generate a specialized search&match-DFA which encodes the job done by
 111 ;;   Plan9's regexp library.  I.e. do a specialized merge on
 112 ;;   (or LEXER (anything . LEXER)) where whenever we get a `stop' we don't
 113 ;;   merge any more.  After matching such a lexer, we still have to figure
 114 ;;   which of the matches we had is the leftmost longest match, of course.
 115 ;;   Actually, it's not that easy: the tail of a `stop' in the match-DFA can
 116 ;;   only match things whose (match-beginning 0) may be the same as the one
 117 ;;   of the `stop', whereas we also want to accept longer matches that start
 118 ;;   before (match-beginning 0).  So we want to keep merging on the tail of
 119 ;;   `stop' nodes, but only "partially" (whatever that means).
 120
 121 ;; - Better yet, do what TRE does: after the shortest-search, use the
 122 ;;   submatch data to figure out the NFA states (corresponding to the
 123 ;;   current search-DFA state) which are only reachable from later starting
 124 ;;   positions than (match-beginning 0), remove them and figure out from
 125 ;;   that the match-DFA state to which to switch.  Problem is: there might
 126 ;;   not be any such state in the match-DFA.
 127 ;;
 128 ;; - In the end I do a mix of the last 2: .*?RE
 129 ;;   This uses the `orelse' merge operator, which contrary to `or' only
 130 ;;   matches the righthand side when the lefthand side fails to match.
 131 ;;   It turns out to be fairly simple to implement, and is optimal.
 132 ;;
 133 ;; Lookahead
 134 ;; ---------
 135
 136 ;; I suspect that the (?=<RE>) lookahead can be encoded using something like
 137 ;; `andalso'.  Of course, it can also trivially be encoded as a predicate,
 138 ;; but then we get an O(N^2) complexity.
 139
 140 ;; Merging operators.
 141 ;; ------------------
 142
 143 ;; The NFA merging operators (or, and, orelse) seem to work fine on their own,
 144 ;; but I'm not convinced they always DTRT when combined.  It's not even
 145 ;; clear that the NFA->DFA conversion terminates in all such cases.
 146
 147 ;; Intersection
 148 ;; ------------
 149
 150 ;; Implementing the `inter' regexp operator turns out to be more difficult
 151 ;; than it seemed.  The problem is basically in the `join'.  Each `and' has
 152 ;; to have its own matching `join', but preserving this invariant is
 153 ;; tricky.  Among other things, we cannot flatten nested `and's like we do
 154 ;; for `or's and `orelse's.
 155
 156 ;; Submatch info
 157 ;; -------------
 158
 159 ;; Keeping track of submatch info with a DFA is tricky business and can slow
 160 ;; down the matcher or make it use algorithmically more memory
 161 ;; (e.g. O(textsize)).  Here are some approaches:
 162
 163 ;; - Reproduce what an NFA matcher would do: when compiling the DFA, keep
 164 ;;   track of the NFA nodes corresponding to each DFA node, and for every
 165 ;;   transition, check the mapping between "incoming NFA nodes" and
 166 ;;   "outgoing NFA nodes" to maintain the list of submatch-info (one element
 167 ;;   per NFA node).
 168
 169 ;; - Keep a log of the states traversed during matching, so at the end it
 170 ;;   can be used to reproduce the parse tree or submatch info, based on
 171 ;;   auxiliary tables constructed during the DFA construction.
 172
 173 ;; - Some submatch info can be maintained cheaply: basically a submatch
 174 ;;   position can be represented by a single global variable in the case
 175 ;;   where we have the following property: every ε transition in the NFA
 176 ;;   which corresponds to this submatch point has the following property:
 177 ;;   no other ε transition for this same submatch can be traversed between
 178 ;;   the text position where this transition is traversed and the position
 179 ;;   where the target NFA subgraph fails to match.
 180
 181 ;;
 182
 183 ;;; Code:
 184
 185 (eval-when-compile (require 'cl-lib))
 186
 187 (eval-and-compile
 188   (unless (fboundp 'case-table-get-table)
 189     ;; Copied from 24.4
 190     (defun case-table-get-table (case-table table)
 191       "Return the TABLE of CASE-TABLE.
 192 TABLE can be `down', `up', `eqv' or `canon'."
 193       (let ((slot-nb (cdr (assq table '((up . 0) (canon . 1) (eqv . 2))))))
 194         (or (if (eq table 'down) case-table)
 195             (char-table-extra-slot case-table slot-nb)
 196             (let ((old (standard-case-table)))
 197               (unwind-protect
 198                   (progn
 199                     (set-standard-case-table case-table)
 200                     (char-table-extra-slot case-table slot-nb))
 201                 (or (eq case-table old)
 202                     (set-standard-case-table old)))))))))
 203
 204 (defun copy-char-table (ct1)
 205   (let* ((subtype (char-table-subtype ct1))
 206          (ct2 (make-char-table subtype)))
 207     (map-char-table (lambda (c v) (set-char-table-range ct2 c v)) ct1)
 208     (dotimes (i (or (get subtype 'char-table-extra-slots) 0))
 209       (set-char-table-extra-slot ct2 i (char-table-extra-slot ct1 i)))
 210     ct2))
 211
 212 (defun lex--char-table->alist (ct)
 213   (let ((res ()))
 214     (map-char-table (lambda (k v)
 215                       (push (cons (if (consp k)
 216                                       ;; If k is a cons cell, we have to
 217                                       ;; copy it because map-char-table
 218                                       ;; reuses it.
 219                                       (cons (car k) (cdr k))
 220                                     ;; Otherwise, create a trivial cons-cell
 221                                     ;; so we have fewer cases to handle.
 222                                     (cons k k))
 223                                   v)
 224                             res))
 225                     ct)
 226     res))
 227
 228 (defun lex--merge-into (op al1 al2 ct)
 229   (cl-assert (memq op '(and or orelse)))
 230   ;; We assume that map-char-table calls its function with increasing
 231   ;; `key' arguments.
 232   (while (and al1 al2)
 233     (let ((k1 (caar al1)) (k2 (caar al2)))
 234       (cond
 235        ;; Perfect overlap.
 236        ((equal k1 k2)
 237         (set-char-table-range ct k1
 238                               (lex--merge op (cdr (pop al1)) (cdr (pop al2)))))
 239        ;; k1 strictly greater than k2.
 240        ((and (consp k1) (consp k2) (> (car k1) (cdr k2)))
 241         (let ((v (cdr (pop al1))))
 242           (if (not (eq op 'and)) (set-char-table-range ct k1 v))))
 243        ;; k2 strictly greater than k1.
 244        ((and (consp k1) (consp k2) (> (car k2) (cdr k1)))
 245         (let ((v (cdr (pop al2))))
 246           (if (not (eq op 'and)) (set-char-table-range ct k2 v))))
 247        ;; There's partial overlap.
 248        ((and (consp k1) (consp k2) (> (cdr k1) (cdr k2)))
 249         (if (not (eq op 'and))
 250             (set-char-table-range ct (cons (1+ (cdr k2)) (cdr k1)) (cdar al1)))
 251         (setcdr k1 (cdr k2)))
 252        ((and (consp k1) (consp k2) (< (cdr k1) (cdr k2)))
 253         (if (not (eq op 'and))
 254             (set-char-table-range ct (cons (1+ (cdr k1)) (cdr k2)) (cdar al2)))
 255         (setcdr k2 (cdr k1)))
 256        ;; Now the tails are equal.
 257        ((and (consp k1) (consp k2) (> (car k1) (car k2)))
 258         (set-char-table-range ct k1 (lex--merge op (cdr (pop al1)) (cdar al2)))
 259         (setcdr k2 (1- (car k1))))
 260        ((and (consp k1) (consp k2) (< (car k1) (car k2)))
 261         (set-char-table-range ct k2 (lex--merge op (cdar al1) (cdr (pop al2))))
 262         (setcdr k1 (1- (car k2))))
 263        (t (cl-assert nil)))))
 264   (if (not (eq op 'and))
 265       (dolist (x (or al1 al2))
 266         (set-char-table-range ct (car x) (cdr x))))
 267   ct)
 268
 269 (defvar lex--states)
 270 (defvar lex--memoize)
 271
 272 (defun lex--set-eq (l1 l2)
 273   (let ((len (length l2)))
 274     (setq l2 (copy-sequence l2))
 275     (while (consp l1)
 276       (cl-assert (= len (length l2)))
 277       (unless (> len
 278                  (setq len (length (setq l2 (delq (pop l1) l2)))))
 279         (setq l1 t)))
 280     (not l1)))
 281
 282 (define-hash-table-test 'lex--set-eq 'lex--set-eq
 283   (lambda (l)
 284     (let ((hash 0))
 285       (while l
 286         (let ((x (pop l)))
 287           (if (memq x l) (progn (debug) nil)
 288             (setq hash (+ hash (sxhash x))))))
 289       hash)))
 290
 291
 292 (defun lex--flatten-state (state)
 293   (cl-assert (memq (car state) '(and or orelse)))
 294   (let ((op (car state))
 295         (todo (cdr state))
 296         (done (list state))
 297         (res nil))
 298     (while todo
 299       (setq state (pop todo))
 300       (cond
 301        ((null state) (if (eq op 'and) (setq res nil todo nil)))
 302        ((memq state done) nil)
 303        ((eq (car-safe state) op)
 304         (push state done)
 305         (setq todo (append (cdr state) todo)))
 306        (t (unless (memq state res) (push state res)))))
 307     (cons op (nreverse res))))
 308
 309 (defun lex--merge-2 (op lex1 lex2)
 310   (cl-assert (memq op '(and or orelse)))
 311   ;; The order between lex1 and lex2 matters: preference is given to lex1.
 312   (cond
 313    ;; `lex1' and `lex2' might actually be the same when we use this code to
 314    ;; cancel out the `and' and the `join' from lex--merge-and-join.
 315    ;; ((eq lex1 lex2) (debug) lex1)  ;CHECK: ruled out by `lex--flatten-state'?
 316    ;; ((equal lex1 lex2) lex1)             ;Stack overflow :-(
 317
 318    ;; Handle the 2 possible nil cases.
 319    ;; CHECK: ruled out by `lex--flatten-state'?
 320    ((null lex1) (debug) (if (eq op 'and) nil lex2))
 321    ((null lex2) (debug) (if (eq op 'and) nil lex1))
 322
 323    ;; Do the predicate cases before the `stop' because the stop should
 324    ;; always come after the checks.
 325    ;; TODO: add optimizations for pairs of `checks' which are redundant,
 326    ;; or mutually exclusive, ... although we can also do it in lex-optimize.
 327    ((and (eq (car lex1) 'check) (eq (car lex2) 'check)
 328          (equal (nth 1 lex1) (nth 1 lex2))) ; Same predicate.
 329     (cl-list* 'check (nth 1 lex1)
 330               (lex--merge op (nth 2 lex1) (nth 2 lex2))
 331               (lex--merge op (nthcdr 3 lex1) (nthcdr 3 lex2))))
 332    ((eq (car lex1) 'check)
 333     (cl-list* 'check (nth 1 lex1)
 334               (lex--merge op (nth 2 lex1) lex2)
 335               (lex--merge op (nthcdr 3 lex1) lex2)))
 336    ((eq (car lex2) 'check)
 337     (cl-list* 'check (nth 1 lex2)
 338               (lex--merge op lex1 (nth 2 lex2))
 339               (lex--merge op lex1 (nthcdr 3 lex2))))
 340
 341    ;; Joins have the form (join CONT . EXIT) where EXIT is a lexer
 342    ;; corresponding to the rest of the regexp after the `and' sub-regexp.
 343    ;; All the joins corresponding to the same `and' have the same EXIT.
 344    ;; CONT is a lexer that contains another join inside, it corresponds to
 345    ;; the decision to not yet leave the `and'.
 346    ((and (eq (car lex1) 'join) (eq (car lex2) 'join))
 347     (cl-assert (eq (cddr lex1) (cddr lex2))) ;Check they're the same join.
 348     (let ((in (lex--merge op (cadr lex1) (cadr lex2))))
 349       (if (eq op 'and)
 350           ;; Eliminate the join once it was all merged.
 351           ;; FIXME: This arbitrarily chooses `or' instead of `orelse',
 352           ;; and it arbitrarily gives CONT precedence over EXIT.
 353           (lex--merge 'or in (cddr lex1))
 354         `(join ,in ,@(cddr lex1)))))
 355    ;; If one the two lex's is a join but the other not, the other must
 356    ;; contain a corresponding join somewhere inside.
 357    ((eq (car lex1) 'join)
 358     (let ((next (lex--merge op (nth 1 lex1) lex2)))
 359       ;; lex1 is a valid exit point but lex2 isn't.
 360       (if (eq op 'and)
 361           next
 362         ;; FIXME: lex1 is implicitly an `or(else)' between (cadr lex1) and
 363         ;; (cddr lex1).  Here we construct an `or(else)' between `next' and
 364         ;; (cddr lex1).  I.e. we lose the `op' and we do not preserve the
 365         ;; ordering between lex2 and (cddr lex1).
 366         `(join ,next ,@(cddr lex1)))))
 367    ((eq (car lex2) 'join)
 368     (let ((next (lex--merge op lex1 (nth 1 lex2))))
 369       (if (eq op 'and) next `(join ,next ,@(cddr lex2)))))
 370
 371    ;; The three `stop' cases.
 372    ((and (eq (car lex1) 'stop) (eq (car lex2) 'stop))
 373     ;; Here is where we give precedence to `lex1'.
 374     (if (eq op 'orelse) lex1
 375       (cl-list* 'stop (cadr lex1) (lex--merge op (cddr lex1) (cddr lex2)))))
 376    ((eq (car lex1) 'stop)
 377     (let ((next (lex--merge op (cddr lex1) lex2)))
 378       (pcase op
 379         (`or     (cl-list* 'stop (cadr lex1) next))
 380         (`orelse lex1)
 381         ;; CHECK: We should have hit a `join' before reaching a `stop'.
 382         (`and    (debug) next)
 383         (_       (error "lex.el: got %S but expected one of or/and/orelse"
 384                         op)))))
 385    ((eq (car lex2) 'stop)
 386     (let ((next (lex--merge op lex1 (cddr lex2))))
 387       ;; For `orelse', we want here to delay the `stop' until the point
 388       ;; where we know that lex1 doesn't match.  Sadly, I don't know how to
 389       ;; do it.
 390       (pcase op
 391         ;; FIXME: One thing we can do is to mark the value attached to the
 392         ;; `stop' so as to indicate that an earlier match may finish later.
 393         ;; This way, if the match is not `earlystop', we know it's one of
 394         ;; the leftmost ones, and maybe the search loop can avoid some work
 395         ;; when determining which is the leftmost longest match.
 396         (`orelse (cl-list* 'stop `(earlystop ,(cadr lex2)) next))
 397         ((or `or `orelse) (cl-list* 'stop (cadr lex2) next))
 398         ;; CHECK: We should have hit a `join' before reaching a `stop'.
 399         (`and    (debug) next)
 400         (_       (error "lex.el: got %S but expected one of or/and/orelse"
 401                         op)))))
 402
 403    ;; The most general case.
 404    ((and (eq (car lex1) 'table) (eq (car lex2) 'table))
 405     (let ((al1 (lex--char-table->alist (cdr lex1)))
 406           (al2 (lex--char-table->alist (cdr lex2)))
 407           (ct (make-char-table 'lexer)))
 408       (lex--merge-into op al1 al2 ct)
 409       (cons 'table ct)))
 410
 411    ((and (characterp (car lex1)) (characterp (car lex2))
 412          (eq (car lex1) (car lex2)))
 413     (cons (car lex1) (lex--merge op (cdr lex1) (cdr lex2))))
 414    ((and (characterp (car lex1)) (characterp (car lex2)))
 415     (unless (eq op 'and)
 416       (let ((ct (make-char-table 'lexer)))
 417         (aset ct (car lex1) (cdr lex1))
 418         (aset ct (car lex2) (cdr lex2))
 419         (cons 'table ct))))
 420    ((and (characterp (car lex1)) (eq (car lex2) 'table))
 421     (let ((next (lex--merge op (cdr lex1) (aref (cdr lex2) (car lex1)))))
 422       (if (eq op 'and)
 423           (if next (cons (car lex1) next))
 424         (let ((ct (copy-sequence (cdr lex2))))
 425           (aset ct (car lex1) next)
 426           (cons 'table ct)))))
 427    ((and (eq (car lex1) 'table) (characterp (car lex2)))
 428     (let ((next (lex--merge op (aref (cdr lex1) (car lex2)) (cdr lex2))))
 429       (if (eq op 'and)
 430           (if next (cons (car lex2) next))
 431         (let ((ct (copy-sequence (cdr lex1))))
 432           (aset ct (car lex2) next)
 433           (cons 'table ct)))))
 434
 435    ((or (memq (car lex1) '(or orelse and))  ;state
 436         (memq (car lex2) '(or orelse and))) ;state
 437     ;; `state' nodes are nodes whose content is not known yet, so we
 438     ;; have to delay the merge via the memoization table.
 439     ;; `or' and `and' nodes should only happen when the other `op' is being
 440     ;; performed, in which case we can't do the merge either before lex1
 441     ;; and lex2 have both been merged.
 442     (lex--merge op lex1 lex2))
 443    (t (cl-assert nil))))
 444
 445 (defun lex--merge-now (&rest state)
 446   (cl-assert (memq (car state) '(and or orelse)))
 447   ;; Re-flatten, in case one of the sub-states was changed.
 448   (setq state (lex--flatten-state state))
 449   (if (<= (length state) 2)
 450       (if (eq (car state) 'and)
 451           ;; Need to strip out the `join's.
 452           (lex--merge-and-join (cadr state))
 453         (cadr state))
 454     (let ((op (pop state))
 455           (res (pop state)))
 456       (dolist (lex state)
 457         ;; CHECK: we fold the lexers  using left-associativity.
 458         ;; For `orelse', that means that `earlystop' never accumulates,
 459         ;; whereas if we folded in a right-associative way, we could get
 460         ;; some (earlystop (earlystop (earlystop V))).  Not sure which one's
 461         ;; preferable, so let's stick with what we have for now.
 462         (setq res (lex--merge-2 op res lex)))
 463       res)))
 464
 465 (defun lex--merge-and-join (lex)
 466   (lex--merge-2 'and lex lex))
 467
 468
 469 (defun lex--merge (&rest state)
 470   (cl-assert (memq (car state) '(and or orelse)))
 471   (setq state (lex--flatten-state state))
 472   (if (and (<= (length state) 2)
 473            (not (eq (car state) 'and)))
 474       (cadr state)
 475     (or (gethash state lex--memoize)
 476         (progn
 477           ;; (debug)
 478           (cl-assert (memq (car state) '(and or orelse)))
 479           (push state lex--states)
 480           ;; The `state' node will be later on modified via setcar/setcdr,
 481           ;; se be careful to use a copy of it for the key.
 482           (puthash (cons (car state) (cdr state)) state lex--memoize)
 483           state))))
 484
 485 (defun lex--compile-category (category)
 486   (if (and (integerp category) (< category 128))
 487       category
 488     (if (symbolp category)
 489         (if (= 1 (length (symbol-name category)))
 490             (aref (symbol-name category) 0)
 491           (require 'rx)
 492           (defvar rx-categories)
 493           (cdr (assq category rx-categories))))))
 494
 495 (defun lex--compile-syntax (&rest syntaxes)
 496   (mapcar (lambda (x)
 497             (if (and (integerp x) (< x 32)) x
 498               (if (symbolp x)
 499                   (setq x (if (= 1 (length (symbol-name x)))
 500                               (symbol-name x)
 501                             (require 'rx)
 502                             (defvar rx-syntax)
 503                             (cdr (assq x rx-syntax)))))
 504               (if (characterp x) (setq x (string x)))
 505               (car (string-to-syntax x))))
 506           syntaxes))
 507
 508 (defconst lex--char-classes
 509   `((alnum alpha digit)
 510     (alpha word (?a . ?z) (?A . ?Z))
 511     (blank ?\s ?\t)
 512     (cntrl (?\0 . ?\C-_))
 513     (digit (?0 . ?9))
 514     ;; Include all multibyte chars, plus all the bytes except 128-159.
 515     (graph (?! . ?~) multibyte (#x3fffa0 . #x3fffff))
 516     ;; src/regexp.c handles case-folding inconsistently: lower and upper
 517     ;; match both lower- and uppercase ascii chars, but lower also matches
 518     ;; uppercase non-ascii chars whereas upper does not match lowercase
 519     ;; nonascii chars.  Here I simply ignore case-fold for [:lower:] and
 520     ;; [:upper:] because it's simpler and doesn't seem worse.
 521     (lower (check (lex--match-lower)))
 522     (upper (check (lex--match-upper)))
 523     (print graph ?\s)
 524     (punct (check (not (lex--match-syntax . ,(lex--compile-syntax "w"))))
 525            (?! . ?/) (?: . ?@) (?\[ . ?`) (?\{ . ?~))
 526     (space (check (lex--match-syntax . ,(lex--compile-syntax " "))))
 527     (xdigit digit (?a . ?f) (?A . ?F))
 528     (ascii (?\0 . ?\177))
 529     (nonascii (?\200 . #x3fffff))
 530     (unibyte ascii (#x3fff00 . #x3fffff))
 531     (multibyte (#x100 . #x3ffeff))
 532     (word (check (lex--match-syntax . ,(lex--compile-syntax "w"))))
 533     ;; `rx' alternative names.
 534     (numeric digit)
 535     (num digit)
 536     (control cntrl)
 537     (hex-digit xdigit)
 538     (hex xdigit)
 539     (graphic graph)
 540     (printing print)
 541     (alphanumeric alnum)
 542     (letter alpha)
 543     (alphabetic alpha)
 544     (lower-case lower)
 545     (upper-case upper)
 546     (punctuation punct)
 547     (whitespace space)
 548     (white space))
 549   "Definition of char classes.
 550 Each element has the form (CLASS . DEFINITION) where definition
 551 is a list of elements that can be either CHAR or (CHAR . CHAR),
 552 or CLASS (another char class) or (check (PREDICATE . ARG))
 553 or (check (not (PREDICATE . ARG))).")
 554
 555 (defvar lex--char-equiv-table nil
 556   "Equiv-case table to use to compile case-insensitive regexps.")
 557
 558 (defun lex--char-equiv (char)
 559   (when lex--char-equiv-table
 560     (let ((chars ())
 561           (tmp char))
 562       (while (and (setq tmp (aref lex--char-equiv-table tmp))
 563                   (not (eq tmp char)))
 564         (push tmp chars))
 565       (if chars (cons char chars)))))
 566
 567 ;; For convenience we use lex itself to tokenize charset strings, so we
 568 ;; define it in another file.
 569 (autoload 'lex--parse-charset "lex-parse-re")
 570
 571 (defun lex--nfa (re state)
 572   (cl-assert state)                   ;If `state' is nil we can't match anyway.
 573   (cond
 574    ((characterp re)
 575     (let ((chars (lex--char-equiv re)))
 576       (if (null chars)
 577           (cons re state)
 578         (let ((ct (make-char-table 'lexer)))
 579           (dolist (char chars) (aset ct char state))
 580           (cons 'table ct)))))
 581    ((stringp re)
 582     (if (null lex--char-equiv-table)
 583         ;; (Very) minor optimization.
 584         (nconc (mapcar 'identity re) state)
 585       (lex--nfa `(seq ,@(mapcar 'identity re)) state)))
 586    (t
 587     (pcase (or (car-safe re) re)
 588       ((or `: `seq `sequence
 589            ;; Hack!
 590            `group)
 591        (dolist (elem (reverse (cdr re)))
 592          (setq state (lex--nfa elem state)))
 593        state)
 594       ((or `char `in `not-char)
 595        (let ((chars (cdr re))
 596              (checks nil)
 597              (fail nil)
 598              (char nil)  ;The char seen, or nil if none, or t if more than one.
 599              (ct (make-char-table 'lexer)))
 600          (when (or (eq 'not (car chars)) (eq 'not-char (car re)))
 601            (setq chars (cdr chars))
 602            (set-char-table-range ct t state)
 603            (setq fail state)
 604            (setq state nil))
 605          (while chars
 606            (let ((range (pop chars)))
 607              (cond
 608               ((stringp range)
 609                (setq chars (append (cdr (lex--parse-charset range)) chars)))
 610               ((symbolp range)
 611                (setq range (or (cdr (assq range lex--char-classes))
 612                                (error "Uknown char class `%s'" range)))
 613                (setq chars (append range chars)))
 614               ((and (consp range) (eq 'check (car range)))
 615                (push (cadr range) checks))
 616               (t
 617                (setq char (if (or char (not (characterp range))
 618                                   (and lex--char-equiv-table
 619                                        (lex--char-equiv range)))
 620                               t range))
 621                ;; Set the range, first, regardless of case-folding.  This is
 622                ;; important because case-tables like to be set with few
 623                ;; large ranges rather than many small ones, as is done in
 624                ;; the case-fold loop.
 625                (set-char-table-range ct range state)
 626                (when (and lex--char-equiv-table
 627                           ;; Avoid looping over all characters.
 628                           (not (equal range '(#x100 . #x3ffeff))))
 629                  ;; Add all the case-equiv chars.
 630                  (let ((i (if (consp range) (car range) range))
 631                        (max (if (consp range) (cdr range) range))
 632                        char)
 633                    (while (<= i max)
 634                      (setq char i)
 635                      (while (and (setq char (aref lex--char-equiv-table char))
 636                                  (not (eq char i)))
 637                        (aset ct char state))
 638                      (setq i (1+ i)))))))))
 639
 640          (let ((res (if (or (eq char t) fail)
 641                         (cons 'table ct)
 642                       (if char (cons char state)))))
 643            (if (and (not fail) checks)
 644                (setq state (lex--nfa 'anything state)))
 645            (dolist (check checks)
 646              (setq res
 647                    (if fail
 648                        ;; We do an `and' of the negation of the check and res.
 649                        (if (eq (car-safe check) 'not)
 650                            (list 'check (cadr check) res)
 651                          (cl-list* 'check check nil res))
 652                      ;; An `or' of the check and res.
 653                      (if (eq (car-safe check) 'not)
 654                          (list 'check (cadr check) res state)
 655                        (cl-list* 'check check state res)))))
 656            res)))
 657
 658       ((or `union `or `| `orelse)
 659        (let ((newstate
 660               (cons (if (eq (car re) 'orelse) 'orelse 'or)
 661                     (mapcar (lambda (re) (lex--nfa re state)) (cdr re)))))
 662          (push newstate lex--states)
 663          newstate))
 664
 665       ((or `inter `intersection `&)
 666        (if (<= (length re) 2)
 667            ;; Avoid constructing degenerate `and' nodes.
 668            (lex--nfa (cadr re) state)
 669          ;; Just using `and' is not enough because we have to enforce that the
 670          ;; sub-regexps (rather than the whole regexp) match the same string.
 671          ;; So we need to mark the juncture point.
 672          (let* ((join `(join nil ,@state))
 673                 (newstate
 674                  `(and ,@(mapcar (lambda (re) (lex--nfa re join)) (cdr re)))))
 675            (push newstate lex--states)
 676            newstate)))
 677
 678       ((or `0+ `zero-or-more `* `*\?)
 679        (let ((newstate (list 'state)))
 680          (let ((lexer (lex--nfa (cons 'seq (cdr re)) newstate)))
 681            (setcdr newstate (if (memq (car re) '(*\?))
 682                                 (list state lexer)
 683                               (list lexer state))))
 684          (setcar newstate (if (memq (car re) '(*\?)) 'orelse 'or))
 685          (push newstate lex--states)
 686          newstate))
 687
 688       ((or `string-end `eos `eot `buffer-end `eob)
 689        `(check (lex--match-eobp) ,state))
 690       ((or `string-start `bos `bot `buffer-start `bob)
 691        `(check (lex--match-bobp) ,state))
 692       ((or `line-end `eol) `(check (lex--match-eolp) ,state))
 693       ((or `line-start `bol) `(check (lex--match-bolp) ,state))
 694       ((or `word-start `bow) `(check (lex--match-bowp) ,state))
 695       ((or `word-end `eow) `(check (lex--match-eowp) ,state))
 696       (`symbol-start `(check (lex--match-bosp) ,state))
 697       (`symbol-end `(check (lex--match-eosp) ,state))
 698       (`not-word-boundary `(check (lex--match-not-word-boundary) ,state))
 699       (`word-boundary `(check (lex--match-not-word-boundary) nil . ,state))
 700       (`syntax `(check (lex--match-syntax
 701                         . ,(apply 'lex--compile-syntax (cdr re)))
 702                        ,(lex--nfa 'anything state)))
 703       (`not-syntax `(check (lex--match-syntax
 704                             . ,(apply 'lex--compile-syntax (cdr re)))
 705                            nil . ,(lex--nfa 'anything state)))
 706       (`category `(check (lex--match-category
 707                           . ,(lex--compile-category (cadr re)))
 708                          ,(lex--nfa 'anything state)))
 709       (`not-category `(check (lex--match-category
 710                               . ,(lex--compile-category (cadr re)))
 711                              nil . ,(lex--nfa 'anything state)))
 712
 713       ;; `rx' accepts char-classes directly as regexps.  Let's reluctantly
 714       ;; do the same.
 715       ((or `digit `numeric `num `control `cntrl `hex-digit `hex `xdigit `blank
 716            `graphic `graph `printing `print `alphanumeric `alnum `letter
 717            `alphabetic `alpha `ascii `nonascii `lower `lower-case `upper
 718            `upper-case `punctuation `punct `space `whitespace `white)
 719        (lex--nfa `(char ,re) state))
 720
 721       (`case-sensitive
 722        (let ((lex--char-equiv-table nil))
 723          (lex--nfa `(seq ,@(cdr re)) state)))
 724
 725       (`case-fold
 726        (let ((lex--char-equiv-table
 727               (case-table-get-table (current-case-table) 'eqv)))
 728          (lex--nfa `(seq ,@(cdr re)) state)))
 729
 730       ((or `point
 731            ;; Sub groups!
 732            `submatch `group `backref
 733            ;; Greediness control
 734            `minimal-match `maximal-match)
 735        (error "`%s' Not implemented" (or (car-safe re) re)))
 736
 737       ((or `not-newline `nonl `dot) (lex--nfa '(char not ?\n) state))
 738       (`anything (lex--nfa '(char not) state))
 739       ((or `word `wordchar) (lex--nfa '(syntax w) state))
 740       (`not-wordchar (lex--nfa '(not-syntax w) state))
 741
 742       (`any
 743        ;; `rx' uses it for (char ...) sets, and sregex uses it for `dot'.
 744        (lex--nfa (if (consp re) (cons 'char (cdr re)) '(char not ?\n)) state))
 745
 746       (`negate
 747        ;; We could define negation directly on regexps, but it's easier to
 748        ;; do it on NFAs since those have fewer cases to deal with.
 749        (let ((posnfa
 750               ;; Trow away the mergable states generated while computing the
 751               ;; posnfa, since it's only an intermediate datastructure.
 752               (let (lex--states)
 753                 (lex--nfa `(seq ,@(cdr re)) '(stop negate)))))
 754          (lex-negate posnfa state)))
 755
 756       (`not
 757        ;; The `not' as used in `rx' should be deprecated so we can make it
 758        ;; an alias for `negate', whose semantics is different.  E.g.
 759        ;; (negate (char ...)) matches the empty string and 2-char strings.
 760        (setq re (cadr re))
 761        (pcase (or (car-safe re) re)
 762          (`word-boundary
 763           (message "`not' deprecated: use not-word-boundary")
 764           (lex--nfa 'not-word-boundary state))
 765          ((or `any `in `char)
 766           (message "`not' deprecated: use (%s not ...)" (or (car-safe re) re))
 767           (lex--nfa (cl-list* (car re) 'not (cdr re)) state))
 768          ((or `category `syntax)
 769           (message "`not' deprecated: use not-%s" (car re))
 770           (lex--nfa (cons (intern (format "not-%s" (car re))) (cdr re)) state))
 771          (elem (error "lex.el: unexpected argument `%S' to `not'." elem))))
 772
 773       (`and
 774        ;; `rx' defined `and' as `sequence', but we may want to define it
 775        ;; as intersection instead.
 776        (error "`and' is deprecated, use `seq', `:', or `sequence' instead"))
 777
 778       ((or `1+ `one-or-more `+ `+\?)
 779        (lex--nfa `(seq (seq ,@(cdr re))
 780                        (,(if (memq (car re) '(+\?)) '*\? '0+) ,@(cdr re)))
 781                  state))
 782       ((or `opt `zero-or-one `optional `\?)
 783        (lex--nfa `(or (seq ,@(cdr re)) "") state))
 784       (`\?\?
 785        (lex--nfa `(orelse "" (seq ,@(cdr re))) state))
 786       ((or `repeat `** `=)
 787        (let ((min (nth 1 re))
 788              (max (nth 2 re))
 789              (res (nthcdr 3 re)))
 790          (unless res
 791            (setq res (list max)) (setq max min))
 792          (lex--nfa `(seq ,@(append (make-list (or min 0)
 793                                               (if (eq (length res) 1)
 794                                                   (car res)
 795                                                 (cons 'seq res)))
 796                                    (if (null max)
 797                                        `((0+ ,@res))
 798                                      (make-list (- max (or min 0))
 799                                                 `(opt ,@res)))))
 800                    state)))
 801       (`>= (lex--nfa `(repeat ,(nth 1 re) nil ,@(nthcdr 2 re)) state))
 802
 803       ((or `bre `re `ere)
 804        (lex--nfa (lex-parse-re (nth 1 re) (car re)) state))
 805       (elem (error "lex.el: unknown RE element %S" elem))))))
 806
 807 (defun lex--negate-inftail (state howmany)
 808   ;; We hashcons the infinite tails and store them in the memoize table.
 809   ;; This is an abuse, but saves us from passing it around as an
 810   ;; extra argument.
 811   (let ((inftail-1+ (gethash state lex--memoize)))
 812     (unless inftail-1+
 813       ;; Precompute the final infinitely repeating tail.
 814       (setq inftail-1+ `(table . ,(make-char-table 'lexer)))
 815       (set-char-table-range (cdr inftail-1+) t `(or ,state ,inftail-1+))
 816       (push (aref (cdr inftail-1+) 0) lex--states)
 817       (puthash state inftail-1+ lex--memoize))
 818     (pcase howmany
 819       (`1+ inftail-1+)
 820       (`0+ (aref (cdr inftail-1+) 0))
 821       (_ (error "lex.el: howmany is `%S' instead of one of 1+/0+" howmany)))))
 822
 823 (defun lex--negate-now (nfa state)
 824   (pcase (car nfa)
 825     (`nil (lex--negate-inftail state '0+))
 826     (`check
 827      `(check ,(nth 1 nfa) ,(lex--negate-memo (nth 2 nfa) state)
 828              ,@(lex--negate-memo (nthcdr 3 nfa) state)))
 829     (`stop
 830      (if (cddr nfa)
 831          ;; This is valid but should normally not happen.
 832          (lex--negate-now `(or (stop ,(cadr nfa)) ,(cddr nfa)) state)
 833        (lex--negate-inftail state '1+)))
 834
 835     ((or `or `orelse)
 836      (let ((join `(join nil . ,state)))
 837        `(and ,@(mapcar (lambda (nfa) (lex--negate-memo nfa join)) (cdr nfa)))))
 838
 839     (`and
 840      `(or ,@(mapcar (lambda (nfa) (lex--negate-memo nfa state)) (cdr nfa))))
 841
 842     (`join
 843      ;; The join says: either exit the `and' because we matched all branches,
 844      ;; or keep matching further.  Negation makes the synchrony between
 845      ;; `and' branches irrelevant, so we can consider it as an `or(else)'.
 846      (if (cadr nfa)
 847          ;; This is valid but should normally not happen.
 848          (lex--negate-now `(or ,(cadr nfa) ,(cddr nfa)) state)
 849        (lex-negate (cddr nfa) state)))
 850     (_
 851      (let ((ct (make-char-table 'lexer)))
 852        ;; Get inftail-0+ from the hashtable.
 853        (set-char-table-range ct t (lex--negate-inftail state '0+))
 854        (if (characterp (car nfa))
 855            (aset ct (car nfa) (lex--negate-memo (cdr nfa) state))
 856          (cl-assert (eq 'table (car nfa)))
 857          (map-char-table (lambda (range nfa)
 858                            (set-char-table-range ct range
 859                                                  (lex--negate-memo nfa state)))
 860                          (cdr nfa)))
 861        `(or ,state (table ,@ct))))))
 862
 863 (defun lex--negate-memo (nfa state)
 864   ;; Make sure our `inftail' abuse of the hastable doesn't break anything.
 865   (cl-assert (not (eq nfa state)))
 866   (or (gethash nfa lex--memoize)
 867       (let ((newstate (cons 'state nil)))
 868         (puthash nfa newstate lex--memoize)
 869         (let ((res (lex--negate-now nfa state)))
 870           (when (memq (car res) '(or and orelse))
 871             (push newstate lex--states))
 872           (if (null res)
 873               (setq res '(?a))
 874             (setcar newstate (car res))
 875             (setcdr newstate (cdr res))
 876             newstate)))))
 877
 878 (defun lex-negate (nfa state)
 879   "Concatenate the negation of NFA with STATE.
 880 Returns a new NFA."
 881   (let ((lex--memoize (make-hash-table :test 'eq)))
 882     (lex--negate-memo nfa state)))
 883
 884 (defun lex--dfa-wrapper (f)
 885   (let* ((lex--states ())
 886          (res (funcall f))
 887          (postponed ())
 888          (lex--memoize (make-hash-table :test 'lex--set-eq))
 889          (states-dfa (make-hash-table :test 'eq)))
 890
 891     (while lex--states
 892       (dolist (state (prog1 lex--states (setq lex--states nil)))
 893         (let ((merged (apply 'lex--merge-now state)))
 894           (if (memq (car merged) '(and or orelse))
 895               ;; The merge could not be performed for some reason:
 896               ;; let's re-schedule it.
 897               (push state postponed)
 898             (puthash state merged states-dfa))))
 899
 900       (unless lex--states
 901         ;; If states-dfa is empty it means we haven't made any progress,
 902         ;; so we're stuck in an infinite loop.  Hopefully this cannot happen?
 903         (cl-assert (not (zerop (hash-table-count states-dfa))))
 904         (maphash (lambda (k v)
 905                    (unless v
 906                      ;; With `intersection', lex--merge may end up returning
 907                      ;; nil if the intersection is empty, so `v' can be
 908                      ;; nil here.  In since `k' is necessarily a cons cell,
 909                      ;; we can't turn it into nil, so we turn it into
 910                      ;; a more costly lexer that also fails for all inputs.
 911                      (setq v '(?a)))
 912                    (setcar k (car v))
 913                    (setcdr k (cdr v)))
 914                  states-dfa)
 915         (clrhash states-dfa)
 916         (setq lex--states postponed)
 917         (setq postponed nil)))
 918
 919     res))
 920
 921 (defun lex-compile (alist)
 922   (lex--dfa-wrapper
 923    (lambda ()
 924      (let* ((lex--char-equiv-table
 925              (if case-fold-search
 926                  (case-table-get-table (current-case-table) 'eqv)))
 927             (newstate
 928              `(or
 929                ,@(mapcar (lambda (x) (lex--nfa (car x) (list 'stop (cdr x))))
 930                          alist))))
 931        (push newstate lex--states)
 932        newstate))))
 933
 934 (defun lex-search-dfa (match-dfa)
 935   ;; This constructs a search-DFA whose last match should be the leftmost
 936   ;; longest match.
 937   (lex--dfa-wrapper
 938    (lambda ()
 939      (lex--nfa '(*\? (char not)) match-dfa))))
 940
 941
 942 (defun lex--terminate-if (new old)
 943   (cond
 944    ((eq new t) t)
 945    ((eq old t) t)
 946    (t (while new (let ((x (pop new))) (if (not (memq x old)) (push x old))))
 947       old)))
 948
 949 (defun lex--optimize-1 (lexer)
 950   (let ((terminate nil))
 951     (cons
 952      (pcase (car lexer)
 953        (`table
 954         (let ((ct (cdr lexer))
 955               (char nil))
 956           ;; Optimize each entry.
 957           (map-char-table
 958            (lambda (range v)
 959              (let ((cell (lex--optimize v)))
 960                (setq terminate (lex--terminate-if (cdr cell) terminate))
 961                (set-char-table-range ct range (car cell))))
 962            ct)
 963           ;; Optimize the internal representation of the table.
 964           (optimize-char-table (cdr lexer) 'eq)
 965           ;; Eliminate the table if possible.
 966           (map-char-table
 967            (lambda (range _v)
 968              (setq char
 969                    (if (and (characterp range) (null char))
 970                        range t)))
 971            ct)
 972           (pcase char
 973             (`nil nil)
 974             (`t lexer)
 975             (_ (setcar lexer 'char) (setcdr lexer (aref ct char)) lexer))))
 976        (`stop
 977         (let ((cell (lex--optimize (cddr lexer))))
 978           (setq terminate t)
 979           (setf (cddr lexer) (car cell)))
 980         lexer)
 981        (`check
 982         (let* ((test (nth 1 lexer))
 983                (cellf (lex--optimize (nthcdr 3 lexer)))
 984                (fail (setf (nthcdr 3 lexer) (car cellf)))
 985                (cells (lex--optimize (nth 2 lexer)))
 986                (succ (setf (nth 2 lexer) (car cells))))
 987           (setq terminate (lex--terminate-if (cdr cellf) terminate))
 988           (setq terminate (lex--terminate-if (cdr cells) terminate))
 989           ;; TODO: the check-optimizations below only work on consecutive
 990           ;; pairs of checks.  We need to be more agressive and make sure
 991           ;; the optimized DFA never does twice the same test at the same
 992           ;; position.  Most importantly: don't do the same test in
 993           ;; a tight loop as in "(^\<)*".
 994           (when (eq 'check (car succ))
 995             (cond
 996              ((equal test (nth 1 succ)) ;Same successful test.
 997               (setf (nth 2 lexer) (setq succ (nth 2 succ))))
 998              ;; TODO: we can add rules such as bobp -> eolp,
 999              ;; bosp -> bowp, (syntax X) -> (syntax Y X), ...
1000              ))
1001           (when (eq 'check (car fail))
1002             (cond
1003              ((equal test (nth 1 fail)) ;Same failing test.
1004               (setf (nthcdr 3 lexer) (setq fail (nthcdr 3 succ))))
1005              ;; TODO: we can add rules such as !eolp -> !bobp,
1006              ;; !bowp -> !bosp, !(syntax Y X) -> !(syntax X), ...
1007              ))
1008           (if (or succ fail) lexer)))
1009        (_
1010         (cl-assert (characterp (car lexer)))
1011         (let ((cell (lex--optimize (cdr lexer))))
1012           (setq terminate (lex--terminate-if (cdr cell) terminate))
1013           (if (setf (cdr lexer) (car cell))
1014               lexer))))
1015      (if (consp terminate)
1016          (delq lexer terminate)
1017        terminate))))
1018
1019 (defun lex--optimize (lexer)
1020   (when lexer
1021     ;; The lex--memoize cache maps lexer states to (LEXER . TERMINATE) where
1022     ;; TERMINATE is either t to say that LEXER can terminate or a list of
1023     ;; lexers which means that LEXER terminates only if one of the lexers in
1024     ;; the list terminates.
1025     (let ((cache (gethash lexer lex--memoize)))
1026       (if cache
1027           ;; Optimize (char C) to nil.
1028           (if (and (characterp (caar cache)) (null (cdar cache))) nil cache)
1029         ;; Store a value indicating that we're in the process of computing it,
1030         ;; so when we encounter a loop, we don't recurse indefinitely.
1031         ;; Not knowing any better, we start by stating the tautology that
1032         ;; `lexer' terminates if and only if `lexer' terminates.
1033         (let ((cell (cons lexer (list lexer))))
1034           (puthash lexer cell lex--memoize)
1035           (let ((res (lex--optimize-1 lexer)))
1036             (if (and (car res) (cdr res))
1037                 res
1038               (setcar lexer ?a)
1039               (setcdr lexer nil)
1040               (puthash lexer '(nil) lex--memoize)
1041               nil)))))))
1042
1043 (defun lex-optimize (lexer)
1044   (let ((lex--memoize (make-hash-table :test 'eq)))
1045     (prog1 (car (lex--optimize lexer))
1046       (message "Visited %d states" (hash-table-count lex--memoize)))))
1047
1048 (defmacro lex-case (object posvar &rest cases)
1049   (declare (indent 2))
1050   (let* ((i -1)
1051          (alist (mapcar (lambda (case) (cons (car case) (cl-incf i))) cases))
1052          (lex (lex-compile alist))
1053          (tmpsym (make-symbol "tmp")))
1054     (setq i -1)
1055     `(let ((,tmpsym (lex-match-string ',lex ,object ,posvar)))
1056        (pcase (car ,tmpsym)
1057          ,@(mapcar (lambda (case)
1058                      `(,(cl-incf i)
1059                        (set-match-data
1060                         (list ,posvar (setq ,posvar (cadr ,tmpsym))))
1061                        ,@(cdr case)))
1062                    cases)))))
1063
1064 ;;; Matching engine
1065
1066 (defun lex--match-bobp (_arg pos &optional string)
1067   (= pos (if string 0 (point-min))))
1068
1069 (defun lex--match-eobp (_arg pos &optional string)
1070   (= pos (if string (length string) (point-max))))
1071
1072 (defun lex--match-bolp (_arg pos &optional string)
1073   (if string (or (= pos 0) (eq ?\n (aref string (1- pos))))
1074     (memq (char-before pos) '(nil ?\n))))
1075
1076 (defun lex--match-eolp (_arg pos &optional string)
1077   (if string (or (= pos (length string)) (eq ?\n (aref string pos)))
1078     (memq (char-after pos) '(nil ?\n))))
1079
1080 (defun lex--match-bowp (_arg pos &optional string)
1081   (and (not (if string (and (> pos 0)
1082                             (eq ?w (char-syntax (aref string (1- pos)))))
1083               (and (> pos (point-min)) (eq 2 (car (syntax-after (1- pos)))))))
1084        (if string (and (< pos (length string))
1085                        (eq ?w (char-syntax (aref string pos))))
1086          (eq 2 (car (syntax-after pos))))))
1087
1088 (defun lex--match-eowp (_arg pos &optional string)
1089   (and (if string (and (> pos 0)
1090                        (eq ?w (char-syntax (aref string (1- pos)))))
1091          (and (> pos (point-min)) (eq 2 (car (syntax-after (1- pos))))))
1092        (not (if string (and (< pos (length string))
1093                             (eq ?w (char-syntax (aref string pos))))
1094               (eq 2 (car (syntax-after pos)))))))
1095
1096 (defun lex--match-bosp (_arg pos &optional string)
1097   (and (not (if string
1098                 (and (> pos 0)
1099                      (memq (char-syntax (aref string (1- pos))) '(?w ?_)))
1100               (and (> pos (point-min))
1101                    (memq (car (syntax-after (1- pos))) '(2 3)))))
1102        (if string (and (< pos (length string))
1103                        (memq (char-syntax (aref string pos)) '(?w ?_)))
1104          (memq (car (syntax-after pos)) '(2 3)))))
1105
1106 (defun lex--match-eosp (_arg pos &optional string)
1107   (and (if string (and (> pos 0)
1108                        (memq (char-syntax (aref string (1- pos))) '(?w ?_)))
1109          (and (> pos (point-min)) (memq (car (syntax-after (1- pos))) '(2 3))))
1110        (not (if string (and (< pos (length string))
1111                             (memq (char-syntax (aref string pos)) '(?w ?_)))
1112               (memq (car (syntax-after pos)) '(2 3))))))
1113
1114 (defun lex--match-not-word-boundary (_arg pos &optional string)
1115   (eq (if string (and (> pos 0)
1116                       (eq ?w (char-syntax (aref string (1- pos)))))
1117         (and (> pos (point-min)) (eq 2 (car (syntax-after (1- pos))))))
1118       (if string (and (< pos (length string))
1119                       (eq ?w (char-syntax (aref string pos))))
1120         (eq 2 (car (syntax-after pos))))))
1121
1122 (defun lex--match-upper (_arg pos &optional string)
1123   (when (< pos (if string (length string) (point-max)))
1124     (let ((char (if string (aref string pos) (char-after pos))))
1125       (not (eq (downcase char) char)))))
1126
1127 (defun lex--match-lower (_arg pos &optional string)
1128   (when (< pos (if string (length string) (point-max)))
1129     (let ((char (if string (aref string pos) (char-after pos))))
1130       (not (eq (upcase char) char)))))
1131
1132
1133 (defun lex--match-category (category pos &optional string)
1134   (when (< pos (if string (length string) (point-max)))
1135     (aref (char-category-set (if string (aref string pos)
1136                                (char-after pos)))
1137           category)))
1138
1139 (defun lex--match-syntax (syntaxes pos &optional string)
1140   (when (< pos (if string (length string) (point-max)))
1141     (memq (car (if string (aref (syntax-table) (aref string pos))
1142                  (syntax-after pos)))
1143           syntaxes)))
1144
1145
1146 (defun lex-match-string (lex string &optional start stop)
1147   "Match LEX against STRING between START and STOP.
1148 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1149 value of returned by the lexer for the match found (or nil), ENDPOS
1150 is the end position of the match found (or nil), and LEXER is the
1151 state of the engine at STOP, which can be passed back to
1152 `lex-match-string' to continue the match elsewhere."
1153   ;; FIXME: Move this to C.
1154   (unless start (setq start 0))
1155   (unless stop  (setq stop (length string)))
1156   (let ((match (list nil nil))
1157         (lastlex lex))
1158     (while
1159         (progn
1160           (while (eq (car lex) 'check)
1161             (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1162                                    start string)
1163                           (nth 2 lex) (nthcdr 3 lex))))
1164           (when (eq (car lex) 'stop)
1165             ;; Don't stop yet, we're looking for the longest match.
1166             (setq match (list (cadr lex) start))
1167             (message "Found match: %s" match)
1168             (setq lex (cddr lex)))
1169           (cl-assert (not (eq (car lex) 'stop)))
1170           (and lex (< start stop)))
1171       (let ((c (aref string start)))
1172         (setq start (1+ start))
1173         (setq lex (cond
1174                    ((eq (car lex) 'table) (aref (cdr lex) c))
1175                    ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1176         (setq lastlex lex)))
1177     (message "Final search pos considered: %s" start)
1178     ;; The difference between `lex' and `lastlex' is basically that `lex'
1179     ;; may depend on data after `stop' (if there was an `end-of-file' or
1180     ;; `word-boundary' or basically any `check').  So let's return `lastlex'
1181     ;; so it can be correctly used to continue the match with a different
1182     ;; content than what's after `stop'.
1183     (nconc match lastlex)))
1184
1185 (defun lex-match-string-first (lex string &optional start stop)
1186   "Match LEX against STRING between START and STOP.
1187 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1188 value of returned by the lexer for the match found (or nil), ENDPOS
1189 is the end position of the match found (or nil), and LEXER is the
1190 state of the engine at STOP, which can be passed back to
1191 `lex-match-string' to continue the match elsewhere."
1192   ;; FIXME: Move this to C.
1193   (unless start (setq start 0))
1194   (unless stop  (setq stop (length string)))
1195   (let ((match (list nil nil))
1196         (lastlex lex))
1197     (catch 'found
1198       (while
1199           (progn
1200             (while (eq (car lex) 'check)
1201               (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1202                                      start string)
1203                             (nth 2 lex) (nthcdr 3 lex))))
1204             (when (eq (car lex) 'stop)
1205               (throw 'found (cl-list* (cadr lex) start (cddr lex))))
1206             (cl-assert (not (eq (car lex) 'stop)))
1207             (and (not match) lex (< start stop)))
1208         (let ((c (aref string start)))
1209           (setq start (1+ start))
1210           (setq lex (cond
1211                      ((eq (car lex) 'table) (aref (cdr lex) c))
1212                      ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1213           (setq lastlex lex)))
1214       ;; The difference between `lex' and `lastlex' is basically that `lex'
1215       ;; may depend on data after `stop' (if there was an `end-of-file' or
1216       ;; `word-boundary' or basically any `check').  So let's return `lastlex'
1217       ;; so it can be correctly used to continue the match with a different
1218       ;; content than what's after `stop'.
1219       (cl-list* nil start lastlex))))
1220
1221 (defun lex-match-buffer (lex &optional stop)
1222   "Match LEX against buffer between point and STOP.
1223 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1224 value of returned by the lexer for the match found (or nil), ENDPOS
1225 is the end position of the match found (or nil), and LEXER is the
1226 state of the engine at STOP, which can be passed back to
1227 continue the match elsewhere."
1228   ;; FIXME: Move this to C.
1229   (unless stop  (setq stop (point-max)))
1230   (let ((start (point))
1231         (match (list nil nil))
1232         (lastlex lex))
1233     (while
1234         (progn
1235           (while (eq (car lex) 'check)
1236             (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1237                                    start)
1238                           (nth 2 lex) (nthcdr 3 lex))))
1239           (when (eq (car lex) 'stop)
1240             ;; Don't stop yet, we're looking for the longest match.
1241             (setq match (list (cadr lex) start))
1242             (message "Found match: %s" match)
1243             (setq lex (cddr lex)))
1244           (cl-assert (not (eq (car lex) 'stop)))
1245           (and lex (< start stop)))
1246       (let ((c (char-after start)))
1247         (setq start (1+ start))
1248         (setq lex (cond
1249                    ((eq (car lex) 'table) (aref (cdr lex) c))
1250                    ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1251         (setq lastlex lex)))
1252     (message "Final search pos considered: %s" start)
1253     ;; The difference between `lex' and `lastlex' is basically that `lex'
1254     ;; may depend on data after `stop' (if there was an `end-of-file' or
1255     ;; `word-boundary' or basically any `check').  So let's return `lastlex'
1256     ;; so it can be correctly used to continue the match with a different
1257     ;; content than what's after `stop'.
1258     (nconc match lastlex)))
1259
1260 (provide 'lex)
1261 ;;; lex.el ends here