]> code.delx.au - gnu-emacs/blob - src/regex.c
*** empty log message ***
[gnu-emacs] / src / regex.c
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
3 internationalization features.)
4
5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */
21
22 /* TODO:
23 - structure the opcode space into opcode+flag.
24 - merge with glibc's regex.[ch].
25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
29 */
30
31 /* AIX requires this to be the first thing in the file. */
32 #if defined _AIX && !defined REGEX_MALLOC
33 #pragma alloca
34 #endif
35
36 #undef _GNU_SOURCE
37 #define _GNU_SOURCE
38
39 #ifdef HAVE_CONFIG_H
40 # include <config.h>
41 #endif
42
43 #if defined STDC_HEADERS && !defined emacs
44 # include <stddef.h>
45 #else
46 /* We need this for `regex.h', and perhaps for the Emacs include files. */
47 # include <sys/types.h>
48 #endif
49
50 /* Whether to use ISO C Amendment 1 wide char functions.
51 Those should not be used for Emacs since it uses its own. */
52 #if defined _LIBC
53 #define WIDE_CHAR_SUPPORT 1
54 #else
55 #define WIDE_CHAR_SUPPORT \
56 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
57 #endif
58
59 /* For platform which support the ISO C amendement 1 functionality we
60 support user defined character classes. */
61 #if WIDE_CHAR_SUPPORT
62 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
63 # include <wchar.h>
64 # include <wctype.h>
65 #endif
66
67 #ifdef _LIBC
68 /* We have to keep the namespace clean. */
69 # define regfree(preg) __regfree (preg)
70 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
71 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
72 # define regerror(errcode, preg, errbuf, errbuf_size) \
73 __regerror(errcode, preg, errbuf, errbuf_size)
74 # define re_set_registers(bu, re, nu, st, en) \
75 __re_set_registers (bu, re, nu, st, en)
76 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
77 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
78 # define re_match(bufp, string, size, pos, regs) \
79 __re_match (bufp, string, size, pos, regs)
80 # define re_search(bufp, string, size, startpos, range, regs) \
81 __re_search (bufp, string, size, startpos, range, regs)
82 # define re_compile_pattern(pattern, length, bufp) \
83 __re_compile_pattern (pattern, length, bufp)
84 # define re_set_syntax(syntax) __re_set_syntax (syntax)
85 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
86 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
87 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
88
89 /* Make sure we call libc's function even if the user overrides them. */
90 # define btowc __btowc
91 # define iswctype __iswctype
92 # define wctype __wctype
93
94 # define WEAK_ALIAS(a,b) weak_alias (a, b)
95
96 /* We are also using some library internals. */
97 # include <locale/localeinfo.h>
98 # include <locale/elem-hash.h>
99 # include <langinfo.h>
100 #else
101 # define WEAK_ALIAS(a,b)
102 #endif
103
104 /* This is for other GNU distributions with internationalized messages. */
105 #if HAVE_LIBINTL_H || defined _LIBC
106 # include <libintl.h>
107 #else
108 # define gettext(msgid) (msgid)
109 #endif
110
111 #ifndef gettext_noop
112 /* This define is so xgettext can find the internationalizable
113 strings. */
114 # define gettext_noop(String) String
115 #endif
116
117 /* The `emacs' switch turns on certain matching commands
118 that make sense only in Emacs. */
119 #ifdef emacs
120
121 # include "lisp.h"
122 # include "buffer.h"
123
124 /* Make syntax table lookup grant data in gl_state. */
125 # define SYNTAX_ENTRY_VIA_PROPERTY
126
127 # include "syntax.h"
128 # include "character.h"
129 # include "category.h"
130
131 # ifdef malloc
132 # undef malloc
133 # endif
134 # define malloc xmalloc
135 # ifdef realloc
136 # undef realloc
137 # endif
138 # define realloc xrealloc
139 # ifdef free
140 # undef free
141 # endif
142 # define free xfree
143
144 /* Converts the pointer to the char to BEG-based offset from the start. */
145 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
146 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
147
148 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
149 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
150 # define RE_STRING_CHAR(p, s) \
151 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
152 # define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
154
155 /* Set C a (possibly converted to multibyte) character before P. P
156 points into a string which is the virtual concatenation of STR1
157 (which ends at END1) or STR2 (which ends at END2). */
158 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
159 do { \
160 if (multibyte) \
161 { \
162 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
163 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
164 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
165 c = STRING_CHAR (dtemp, (p) - dtemp); \
166 } \
167 else \
168 { \
169 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
170 MAKE_CHAR_MULTIBYTE (c); \
171 } \
172 } while (0)
173
174 /* Set C a (possibly converted to multibyte) character at P, and set
175 LEN to the byte length of that character. */
176 # define GET_CHAR_AFTER(c, p, len) \
177 do { \
178 if (multibyte) \
179 c = STRING_CHAR_AND_LENGTH (p, 0, len); \
180 else \
181 { \
182 c = *p; \
183 len = 1; \
184 MAKE_CHAR_MULTIBYTE (c); \
185 } \
186 } while (0)
187
188
189 #else /* not emacs */
190
191 /* If we are not linking with Emacs proper,
192 we can't use the relocating allocator
193 even if config.h says that we can. */
194 # undef REL_ALLOC
195
196 # if defined STDC_HEADERS || defined _LIBC
197 # include <stdlib.h>
198 # else
199 char *malloc ();
200 char *realloc ();
201 # endif
202
203 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
204 If nothing else has been done, use the method below. */
205 # ifdef INHIBIT_STRING_HEADER
206 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
207 # if !defined bzero && !defined bcopy
208 # undef INHIBIT_STRING_HEADER
209 # endif
210 # endif
211 # endif
212
213 /* This is the normal way of making sure we have memcpy, memcmp and bzero.
214 This is used in most programs--a few other programs avoid this
215 by defining INHIBIT_STRING_HEADER. */
216 # ifndef INHIBIT_STRING_HEADER
217 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
218 # include <string.h>
219 # ifndef bzero
220 # ifndef _LIBC
221 # define bzero(s, n) (memset (s, '\0', n), (s))
222 # else
223 # define bzero(s, n) __bzero (s, n)
224 # endif
225 # endif
226 # else
227 # include <strings.h>
228 # ifndef memcmp
229 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
230 # endif
231 # ifndef memcpy
232 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
233 # endif
234 # endif
235 # endif
236
237 /* Define the syntax stuff for \<, \>, etc. */
238
239 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
240 enum syntaxcode { Swhitespace = 0, Sword = 1 };
241
242 # ifdef SWITCH_ENUM_BUG
243 # define SWITCH_ENUM_CAST(x) ((int)(x))
244 # else
245 # define SWITCH_ENUM_CAST(x) (x)
246 # endif
247
248 /* Dummy macros for non-Emacs environments. */
249 # define BASE_LEADING_CODE_P(c) (0)
250 # define CHAR_CHARSET(c) 0
251 # define CHARSET_LEADING_CODE_BASE(c) 0
252 # define MAX_MULTIBYTE_LENGTH 1
253 # define RE_MULTIBYTE_P(x) 0
254 # define RE_TARGET_MULTIBYTE_P(x) 0
255 # define WORD_BOUNDARY_P(c1, c2) (0)
256 # define CHAR_HEAD_P(p) (1)
257 # define SINGLE_BYTE_CHAR_P(c) (1)
258 # define SAME_CHARSET_P(c1, c2) (1)
259 # define MULTIBYTE_FORM_LENGTH(p, s) (1)
260 # define STRING_CHAR(p, s) (*(p))
261 # define RE_STRING_CHAR STRING_CHAR
262 # define CHAR_STRING(c, s) (*(s) = (c), 1)
263 # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
264 # define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
265 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
266 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
267 # define GET_CHAR_AFTER(c, p, len) \
268 (c = *p, len = 1)
269 # define MAKE_CHAR(charset, c1, c2) (c1)
270 # define BYTE8_TO_CHAR(c) (c)
271 # define CHAR_BYTE8_P(c) (0)
272 # define MAKE_CHAR_MULTIBYTE(c) (c)
273 # define MAKE_CHAR_UNIBYTE(c) (c)
274 # define CHAR_LEADING_CODE(c) (c)
275 #endif /* not emacs */
276
277 #ifndef RE_TRANSLATE
278 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
279 # define RE_TRANSLATE_P(TBL) (TBL)
280 #endif
281 \f
282 /* Get the interface, including the syntax bits. */
283 #include "regex.h"
284
285 /* isalpha etc. are used for the character classes. */
286 #include <ctype.h>
287
288 #ifdef emacs
289
290 /* 1 if C is an ASCII character. */
291 # define IS_REAL_ASCII(c) ((c) < 0200)
292
293 /* 1 if C is a unibyte character. */
294 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
295
296 /* The Emacs definitions should not be directly affected by locales. */
297
298 /* In Emacs, these are only used for single-byte characters. */
299 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
300 # define ISCNTRL(c) ((c) < ' ')
301 # define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
302 || ((c) >= 'a' && (c) <= 'f') \
303 || ((c) >= 'A' && (c) <= 'F'))
304
305 /* This is only used for single-byte characters. */
306 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
307
308 /* The rest must handle multibyte characters. */
309
310 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
311 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
312 : 1)
313
314 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
315 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
316 : 1)
317
318 # define ISALNUM(c) (IS_REAL_ASCII (c) \
319 ? (((c) >= 'a' && (c) <= 'z') \
320 || ((c) >= 'A' && (c) <= 'Z') \
321 || ((c) >= '0' && (c) <= '9')) \
322 : SYNTAX (c) == Sword)
323
324 # define ISALPHA(c) (IS_REAL_ASCII (c) \
325 ? (((c) >= 'a' && (c) <= 'z') \
326 || ((c) >= 'A' && (c) <= 'Z')) \
327 : SYNTAX (c) == Sword)
328
329 # define ISLOWER(c) (LOWERCASEP (c))
330
331 # define ISPUNCT(c) (IS_REAL_ASCII (c) \
332 ? ((c) > ' ' && (c) < 0177 \
333 && !(((c) >= 'a' && (c) <= 'z') \
334 || ((c) >= 'A' && (c) <= 'Z') \
335 || ((c) >= '0' && (c) <= '9'))) \
336 : SYNTAX (c) != Sword)
337
338 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
339
340 # define ISUPPER(c) (UPPERCASEP (c))
341
342 # define ISWORD(c) (SYNTAX (c) == Sword)
343
344 #else /* not emacs */
345
346 /* Jim Meyering writes:
347
348 "... Some ctype macros are valid only for character codes that
349 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
350 using /bin/cc or gcc but without giving an ansi option). So, all
351 ctype uses should be through macros like ISPRINT... If
352 STDC_HEADERS is defined, then autoconf has verified that the ctype
353 macros don't need to be guarded with references to isascii. ...
354 Defining isascii to 1 should let any compiler worth its salt
355 eliminate the && through constant folding."
356 Solaris defines some of these symbols so we must undefine them first. */
357
358 # undef ISASCII
359 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
360 # define ISASCII(c) 1
361 # else
362 # define ISASCII(c) isascii(c)
363 # endif
364
365 /* 1 if C is an ASCII character. */
366 # define IS_REAL_ASCII(c) ((c) < 0200)
367
368 /* This distinction is not meaningful, except in Emacs. */
369 # define ISUNIBYTE(c) 1
370
371 # ifdef isblank
372 # define ISBLANK(c) (ISASCII (c) && isblank (c))
373 # else
374 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
375 # endif
376 # ifdef isgraph
377 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
378 # else
379 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
380 # endif
381
382 # undef ISPRINT
383 # define ISPRINT(c) (ISASCII (c) && isprint (c))
384 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
385 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
386 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
387 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
388 # define ISLOWER(c) (ISASCII (c) && islower (c))
389 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
390 # define ISSPACE(c) (ISASCII (c) && isspace (c))
391 # define ISUPPER(c) (ISASCII (c) && isupper (c))
392 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
393
394 # define ISWORD(c) ISALPHA(c)
395
396 # ifdef _tolower
397 # define TOLOWER(c) _tolower(c)
398 # else
399 # define TOLOWER(c) tolower(c)
400 # endif
401
402 /* How many characters in the character set. */
403 # define CHAR_SET_SIZE 256
404
405 # ifdef SYNTAX_TABLE
406
407 extern char *re_syntax_table;
408
409 # else /* not SYNTAX_TABLE */
410
411 static char re_syntax_table[CHAR_SET_SIZE];
412
413 static void
414 init_syntax_once ()
415 {
416 register int c;
417 static int done = 0;
418
419 if (done)
420 return;
421
422 bzero (re_syntax_table, sizeof re_syntax_table);
423
424 for (c = 0; c < CHAR_SET_SIZE; ++c)
425 if (ISALNUM (c))
426 re_syntax_table[c] = Sword;
427
428 re_syntax_table['_'] = Sword;
429
430 done = 1;
431 }
432
433 # endif /* not SYNTAX_TABLE */
434
435 # define SYNTAX(c) re_syntax_table[(c)]
436
437 #endif /* not emacs */
438 \f
439 #ifndef NULL
440 # define NULL (void *)0
441 #endif
442
443 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
444 since ours (we hope) works properly with all combinations of
445 machines, compilers, `char' and `unsigned char' argument types.
446 (Per Bothner suggested the basic approach.) */
447 #undef SIGN_EXTEND_CHAR
448 #if __STDC__
449 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
450 #else /* not __STDC__ */
451 /* As in Harbison and Steele. */
452 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
453 #endif
454 \f
455 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
456 use `alloca' instead of `malloc'. This is because using malloc in
457 re_search* or re_match* could cause memory leaks when C-g is used in
458 Emacs; also, malloc is slower and causes storage fragmentation. On
459 the other hand, malloc is more portable, and easier to debug.
460
461 Because we sometimes use alloca, some routines have to be macros,
462 not functions -- `alloca'-allocated space disappears at the end of the
463 function it is called in. */
464
465 #ifdef REGEX_MALLOC
466
467 # define REGEX_ALLOCATE malloc
468 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
469 # define REGEX_FREE free
470
471 #else /* not REGEX_MALLOC */
472
473 /* Emacs already defines alloca, sometimes. */
474 # ifndef alloca
475
476 /* Make alloca work the best possible way. */
477 # ifdef __GNUC__
478 # define alloca __builtin_alloca
479 # else /* not __GNUC__ */
480 # ifdef HAVE_ALLOCA_H
481 # include <alloca.h>
482 # endif /* HAVE_ALLOCA_H */
483 # endif /* not __GNUC__ */
484
485 # endif /* not alloca */
486
487 # define REGEX_ALLOCATE alloca
488
489 /* Assumes a `char *destination' variable. */
490 # define REGEX_REALLOCATE(source, osize, nsize) \
491 (destination = (char *) alloca (nsize), \
492 memcpy (destination, source, osize))
493
494 /* No need to do anything to free, after alloca. */
495 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
496
497 #endif /* not REGEX_MALLOC */
498
499 /* Define how to allocate the failure stack. */
500
501 #if defined REL_ALLOC && defined REGEX_MALLOC
502
503 # define REGEX_ALLOCATE_STACK(size) \
504 r_alloc (&failure_stack_ptr, (size))
505 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
506 r_re_alloc (&failure_stack_ptr, (nsize))
507 # define REGEX_FREE_STACK(ptr) \
508 r_alloc_free (&failure_stack_ptr)
509
510 #else /* not using relocating allocator */
511
512 # ifdef REGEX_MALLOC
513
514 # define REGEX_ALLOCATE_STACK malloc
515 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
516 # define REGEX_FREE_STACK free
517
518 # else /* not REGEX_MALLOC */
519
520 # define REGEX_ALLOCATE_STACK alloca
521
522 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
523 REGEX_REALLOCATE (source, osize, nsize)
524 /* No need to explicitly free anything. */
525 # define REGEX_FREE_STACK(arg) ((void)0)
526
527 # endif /* not REGEX_MALLOC */
528 #endif /* not using relocating allocator */
529
530
531 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
532 `string1' or just past its end. This works if PTR is NULL, which is
533 a good thing. */
534 #define FIRST_STRING_P(ptr) \
535 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
536
537 /* (Re)Allocate N items of type T using malloc, or fail. */
538 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
539 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
540 #define RETALLOC_IF(addr, n, t) \
541 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
542 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
543
544 #define BYTEWIDTH 8 /* In bits. */
545
546 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
547
548 #undef MAX
549 #undef MIN
550 #define MAX(a, b) ((a) > (b) ? (a) : (b))
551 #define MIN(a, b) ((a) < (b) ? (a) : (b))
552
553 /* Type of source-pattern and string chars. */
554 typedef const unsigned char re_char;
555
556 typedef char boolean;
557 #define false 0
558 #define true 1
559
560 static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
561 re_char *string1, int size1,
562 re_char *string2, int size2,
563 int pos,
564 struct re_registers *regs,
565 int stop));
566 \f
567 /* These are the command codes that appear in compiled regular
568 expressions. Some opcodes are followed by argument bytes. A
569 command code can specify any interpretation whatsoever for its
570 arguments. Zero bytes may appear in the compiled regular expression. */
571
572 typedef enum
573 {
574 no_op = 0,
575
576 /* Succeed right away--no more backtracking. */
577 succeed,
578
579 /* Followed by one byte giving n, then by n literal bytes. */
580 exactn,
581
582 /* Matches any (more or less) character. */
583 anychar,
584
585 /* Matches any one char belonging to specified set. First
586 following byte is number of bitmap bytes. Then come bytes
587 for a bitmap saying which chars are in. Bits in each byte
588 are ordered low-bit-first. A character is in the set if its
589 bit is 1. A character too large to have a bit in the map is
590 automatically not in the set.
591
592 If the length byte has the 0x80 bit set, then that stuff
593 is followed by a range table:
594 2 bytes of flags for character sets (low 8 bits, high 8 bits)
595 See RANGE_TABLE_WORK_BITS below.
596 2 bytes, the number of pairs that follow (upto 32767)
597 pairs, each 2 multibyte characters,
598 each multibyte character represented as 3 bytes. */
599 charset,
600
601 /* Same parameters as charset, but match any character that is
602 not one of those specified. */
603 charset_not,
604
605 /* Start remembering the text that is matched, for storing in a
606 register. Followed by one byte with the register number, in
607 the range 0 to one less than the pattern buffer's re_nsub
608 field. */
609 start_memory,
610
611 /* Stop remembering the text that is matched and store it in a
612 memory register. Followed by one byte with the register
613 number, in the range 0 to one less than `re_nsub' in the
614 pattern buffer. */
615 stop_memory,
616
617 /* Match a duplicate of something remembered. Followed by one
618 byte containing the register number. */
619 duplicate,
620
621 /* Fail unless at beginning of line. */
622 begline,
623
624 /* Fail unless at end of line. */
625 endline,
626
627 /* Succeeds if at beginning of buffer (if emacs) or at beginning
628 of string to be matched (if not). */
629 begbuf,
630
631 /* Analogously, for end of buffer/string. */
632 endbuf,
633
634 /* Followed by two byte relative address to which to jump. */
635 jump,
636
637 /* Followed by two-byte relative address of place to resume at
638 in case of failure. */
639 on_failure_jump,
640
641 /* Like on_failure_jump, but pushes a placeholder instead of the
642 current string position when executed. */
643 on_failure_keep_string_jump,
644
645 /* Just like `on_failure_jump', except that it checks that we
646 don't get stuck in an infinite loop (matching an empty string
647 indefinitely). */
648 on_failure_jump_loop,
649
650 /* Just like `on_failure_jump_loop', except that it checks for
651 a different kind of loop (the kind that shows up with non-greedy
652 operators). This operation has to be immediately preceded
653 by a `no_op'. */
654 on_failure_jump_nastyloop,
655
656 /* A smart `on_failure_jump' used for greedy * and + operators.
657 It analyses the loop before which it is put and if the
658 loop does not require backtracking, it changes itself to
659 `on_failure_keep_string_jump' and short-circuits the loop,
660 else it just defaults to changing itself into `on_failure_jump'.
661 It assumes that it is pointing to just past a `jump'. */
662 on_failure_jump_smart,
663
664 /* Followed by two-byte relative address and two-byte number n.
665 After matching N times, jump to the address upon failure.
666 Does not work if N starts at 0: use on_failure_jump_loop
667 instead. */
668 succeed_n,
669
670 /* Followed by two-byte relative address, and two-byte number n.
671 Jump to the address N times, then fail. */
672 jump_n,
673
674 /* Set the following two-byte relative address to the
675 subsequent two-byte number. The address *includes* the two
676 bytes of number. */
677 set_number_at,
678
679 wordbeg, /* Succeeds if at word beginning. */
680 wordend, /* Succeeds if at word end. */
681
682 wordbound, /* Succeeds if at a word boundary. */
683 notwordbound, /* Succeeds if not at a word boundary. */
684
685 /* Matches any character whose syntax is specified. Followed by
686 a byte which contains a syntax code, e.g., Sword. */
687 syntaxspec,
688
689 /* Matches any character whose syntax is not that specified. */
690 notsyntaxspec
691
692 #ifdef emacs
693 ,before_dot, /* Succeeds if before point. */
694 at_dot, /* Succeeds if at point. */
695 after_dot, /* Succeeds if after point. */
696
697 /* Matches any character whose category-set contains the specified
698 category. The operator is followed by a byte which contains a
699 category code (mnemonic ASCII character). */
700 categoryspec,
701
702 /* Matches any character whose category-set does not contain the
703 specified category. The operator is followed by a byte which
704 contains the category code (mnemonic ASCII character). */
705 notcategoryspec
706 #endif /* emacs */
707 } re_opcode_t;
708 \f
709 /* Common operations on the compiled pattern. */
710
711 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
712
713 #define STORE_NUMBER(destination, number) \
714 do { \
715 (destination)[0] = (number) & 0377; \
716 (destination)[1] = (number) >> 8; \
717 } while (0)
718
719 /* Same as STORE_NUMBER, except increment DESTINATION to
720 the byte after where the number is stored. Therefore, DESTINATION
721 must be an lvalue. */
722
723 #define STORE_NUMBER_AND_INCR(destination, number) \
724 do { \
725 STORE_NUMBER (destination, number); \
726 (destination) += 2; \
727 } while (0)
728
729 /* Put into DESTINATION a number stored in two contiguous bytes starting
730 at SOURCE. */
731
732 #define EXTRACT_NUMBER(destination, source) \
733 do { \
734 (destination) = *(source) & 0377; \
735 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
736 } while (0)
737
738 #ifdef DEBUG
739 static void extract_number _RE_ARGS ((int *dest, re_char *source));
740 static void
741 extract_number (dest, source)
742 int *dest;
743 re_char *source;
744 {
745 int temp = SIGN_EXTEND_CHAR (*(source + 1));
746 *dest = *source & 0377;
747 *dest += temp << 8;
748 }
749
750 # ifndef EXTRACT_MACROS /* To debug the macros. */
751 # undef EXTRACT_NUMBER
752 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
753 # endif /* not EXTRACT_MACROS */
754
755 #endif /* DEBUG */
756
757 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
758 SOURCE must be an lvalue. */
759
760 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
761 do { \
762 EXTRACT_NUMBER (destination, source); \
763 (source) += 2; \
764 } while (0)
765
766 #ifdef DEBUG
767 static void extract_number_and_incr _RE_ARGS ((int *destination,
768 re_char **source));
769 static void
770 extract_number_and_incr (destination, source)
771 int *destination;
772 re_char **source;
773 {
774 extract_number (destination, *source);
775 *source += 2;
776 }
777
778 # ifndef EXTRACT_MACROS
779 # undef EXTRACT_NUMBER_AND_INCR
780 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
781 extract_number_and_incr (&dest, &src)
782 # endif /* not EXTRACT_MACROS */
783
784 #endif /* DEBUG */
785 \f
786 /* Store a multibyte character in three contiguous bytes starting
787 DESTINATION, and increment DESTINATION to the byte after where the
788 character is stored. Therefore, DESTINATION must be an lvalue. */
789
790 #define STORE_CHARACTER_AND_INCR(destination, character) \
791 do { \
792 (destination)[0] = (character) & 0377; \
793 (destination)[1] = ((character) >> 8) & 0377; \
794 (destination)[2] = (character) >> 16; \
795 (destination) += 3; \
796 } while (0)
797
798 /* Put into DESTINATION a character stored in three contiguous bytes
799 starting at SOURCE. */
800
801 #define EXTRACT_CHARACTER(destination, source) \
802 do { \
803 (destination) = ((source)[0] \
804 | ((source)[1] << 8) \
805 | ((source)[2] << 16)); \
806 } while (0)
807
808
809 /* Macros for charset. */
810
811 /* Size of bitmap of charset P in bytes. P is a start of charset,
812 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
813 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
814
815 /* Nonzero if charset P has range table. */
816 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
817
818 /* Return the address of range table of charset P. But not the start
819 of table itself, but the before where the number of ranges is
820 stored. `2 +' means to skip re_opcode_t and size of bitmap,
821 and the 2 bytes of flags at the start of the range table. */
822 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
823
824 /* Extract the bit flags that start a range table. */
825 #define CHARSET_RANGE_TABLE_BITS(p) \
826 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
827 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
828
829 /* Test if C is listed in the bitmap of charset P. */
830 #define CHARSET_LOOKUP_BITMAP(p, c) \
831 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
832 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
833
834 /* Return the address of end of RANGE_TABLE. COUNT is number of
835 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
836 is start of range and end of range. `* 3' is size of each start
837 and end. */
838 #define CHARSET_RANGE_TABLE_END(range_table, count) \
839 ((range_table) + (count) * 2 * 3)
840
841 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
842 COUNT is number of ranges in RANGE_TABLE. */
843 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
844 do \
845 { \
846 re_wchar_t range_start, range_end; \
847 re_char *p; \
848 re_char *range_table_end \
849 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
850 \
851 for (p = (range_table); p < range_table_end; p += 2 * 3) \
852 { \
853 EXTRACT_CHARACTER (range_start, p); \
854 EXTRACT_CHARACTER (range_end, p + 3); \
855 \
856 if (range_start <= (c) && (c) <= range_end) \
857 { \
858 (not) = !(not); \
859 break; \
860 } \
861 } \
862 } \
863 while (0)
864
865 /* Test if C is in range table of CHARSET. The flag NOT is negated if
866 C is listed in it. */
867 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
868 do \
869 { \
870 /* Number of ranges in range table. */ \
871 int count; \
872 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
873 \
874 EXTRACT_NUMBER_AND_INCR (count, range_table); \
875 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
876 } \
877 while (0)
878 \f
879 /* If DEBUG is defined, Regex prints many voluminous messages about what
880 it is doing (if the variable `debug' is nonzero). If linked with the
881 main program in `iregex.c', you can enter patterns and strings
882 interactively. And if linked with the main program in `main.c' and
883 the other test files, you can run the already-written tests. */
884
885 #ifdef DEBUG
886
887 /* We use standard I/O for debugging. */
888 # include <stdio.h>
889
890 /* It is useful to test things that ``must'' be true when debugging. */
891 # include <assert.h>
892
893 static int debug = -100000;
894
895 # define DEBUG_STATEMENT(e) e
896 # define DEBUG_PRINT1(x) if (debug > 0) printf (x)
897 # define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
898 # define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
899 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
900 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
901 if (debug > 0) print_partial_compiled_pattern (s, e)
902 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
903 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
904
905
906 /* Print the fastmap in human-readable form. */
907
908 void
909 print_fastmap (fastmap)
910 char *fastmap;
911 {
912 unsigned was_a_range = 0;
913 unsigned i = 0;
914
915 while (i < (1 << BYTEWIDTH))
916 {
917 if (fastmap[i++])
918 {
919 was_a_range = 0;
920 putchar (i - 1);
921 while (i < (1 << BYTEWIDTH) && fastmap[i])
922 {
923 was_a_range = 1;
924 i++;
925 }
926 if (was_a_range)
927 {
928 printf ("-");
929 putchar (i - 1);
930 }
931 }
932 }
933 putchar ('\n');
934 }
935
936
937 /* Print a compiled pattern string in human-readable form, starting at
938 the START pointer into it and ending just before the pointer END. */
939
940 void
941 print_partial_compiled_pattern (start, end)
942 re_char *start;
943 re_char *end;
944 {
945 int mcnt, mcnt2;
946 re_char *p = start;
947 re_char *pend = end;
948
949 if (start == NULL)
950 {
951 printf ("(null)\n");
952 return;
953 }
954
955 /* Loop over pattern commands. */
956 while (p < pend)
957 {
958 printf ("%d:\t", p - start);
959
960 switch ((re_opcode_t) *p++)
961 {
962 case no_op:
963 printf ("/no_op");
964 break;
965
966 case succeed:
967 printf ("/succeed");
968 break;
969
970 case exactn:
971 mcnt = *p++;
972 printf ("/exactn/%d", mcnt);
973 do
974 {
975 putchar ('/');
976 putchar (*p++);
977 }
978 while (--mcnt);
979 break;
980
981 case start_memory:
982 printf ("/start_memory/%d", *p++);
983 break;
984
985 case stop_memory:
986 printf ("/stop_memory/%d", *p++);
987 break;
988
989 case duplicate:
990 printf ("/duplicate/%d", *p++);
991 break;
992
993 case anychar:
994 printf ("/anychar");
995 break;
996
997 case charset:
998 case charset_not:
999 {
1000 register int c, last = -100;
1001 register int in_range = 0;
1002 int length = CHARSET_BITMAP_SIZE (p - 1);
1003 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
1004
1005 printf ("/charset [%s",
1006 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
1007
1008 assert (p + *p < pend);
1009
1010 for (c = 0; c < 256; c++)
1011 if (c / 8 < length
1012 && (p[1 + (c/8)] & (1 << (c % 8))))
1013 {
1014 /* Are we starting a range? */
1015 if (last + 1 == c && ! in_range)
1016 {
1017 putchar ('-');
1018 in_range = 1;
1019 }
1020 /* Have we broken a range? */
1021 else if (last + 1 != c && in_range)
1022 {
1023 putchar (last);
1024 in_range = 0;
1025 }
1026
1027 if (! in_range)
1028 putchar (c);
1029
1030 last = c;
1031 }
1032
1033 if (in_range)
1034 putchar (last);
1035
1036 putchar (']');
1037
1038 p += 1 + length;
1039
1040 if (has_range_table)
1041 {
1042 int count;
1043 printf ("has-range-table");
1044
1045 /* ??? Should print the range table; for now, just skip it. */
1046 p += 2; /* skip range table bits */
1047 EXTRACT_NUMBER_AND_INCR (count, p);
1048 p = CHARSET_RANGE_TABLE_END (p, count);
1049 }
1050 }
1051 break;
1052
1053 case begline:
1054 printf ("/begline");
1055 break;
1056
1057 case endline:
1058 printf ("/endline");
1059 break;
1060
1061 case on_failure_jump:
1062 extract_number_and_incr (&mcnt, &p);
1063 printf ("/on_failure_jump to %d", p + mcnt - start);
1064 break;
1065
1066 case on_failure_keep_string_jump:
1067 extract_number_and_incr (&mcnt, &p);
1068 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
1069 break;
1070
1071 case on_failure_jump_nastyloop:
1072 extract_number_and_incr (&mcnt, &p);
1073 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
1074 break;
1075
1076 case on_failure_jump_loop:
1077 extract_number_and_incr (&mcnt, &p);
1078 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
1079 break;
1080
1081 case on_failure_jump_smart:
1082 extract_number_and_incr (&mcnt, &p);
1083 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
1084 break;
1085
1086 case jump:
1087 extract_number_and_incr (&mcnt, &p);
1088 printf ("/jump to %d", p + mcnt - start);
1089 break;
1090
1091 case succeed_n:
1092 extract_number_and_incr (&mcnt, &p);
1093 extract_number_and_incr (&mcnt2, &p);
1094 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1095 break;
1096
1097 case jump_n:
1098 extract_number_and_incr (&mcnt, &p);
1099 extract_number_and_incr (&mcnt2, &p);
1100 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1101 break;
1102
1103 case set_number_at:
1104 extract_number_and_incr (&mcnt, &p);
1105 extract_number_and_incr (&mcnt2, &p);
1106 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
1107 break;
1108
1109 case wordbound:
1110 printf ("/wordbound");
1111 break;
1112
1113 case notwordbound:
1114 printf ("/notwordbound");
1115 break;
1116
1117 case wordbeg:
1118 printf ("/wordbeg");
1119 break;
1120
1121 case wordend:
1122 printf ("/wordend");
1123
1124 case syntaxspec:
1125 printf ("/syntaxspec");
1126 mcnt = *p++;
1127 printf ("/%d", mcnt);
1128 break;
1129
1130 case notsyntaxspec:
1131 printf ("/notsyntaxspec");
1132 mcnt = *p++;
1133 printf ("/%d", mcnt);
1134 break;
1135
1136 # ifdef emacs
1137 case before_dot:
1138 printf ("/before_dot");
1139 break;
1140
1141 case at_dot:
1142 printf ("/at_dot");
1143 break;
1144
1145 case after_dot:
1146 printf ("/after_dot");
1147 break;
1148
1149 case categoryspec:
1150 printf ("/categoryspec");
1151 mcnt = *p++;
1152 printf ("/%d", mcnt);
1153 break;
1154
1155 case notcategoryspec:
1156 printf ("/notcategoryspec");
1157 mcnt = *p++;
1158 printf ("/%d", mcnt);
1159 break;
1160 # endif /* emacs */
1161
1162 case begbuf:
1163 printf ("/begbuf");
1164 break;
1165
1166 case endbuf:
1167 printf ("/endbuf");
1168 break;
1169
1170 default:
1171 printf ("?%d", *(p-1));
1172 }
1173
1174 putchar ('\n');
1175 }
1176
1177 printf ("%d:\tend of pattern.\n", p - start);
1178 }
1179
1180
1181 void
1182 print_compiled_pattern (bufp)
1183 struct re_pattern_buffer *bufp;
1184 {
1185 re_char *buffer = bufp->buffer;
1186
1187 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1188 printf ("%ld bytes used/%ld bytes allocated.\n",
1189 bufp->used, bufp->allocated);
1190
1191 if (bufp->fastmap_accurate && bufp->fastmap)
1192 {
1193 printf ("fastmap: ");
1194 print_fastmap (bufp->fastmap);
1195 }
1196
1197 printf ("re_nsub: %d\t", bufp->re_nsub);
1198 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1199 printf ("can_be_null: %d\t", bufp->can_be_null);
1200 printf ("no_sub: %d\t", bufp->no_sub);
1201 printf ("not_bol: %d\t", bufp->not_bol);
1202 printf ("not_eol: %d\t", bufp->not_eol);
1203 printf ("syntax: %lx\n", bufp->syntax);
1204 fflush (stdout);
1205 /* Perhaps we should print the translate table? */
1206 }
1207
1208
1209 void
1210 print_double_string (where, string1, size1, string2, size2)
1211 re_char *where;
1212 re_char *string1;
1213 re_char *string2;
1214 int size1;
1215 int size2;
1216 {
1217 int this_char;
1218
1219 if (where == NULL)
1220 printf ("(null)");
1221 else
1222 {
1223 if (FIRST_STRING_P (where))
1224 {
1225 for (this_char = where - string1; this_char < size1; this_char++)
1226 putchar (string1[this_char]);
1227
1228 where = string2;
1229 }
1230
1231 for (this_char = where - string2; this_char < size2; this_char++)
1232 putchar (string2[this_char]);
1233 }
1234 }
1235
1236 #else /* not DEBUG */
1237
1238 # undef assert
1239 # define assert(e)
1240
1241 # define DEBUG_STATEMENT(e)
1242 # define DEBUG_PRINT1(x)
1243 # define DEBUG_PRINT2(x1, x2)
1244 # define DEBUG_PRINT3(x1, x2, x3)
1245 # define DEBUG_PRINT4(x1, x2, x3, x4)
1246 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1247 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1248
1249 #endif /* not DEBUG */
1250 \f
1251 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1252 also be assigned to arbitrarily: each pattern buffer stores its own
1253 syntax, so it can be changed between regex compilations. */
1254 /* This has no initializer because initialized variables in Emacs
1255 become read-only after dumping. */
1256 reg_syntax_t re_syntax_options;
1257
1258
1259 /* Specify the precise syntax of regexps for compilation. This provides
1260 for compatibility for various utilities which historically have
1261 different, incompatible syntaxes.
1262
1263 The argument SYNTAX is a bit mask comprised of the various bits
1264 defined in regex.h. We return the old syntax. */
1265
1266 reg_syntax_t
1267 re_set_syntax (syntax)
1268 reg_syntax_t syntax;
1269 {
1270 reg_syntax_t ret = re_syntax_options;
1271
1272 re_syntax_options = syntax;
1273 return ret;
1274 }
1275 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1276 \f
1277 /* This table gives an error message for each of the error codes listed
1278 in regex.h. Obviously the order here has to be same as there.
1279 POSIX doesn't require that we do anything for REG_NOERROR,
1280 but why not be nice? */
1281
1282 static const char *re_error_msgid[] =
1283 {
1284 gettext_noop ("Success"), /* REG_NOERROR */
1285 gettext_noop ("No match"), /* REG_NOMATCH */
1286 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1287 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1288 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1289 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1290 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1291 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1292 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1293 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1294 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1295 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1296 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1297 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1298 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1299 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1300 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1301 };
1302 \f
1303 /* Avoiding alloca during matching, to placate r_alloc. */
1304
1305 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1306 searching and matching functions should not call alloca. On some
1307 systems, alloca is implemented in terms of malloc, and if we're
1308 using the relocating allocator routines, then malloc could cause a
1309 relocation, which might (if the strings being searched are in the
1310 ralloc heap) shift the data out from underneath the regexp
1311 routines.
1312
1313 Here's another reason to avoid allocation: Emacs
1314 processes input from X in a signal handler; processing X input may
1315 call malloc; if input arrives while a matching routine is calling
1316 malloc, then we're scrod. But Emacs can't just block input while
1317 calling matching routines; then we don't notice interrupts when
1318 they come in. So, Emacs blocks input around all regexp calls
1319 except the matching calls, which it leaves unprotected, in the
1320 faith that they will not malloc. */
1321
1322 /* Normally, this is fine. */
1323 #define MATCH_MAY_ALLOCATE
1324
1325 /* When using GNU C, we are not REALLY using the C alloca, no matter
1326 what config.h may say. So don't take precautions for it. */
1327 #ifdef __GNUC__
1328 # undef C_ALLOCA
1329 #endif
1330
1331 /* The match routines may not allocate if (1) they would do it with malloc
1332 and (2) it's not safe for them to use malloc.
1333 Note that if REL_ALLOC is defined, matching would not use malloc for the
1334 failure stack, but we would still use it for the register vectors;
1335 so REL_ALLOC should not affect this. */
1336 #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1337 # undef MATCH_MAY_ALLOCATE
1338 #endif
1339
1340 \f
1341 /* Failure stack declarations and macros; both re_compile_fastmap and
1342 re_match_2 use a failure stack. These have to be macros because of
1343 REGEX_ALLOCATE_STACK. */
1344
1345
1346 /* Approximate number of failure points for which to initially allocate space
1347 when matching. If this number is exceeded, we allocate more
1348 space, so it is not a hard limit. */
1349 #ifndef INIT_FAILURE_ALLOC
1350 # define INIT_FAILURE_ALLOC 20
1351 #endif
1352
1353 /* Roughly the maximum number of failure points on the stack. Would be
1354 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1355 This is a variable only so users of regex can assign to it; we never
1356 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1357 before using it, so it should probably be a byte-count instead. */
1358 # if defined MATCH_MAY_ALLOCATE
1359 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
1360 whose default stack limit is 2mb. In order for a larger
1361 value to work reliably, you have to try to make it accord
1362 with the process stack limit. */
1363 size_t re_max_failures = 40000;
1364 # else
1365 size_t re_max_failures = 4000;
1366 # endif
1367
1368 union fail_stack_elt
1369 {
1370 re_char *pointer;
1371 /* This should be the biggest `int' that's no bigger than a pointer. */
1372 long integer;
1373 };
1374
1375 typedef union fail_stack_elt fail_stack_elt_t;
1376
1377 typedef struct
1378 {
1379 fail_stack_elt_t *stack;
1380 size_t size;
1381 size_t avail; /* Offset of next open position. */
1382 size_t frame; /* Offset of the cur constructed frame. */
1383 } fail_stack_type;
1384
1385 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1386 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1387
1388
1389 /* Define macros to initialize and free the failure stack.
1390 Do `return -2' if the alloc fails. */
1391
1392 #ifdef MATCH_MAY_ALLOCATE
1393 # define INIT_FAIL_STACK() \
1394 do { \
1395 fail_stack.stack = (fail_stack_elt_t *) \
1396 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1397 * sizeof (fail_stack_elt_t)); \
1398 \
1399 if (fail_stack.stack == NULL) \
1400 return -2; \
1401 \
1402 fail_stack.size = INIT_FAILURE_ALLOC; \
1403 fail_stack.avail = 0; \
1404 fail_stack.frame = 0; \
1405 } while (0)
1406
1407 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1408 #else
1409 # define INIT_FAIL_STACK() \
1410 do { \
1411 fail_stack.avail = 0; \
1412 fail_stack.frame = 0; \
1413 } while (0)
1414
1415 # define RESET_FAIL_STACK() ((void)0)
1416 #endif
1417
1418
1419 /* Double the size of FAIL_STACK, up to a limit
1420 which allows approximately `re_max_failures' items.
1421
1422 Return 1 if succeeds, and 0 if either ran out of memory
1423 allocating space for it or it was already too large.
1424
1425 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1426
1427 /* Factor to increase the failure stack size by
1428 when we increase it.
1429 This used to be 2, but 2 was too wasteful
1430 because the old discarded stacks added up to as much space
1431 were as ultimate, maximum-size stack. */
1432 #define FAIL_STACK_GROWTH_FACTOR 4
1433
1434 #define GROW_FAIL_STACK(fail_stack) \
1435 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1436 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1437 ? 0 \
1438 : ((fail_stack).stack \
1439 = (fail_stack_elt_t *) \
1440 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1441 (fail_stack).size * sizeof (fail_stack_elt_t), \
1442 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1443 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1444 * FAIL_STACK_GROWTH_FACTOR))), \
1445 \
1446 (fail_stack).stack == NULL \
1447 ? 0 \
1448 : ((fail_stack).size \
1449 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1450 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1451 * FAIL_STACK_GROWTH_FACTOR)) \
1452 / sizeof (fail_stack_elt_t)), \
1453 1)))
1454
1455
1456 /* Push a pointer value onto the failure stack.
1457 Assumes the variable `fail_stack'. Probably should only
1458 be called from within `PUSH_FAILURE_POINT'. */
1459 #define PUSH_FAILURE_POINTER(item) \
1460 fail_stack.stack[fail_stack.avail++].pointer = (item)
1461
1462 /* This pushes an integer-valued item onto the failure stack.
1463 Assumes the variable `fail_stack'. Probably should only
1464 be called from within `PUSH_FAILURE_POINT'. */
1465 #define PUSH_FAILURE_INT(item) \
1466 fail_stack.stack[fail_stack.avail++].integer = (item)
1467
1468 /* Push a fail_stack_elt_t value onto the failure stack.
1469 Assumes the variable `fail_stack'. Probably should only
1470 be called from within `PUSH_FAILURE_POINT'. */
1471 #define PUSH_FAILURE_ELT(item) \
1472 fail_stack.stack[fail_stack.avail++] = (item)
1473
1474 /* These three POP... operations complement the three PUSH... operations.
1475 All assume that `fail_stack' is nonempty. */
1476 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1477 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1478 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1479
1480 /* Individual items aside from the registers. */
1481 #define NUM_NONREG_ITEMS 3
1482
1483 /* Used to examine the stack (to detect infinite loops). */
1484 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1485 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1486 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1487 #define TOP_FAILURE_HANDLE() fail_stack.frame
1488
1489
1490 #define ENSURE_FAIL_STACK(space) \
1491 while (REMAINING_AVAIL_SLOTS <= space) { \
1492 if (!GROW_FAIL_STACK (fail_stack)) \
1493 return -2; \
1494 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1495 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1496 }
1497
1498 /* Push register NUM onto the stack. */
1499 #define PUSH_FAILURE_REG(num) \
1500 do { \
1501 char *destination; \
1502 ENSURE_FAIL_STACK(3); \
1503 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1504 num, regstart[num], regend[num]); \
1505 PUSH_FAILURE_POINTER (regstart[num]); \
1506 PUSH_FAILURE_POINTER (regend[num]); \
1507 PUSH_FAILURE_INT (num); \
1508 } while (0)
1509
1510 /* Change the counter's value to VAL, but make sure that it will
1511 be reset when backtracking. */
1512 #define PUSH_NUMBER(ptr,val) \
1513 do { \
1514 char *destination; \
1515 int c; \
1516 ENSURE_FAIL_STACK(3); \
1517 EXTRACT_NUMBER (c, ptr); \
1518 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
1519 PUSH_FAILURE_INT (c); \
1520 PUSH_FAILURE_POINTER (ptr); \
1521 PUSH_FAILURE_INT (-1); \
1522 STORE_NUMBER (ptr, val); \
1523 } while (0)
1524
1525 /* Pop a saved register off the stack. */
1526 #define POP_FAILURE_REG_OR_COUNT() \
1527 do { \
1528 int reg = POP_FAILURE_INT (); \
1529 if (reg == -1) \
1530 { \
1531 /* It's a counter. */ \
1532 /* Here, we discard `const', making re_match non-reentrant. */ \
1533 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
1534 reg = POP_FAILURE_INT (); \
1535 STORE_NUMBER (ptr, reg); \
1536 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1537 } \
1538 else \
1539 { \
1540 regend[reg] = POP_FAILURE_POINTER (); \
1541 regstart[reg] = POP_FAILURE_POINTER (); \
1542 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1543 reg, regstart[reg], regend[reg]); \
1544 } \
1545 } while (0)
1546
1547 /* Check that we are not stuck in an infinite loop. */
1548 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1549 do { \
1550 int failure = TOP_FAILURE_HANDLE(); \
1551 /* Check for infinite matching loops */ \
1552 while (failure > 0 && \
1553 (FAILURE_STR (failure) == string_place \
1554 || FAILURE_STR (failure) == NULL)) \
1555 { \
1556 assert (FAILURE_PAT (failure) >= bufp->buffer \
1557 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1558 if (FAILURE_PAT (failure) == pat_cur) \
1559 goto fail; \
1560 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1561 failure = NEXT_FAILURE_HANDLE(failure); \
1562 } \
1563 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1564 } while (0)
1565
1566 /* Push the information about the state we will need
1567 if we ever fail back to it.
1568
1569 Requires variables fail_stack, regstart, regend and
1570 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1571 declared.
1572
1573 Does `return FAILURE_CODE' if runs out of memory. */
1574
1575 #define PUSH_FAILURE_POINT(pattern, string_place) \
1576 do { \
1577 char *destination; \
1578 /* Must be int, so when we don't save any registers, the arithmetic \
1579 of 0 + -1 isn't done as unsigned. */ \
1580 \
1581 DEBUG_STATEMENT (nfailure_points_pushed++); \
1582 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
1583 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1584 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1585 \
1586 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1587 \
1588 DEBUG_PRINT1 ("\n"); \
1589 \
1590 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1591 PUSH_FAILURE_INT (fail_stack.frame); \
1592 \
1593 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1594 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1595 DEBUG_PRINT1 ("'\n"); \
1596 PUSH_FAILURE_POINTER (string_place); \
1597 \
1598 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1599 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1600 PUSH_FAILURE_POINTER (pattern); \
1601 \
1602 /* Close the frame by moving the frame pointer past it. */ \
1603 fail_stack.frame = fail_stack.avail; \
1604 } while (0)
1605
1606 /* Estimate the size of data pushed by a typical failure stack entry.
1607 An estimate is all we need, because all we use this for
1608 is to choose a limit for how big to make the failure stack. */
1609 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1610 #define TYPICAL_FAILURE_SIZE 20
1611
1612 /* How many items can still be added to the stack without overflowing it. */
1613 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1614
1615
1616 /* Pops what PUSH_FAIL_STACK pushes.
1617
1618 We restore into the parameters, all of which should be lvalues:
1619 STR -- the saved data position.
1620 PAT -- the saved pattern position.
1621 REGSTART, REGEND -- arrays of string positions.
1622
1623 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1624 `pend', `string1', `size1', `string2', and `size2'. */
1625
1626 #define POP_FAILURE_POINT(str, pat) \
1627 do { \
1628 assert (!FAIL_STACK_EMPTY ()); \
1629 \
1630 /* Remove failure points and point to how many regs pushed. */ \
1631 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1632 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1633 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1634 \
1635 /* Pop the saved registers. */ \
1636 while (fail_stack.frame < fail_stack.avail) \
1637 POP_FAILURE_REG_OR_COUNT (); \
1638 \
1639 pat = POP_FAILURE_POINTER (); \
1640 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1641 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1642 \
1643 /* If the saved string location is NULL, it came from an \
1644 on_failure_keep_string_jump opcode, and we want to throw away the \
1645 saved NULL, thus retaining our current position in the string. */ \
1646 str = POP_FAILURE_POINTER (); \
1647 DEBUG_PRINT2 (" Popping string %p: `", str); \
1648 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1649 DEBUG_PRINT1 ("'\n"); \
1650 \
1651 fail_stack.frame = POP_FAILURE_INT (); \
1652 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1653 \
1654 assert (fail_stack.avail >= 0); \
1655 assert (fail_stack.frame <= fail_stack.avail); \
1656 \
1657 DEBUG_STATEMENT (nfailure_points_popped++); \
1658 } while (0) /* POP_FAILURE_POINT */
1659
1660
1661 \f
1662 /* Registers are set to a sentinel when they haven't yet matched. */
1663 #define REG_UNSET(e) ((e) == NULL)
1664 \f
1665 /* Subroutine declarations and macros for regex_compile. */
1666
1667 static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1668 reg_syntax_t syntax,
1669 struct re_pattern_buffer *bufp));
1670 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1671 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1672 int arg1, int arg2));
1673 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1674 int arg, unsigned char *end));
1675 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1676 int arg1, int arg2, unsigned char *end));
1677 static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1678 re_char *p,
1679 reg_syntax_t syntax));
1680 static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1681 re_char *pend,
1682 reg_syntax_t syntax));
1683 static re_char *skip_one_char _RE_ARGS ((re_char *p));
1684 static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
1685 char *fastmap, const int multibyte));
1686
1687 /* Fetch the next character in the uncompiled pattern---translating it
1688 if necessary. */
1689 #define PATFETCH(c) \
1690 do { \
1691 PATFETCH_RAW (c); \
1692 if (! multibyte) \
1693 MAKE_CHAR_MULTIBYTE (c); \
1694 c = TRANSLATE (c); \
1695 if (! target_multibyte) \
1696 MAKE_CHAR_UNIBYTE (c); \
1697 } while (0)
1698
1699 /* Fetch the next character in the uncompiled pattern, with no
1700 translation. */
1701 #define PATFETCH_RAW(c) \
1702 do { \
1703 int len; \
1704 if (p == pend) return REG_EEND; \
1705 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1706 p += len; \
1707 } while (0)
1708
1709
1710 /* If `translate' is non-null, return translate[D], else just D. We
1711 cast the subscript to translate because some data is declared as
1712 `char *', to avoid warnings when a string constant is passed. But
1713 when we use a character as a subscript we must make it unsigned. */
1714 #ifndef TRANSLATE
1715 # define TRANSLATE(d) \
1716 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1717 #endif
1718
1719
1720 /* Macros for outputting the compiled pattern into `buffer'. */
1721
1722 /* If the buffer isn't allocated when it comes in, use this. */
1723 #define INIT_BUF_SIZE 32
1724
1725 /* Make sure we have at least N more bytes of space in buffer. */
1726 #define GET_BUFFER_SPACE(n) \
1727 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
1728 EXTEND_BUFFER ()
1729
1730 /* Make sure we have one more byte of buffer space and then add C to it. */
1731 #define BUF_PUSH(c) \
1732 do { \
1733 GET_BUFFER_SPACE (1); \
1734 *b++ = (unsigned char) (c); \
1735 } while (0)
1736
1737
1738 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1739 #define BUF_PUSH_2(c1, c2) \
1740 do { \
1741 GET_BUFFER_SPACE (2); \
1742 *b++ = (unsigned char) (c1); \
1743 *b++ = (unsigned char) (c2); \
1744 } while (0)
1745
1746
1747 /* As with BUF_PUSH_2, except for three bytes. */
1748 #define BUF_PUSH_3(c1, c2, c3) \
1749 do { \
1750 GET_BUFFER_SPACE (3); \
1751 *b++ = (unsigned char) (c1); \
1752 *b++ = (unsigned char) (c2); \
1753 *b++ = (unsigned char) (c3); \
1754 } while (0)
1755
1756
1757 /* Store a jump with opcode OP at LOC to location TO. We store a
1758 relative address offset by the three bytes the jump itself occupies. */
1759 #define STORE_JUMP(op, loc, to) \
1760 store_op1 (op, loc, (to) - (loc) - 3)
1761
1762 /* Likewise, for a two-argument jump. */
1763 #define STORE_JUMP2(op, loc, to, arg) \
1764 store_op2 (op, loc, (to) - (loc) - 3, arg)
1765
1766 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1767 #define INSERT_JUMP(op, loc, to) \
1768 insert_op1 (op, loc, (to) - (loc) - 3, b)
1769
1770 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1771 #define INSERT_JUMP2(op, loc, to, arg) \
1772 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1773
1774
1775 /* This is not an arbitrary limit: the arguments which represent offsets
1776 into the pattern are two bytes long. So if 2^16 bytes turns out to
1777 be too small, many things would have to change. */
1778 /* Any other compiler which, like MSC, has allocation limit below 2^16
1779 bytes will have to use approach similar to what was done below for
1780 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1781 reallocating to 0 bytes. Such thing is not going to work too well.
1782 You have been warned!! */
1783 #if defined _MSC_VER && !defined WIN32
1784 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1785 # define MAX_BUF_SIZE 65500L
1786 #else
1787 # define MAX_BUF_SIZE (1L << 16)
1788 #endif
1789
1790 /* Extend the buffer by twice its current size via realloc and
1791 reset the pointers that pointed into the old block to point to the
1792 correct places in the new one. If extending the buffer results in it
1793 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1794 #if __BOUNDED_POINTERS__
1795 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1796 # define MOVE_BUFFER_POINTER(P) \
1797 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1798 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1799 else \
1800 { \
1801 SET_HIGH_BOUND (b); \
1802 SET_HIGH_BOUND (begalt); \
1803 if (fixup_alt_jump) \
1804 SET_HIGH_BOUND (fixup_alt_jump); \
1805 if (laststart) \
1806 SET_HIGH_BOUND (laststart); \
1807 if (pending_exact) \
1808 SET_HIGH_BOUND (pending_exact); \
1809 }
1810 #else
1811 # define MOVE_BUFFER_POINTER(P) (P) += incr
1812 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1813 #endif
1814 #define EXTEND_BUFFER() \
1815 do { \
1816 re_char *old_buffer = bufp->buffer; \
1817 if (bufp->allocated == MAX_BUF_SIZE) \
1818 return REG_ESIZE; \
1819 bufp->allocated <<= 1; \
1820 if (bufp->allocated > MAX_BUF_SIZE) \
1821 bufp->allocated = MAX_BUF_SIZE; \
1822 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
1823 if (bufp->buffer == NULL) \
1824 return REG_ESPACE; \
1825 /* If the buffer moved, move all the pointers into it. */ \
1826 if (old_buffer != bufp->buffer) \
1827 { \
1828 int incr = bufp->buffer - old_buffer; \
1829 MOVE_BUFFER_POINTER (b); \
1830 MOVE_BUFFER_POINTER (begalt); \
1831 if (fixup_alt_jump) \
1832 MOVE_BUFFER_POINTER (fixup_alt_jump); \
1833 if (laststart) \
1834 MOVE_BUFFER_POINTER (laststart); \
1835 if (pending_exact) \
1836 MOVE_BUFFER_POINTER (pending_exact); \
1837 } \
1838 ELSE_EXTEND_BUFFER_HIGH_BOUND \
1839 } while (0)
1840
1841
1842 /* Since we have one byte reserved for the register number argument to
1843 {start,stop}_memory, the maximum number of groups we can report
1844 things about is what fits in that byte. */
1845 #define MAX_REGNUM 255
1846
1847 /* But patterns can have more than `MAX_REGNUM' registers. We just
1848 ignore the excess. */
1849 typedef unsigned regnum_t;
1850
1851
1852 /* Macros for the compile stack. */
1853
1854 /* Since offsets can go either forwards or backwards, this type needs to
1855 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1856 /* int may be not enough when sizeof(int) == 2. */
1857 typedef long pattern_offset_t;
1858
1859 typedef struct
1860 {
1861 pattern_offset_t begalt_offset;
1862 pattern_offset_t fixup_alt_jump;
1863 pattern_offset_t laststart_offset;
1864 regnum_t regnum;
1865 } compile_stack_elt_t;
1866
1867
1868 typedef struct
1869 {
1870 compile_stack_elt_t *stack;
1871 unsigned size;
1872 unsigned avail; /* Offset of next open position. */
1873 } compile_stack_type;
1874
1875
1876 #define INIT_COMPILE_STACK_SIZE 32
1877
1878 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1879 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1880
1881 /* The next available element. */
1882 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1883
1884
1885 /* Structure to manage work area for range table. */
1886 struct range_table_work_area
1887 {
1888 int *table; /* actual work area. */
1889 int allocated; /* allocated size for work area in bytes. */
1890 int used; /* actually used size in words. */
1891 int bits; /* flag to record character classes */
1892 };
1893
1894 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1895 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1896 do { \
1897 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1898 { \
1899 (work_area).allocated += 16 * sizeof (int); \
1900 if ((work_area).table) \
1901 (work_area).table \
1902 = (int *) realloc ((work_area).table, (work_area).allocated); \
1903 else \
1904 (work_area).table \
1905 = (int *) malloc ((work_area).allocated); \
1906 if ((work_area).table == 0) \
1907 FREE_STACK_RETURN (REG_ESPACE); \
1908 } \
1909 } while (0)
1910
1911 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1912 (work_area).bits |= (bit)
1913
1914 /* Bits used to implement the multibyte-part of the various character classes
1915 such as [:alnum:] in a charset's range table. */
1916 #define BIT_WORD 0x1
1917 #define BIT_LOWER 0x2
1918 #define BIT_PUNCT 0x4
1919 #define BIT_SPACE 0x8
1920 #define BIT_UPPER 0x10
1921 #define BIT_MULTIBYTE 0x20
1922
1923 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1924 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1925 do { \
1926 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1927 (work_area).table[(work_area).used++] = (range_start); \
1928 (work_area).table[(work_area).used++] = (range_end); \
1929 } while (0)
1930
1931 /* Free allocated memory for WORK_AREA. */
1932 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1933 do { \
1934 if ((work_area).table) \
1935 free ((work_area).table); \
1936 } while (0)
1937
1938 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1939 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1940 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1941 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1942
1943
1944 /* Set the bit for character C in a list. */
1945 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1946
1947
1948 #ifdef emacs
1949
1950 /* It is better to implement this jumbo macro by a function, but it's
1951 not that easy because macros called within it assumes various
1952 variables being defined. */
1953
1954 #define SETUP_MULTIBYTE_RANGE(work_area, c0, c1) \
1955 do { \
1956 re_wchar_t c, t, t_last; \
1957 int n; \
1958 \
1959 c = (c0); \
1960 t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1961 for (c++, n = 1; c <= (c1); c++, n++) \
1962 { \
1963 t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1964 if (t_last + n == t) \
1965 continue; \
1966 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
1967 t_last = t; \
1968 n = 1; \
1969 } \
1970 if (n > 0) \
1971 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
1972 } while (0)
1973
1974
1975 #endif /* emacs */
1976
1977 /* Get the next unsigned number in the uncompiled pattern. */
1978 #define GET_UNSIGNED_NUMBER(num) \
1979 do { if (p != pend) \
1980 { \
1981 PATFETCH (c); \
1982 while ('0' <= c && c <= '9') \
1983 { \
1984 if (num < 0) \
1985 num = 0; \
1986 num = num * 10 + c - '0'; \
1987 if (p == pend) \
1988 break; \
1989 PATFETCH (c); \
1990 } \
1991 } \
1992 } while (0)
1993
1994 #if WIDE_CHAR_SUPPORT
1995 /* The GNU C library provides support for user-defined character classes
1996 and the functions from ISO C amendement 1. */
1997 # ifdef CHARCLASS_NAME_MAX
1998 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1999 # else
2000 /* This shouldn't happen but some implementation might still have this
2001 problem. Use a reasonable default value. */
2002 # define CHAR_CLASS_MAX_LENGTH 256
2003 # endif
2004 typedef wctype_t re_wctype_t;
2005 typedef wchar_t re_wchar_t;
2006 # define re_wctype wctype
2007 # define re_iswctype iswctype
2008 # define re_wctype_to_bit(cc) 0
2009 #else
2010 # define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
2011 # define btowc(c) c
2012
2013 /* Character classes. */
2014 typedef enum { RECC_ERROR = 0,
2015 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
2016 RECC_GRAPH, RECC_PRINT,
2017 RECC_LOWER, RECC_UPPER,
2018 RECC_PUNCT, RECC_CNTRL,
2019 RECC_DIGIT, RECC_XDIGIT,
2020 RECC_BLANK, RECC_SPACE,
2021 RECC_MULTIBYTE, RECC_NONASCII,
2022 RECC_ASCII, RECC_UNIBYTE
2023 } re_wctype_t;
2024
2025 typedef int re_wchar_t;
2026
2027 /* Map a string to the char class it names (if any). */
2028 static re_wctype_t
2029 re_wctype (str)
2030 re_char *str;
2031 {
2032 const char *string = str;
2033 if (STREQ (string, "alnum")) return RECC_ALNUM;
2034 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2035 else if (STREQ (string, "word")) return RECC_WORD;
2036 else if (STREQ (string, "ascii")) return RECC_ASCII;
2037 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2038 else if (STREQ (string, "graph")) return RECC_GRAPH;
2039 else if (STREQ (string, "lower")) return RECC_LOWER;
2040 else if (STREQ (string, "print")) return RECC_PRINT;
2041 else if (STREQ (string, "punct")) return RECC_PUNCT;
2042 else if (STREQ (string, "space")) return RECC_SPACE;
2043 else if (STREQ (string, "upper")) return RECC_UPPER;
2044 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2045 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2046 else if (STREQ (string, "digit")) return RECC_DIGIT;
2047 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2048 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2049 else if (STREQ (string, "blank")) return RECC_BLANK;
2050 else return 0;
2051 }
2052
2053 /* True iff CH is in the char class CC. */
2054 static boolean
2055 re_iswctype (ch, cc)
2056 int ch;
2057 re_wctype_t cc;
2058 {
2059 switch (cc)
2060 {
2061 case RECC_ALNUM: return ISALNUM (ch);
2062 case RECC_ALPHA: return ISALPHA (ch);
2063 case RECC_BLANK: return ISBLANK (ch);
2064 case RECC_CNTRL: return ISCNTRL (ch);
2065 case RECC_DIGIT: return ISDIGIT (ch);
2066 case RECC_GRAPH: return ISGRAPH (ch);
2067 case RECC_LOWER: return ISLOWER (ch);
2068 case RECC_PRINT: return ISPRINT (ch);
2069 case RECC_PUNCT: return ISPUNCT (ch);
2070 case RECC_SPACE: return ISSPACE (ch);
2071 case RECC_UPPER: return ISUPPER (ch);
2072 case RECC_XDIGIT: return ISXDIGIT (ch);
2073 case RECC_ASCII: return IS_REAL_ASCII (ch);
2074 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2075 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2076 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2077 case RECC_WORD: return ISWORD (ch);
2078 case RECC_ERROR: return false;
2079 default:
2080 abort();
2081 }
2082 }
2083
2084 /* Return a bit-pattern to use in the range-table bits to match multibyte
2085 chars of class CC. */
2086 static int
2087 re_wctype_to_bit (cc)
2088 re_wctype_t cc;
2089 {
2090 switch (cc)
2091 {
2092 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2093 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2094 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2095 case RECC_LOWER: return BIT_LOWER;
2096 case RECC_UPPER: return BIT_UPPER;
2097 case RECC_PUNCT: return BIT_PUNCT;
2098 case RECC_SPACE: return BIT_SPACE;
2099 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2100 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2101 default:
2102 abort();
2103 }
2104 }
2105 #endif
2106
2107 /* Explicit quit checking is only used on NTemacs. */
2108 #if defined WINDOWSNT && defined emacs && defined QUIT
2109 extern int immediate_quit;
2110 # define IMMEDIATE_QUIT_CHECK \
2111 do { \
2112 if (immediate_quit) QUIT; \
2113 } while (0)
2114 #else
2115 # define IMMEDIATE_QUIT_CHECK ((void)0)
2116 #endif
2117 \f
2118 #ifndef MATCH_MAY_ALLOCATE
2119
2120 /* If we cannot allocate large objects within re_match_2_internal,
2121 we make the fail stack and register vectors global.
2122 The fail stack, we grow to the maximum size when a regexp
2123 is compiled.
2124 The register vectors, we adjust in size each time we
2125 compile a regexp, according to the number of registers it needs. */
2126
2127 static fail_stack_type fail_stack;
2128
2129 /* Size with which the following vectors are currently allocated.
2130 That is so we can make them bigger as needed,
2131 but never make them smaller. */
2132 static int regs_allocated_size;
2133
2134 static re_char ** regstart, ** regend;
2135 static re_char **best_regstart, **best_regend;
2136
2137 /* Make the register vectors big enough for NUM_REGS registers,
2138 but don't make them smaller. */
2139
2140 static
2141 regex_grow_registers (num_regs)
2142 int num_regs;
2143 {
2144 if (num_regs > regs_allocated_size)
2145 {
2146 RETALLOC_IF (regstart, num_regs, re_char *);
2147 RETALLOC_IF (regend, num_regs, re_char *);
2148 RETALLOC_IF (best_regstart, num_regs, re_char *);
2149 RETALLOC_IF (best_regend, num_regs, re_char *);
2150
2151 regs_allocated_size = num_regs;
2152 }
2153 }
2154
2155 #endif /* not MATCH_MAY_ALLOCATE */
2156 \f
2157 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2158 compile_stack,
2159 regnum_t regnum));
2160
2161 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2162 Returns one of error codes defined in `regex.h', or zero for success.
2163
2164 Assumes the `allocated' (and perhaps `buffer') and `translate'
2165 fields are set in BUFP on entry.
2166
2167 If it succeeds, results are put in BUFP (if it returns an error, the
2168 contents of BUFP are undefined):
2169 `buffer' is the compiled pattern;
2170 `syntax' is set to SYNTAX;
2171 `used' is set to the length of the compiled pattern;
2172 `fastmap_accurate' is zero;
2173 `re_nsub' is the number of subexpressions in PATTERN;
2174 `not_bol' and `not_eol' are zero;
2175
2176 The `fastmap' field is neither examined nor set. */
2177
2178 /* Insert the `jump' from the end of last alternative to "here".
2179 The space for the jump has already been allocated. */
2180 #define FIXUP_ALT_JUMP() \
2181 do { \
2182 if (fixup_alt_jump) \
2183 STORE_JUMP (jump, fixup_alt_jump, b); \
2184 } while (0)
2185
2186
2187 /* Return, freeing storage we allocated. */
2188 #define FREE_STACK_RETURN(value) \
2189 do { \
2190 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2191 free (compile_stack.stack); \
2192 return value; \
2193 } while (0)
2194
2195 static reg_errcode_t
2196 regex_compile (pattern, size, syntax, bufp)
2197 re_char *pattern;
2198 size_t size;
2199 reg_syntax_t syntax;
2200 struct re_pattern_buffer *bufp;
2201 {
2202 /* We fetch characters from PATTERN here. */
2203 register re_wchar_t c, c1;
2204
2205 /* A random temporary spot in PATTERN. */
2206 re_char *p1;
2207
2208 /* Points to the end of the buffer, where we should append. */
2209 register unsigned char *b;
2210
2211 /* Keeps track of unclosed groups. */
2212 compile_stack_type compile_stack;
2213
2214 /* Points to the current (ending) position in the pattern. */
2215 #ifdef AIX
2216 /* `const' makes AIX compiler fail. */
2217 unsigned char *p = pattern;
2218 #else
2219 re_char *p = pattern;
2220 #endif
2221 re_char *pend = pattern + size;
2222
2223 /* How to translate the characters in the pattern. */
2224 RE_TRANSLATE_TYPE translate = bufp->translate;
2225
2226 /* Address of the count-byte of the most recently inserted `exactn'
2227 command. This makes it possible to tell if a new exact-match
2228 character can be added to that command or if the character requires
2229 a new `exactn' command. */
2230 unsigned char *pending_exact = 0;
2231
2232 /* Address of start of the most recently finished expression.
2233 This tells, e.g., postfix * where to find the start of its
2234 operand. Reset at the beginning of groups and alternatives. */
2235 unsigned char *laststart = 0;
2236
2237 /* Address of beginning of regexp, or inside of last group. */
2238 unsigned char *begalt;
2239
2240 /* Place in the uncompiled pattern (i.e., the {) to
2241 which to go back if the interval is invalid. */
2242 re_char *beg_interval;
2243
2244 /* Address of the place where a forward jump should go to the end of
2245 the containing expression. Each alternative of an `or' -- except the
2246 last -- ends with a forward jump of this sort. */
2247 unsigned char *fixup_alt_jump = 0;
2248
2249 /* Counts open-groups as they are encountered. Remembered for the
2250 matching close-group on the compile stack, so the same register
2251 number is put in the stop_memory as the start_memory. */
2252 regnum_t regnum = 0;
2253
2254 /* Work area for range table of charset. */
2255 struct range_table_work_area range_table_work;
2256
2257 /* If the object matched can contain multibyte characters. */
2258 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2259
2260 /* If a target can contain multibyte characters. */
2261 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2262
2263 #ifdef DEBUG
2264 debug++;
2265 DEBUG_PRINT1 ("\nCompiling pattern: ");
2266 if (debug > 0)
2267 {
2268 unsigned debug_count;
2269
2270 for (debug_count = 0; debug_count < size; debug_count++)
2271 putchar (pattern[debug_count]);
2272 putchar ('\n');
2273 }
2274 #endif /* DEBUG */
2275
2276 /* Initialize the compile stack. */
2277 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2278 if (compile_stack.stack == NULL)
2279 return REG_ESPACE;
2280
2281 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2282 compile_stack.avail = 0;
2283
2284 range_table_work.table = 0;
2285 range_table_work.allocated = 0;
2286
2287 /* Initialize the pattern buffer. */
2288 bufp->syntax = syntax;
2289 bufp->fastmap_accurate = 0;
2290 bufp->not_bol = bufp->not_eol = 0;
2291
2292 /* Set `used' to zero, so that if we return an error, the pattern
2293 printer (for debugging) will think there's no pattern. We reset it
2294 at the end. */
2295 bufp->used = 0;
2296
2297 /* Always count groups, whether or not bufp->no_sub is set. */
2298 bufp->re_nsub = 0;
2299
2300 #if !defined emacs && !defined SYNTAX_TABLE
2301 /* Initialize the syntax table. */
2302 init_syntax_once ();
2303 #endif
2304
2305 if (bufp->allocated == 0)
2306 {
2307 if (bufp->buffer)
2308 { /* If zero allocated, but buffer is non-null, try to realloc
2309 enough space. This loses if buffer's address is bogus, but
2310 that is the user's responsibility. */
2311 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2312 }
2313 else
2314 { /* Caller did not allocate a buffer. Do it for them. */
2315 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2316 }
2317 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2318
2319 bufp->allocated = INIT_BUF_SIZE;
2320 }
2321
2322 begalt = b = bufp->buffer;
2323
2324 /* Loop through the uncompiled pattern until we're at the end. */
2325 while (p != pend)
2326 {
2327 PATFETCH_RAW (c);
2328
2329 switch (c)
2330 {
2331 case '^':
2332 {
2333 if ( /* If at start of pattern, it's an operator. */
2334 p == pattern + 1
2335 /* If context independent, it's an operator. */
2336 || syntax & RE_CONTEXT_INDEP_ANCHORS
2337 /* Otherwise, depends on what's come before. */
2338 || at_begline_loc_p (pattern, p, syntax))
2339 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2340 else
2341 goto normal_char;
2342 }
2343 break;
2344
2345
2346 case '$':
2347 {
2348 if ( /* If at end of pattern, it's an operator. */
2349 p == pend
2350 /* If context independent, it's an operator. */
2351 || syntax & RE_CONTEXT_INDEP_ANCHORS
2352 /* Otherwise, depends on what's next. */
2353 || at_endline_loc_p (p, pend, syntax))
2354 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2355 else
2356 goto normal_char;
2357 }
2358 break;
2359
2360
2361 case '+':
2362 case '?':
2363 if ((syntax & RE_BK_PLUS_QM)
2364 || (syntax & RE_LIMITED_OPS))
2365 goto normal_char;
2366 handle_plus:
2367 case '*':
2368 /* If there is no previous pattern... */
2369 if (!laststart)
2370 {
2371 if (syntax & RE_CONTEXT_INVALID_OPS)
2372 FREE_STACK_RETURN (REG_BADRPT);
2373 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2374 goto normal_char;
2375 }
2376
2377 {
2378 /* 1 means zero (many) matches is allowed. */
2379 boolean zero_times_ok = 0, many_times_ok = 0;
2380 boolean greedy = 1;
2381
2382 /* If there is a sequence of repetition chars, collapse it
2383 down to just one (the right one). We can't combine
2384 interval operators with these because of, e.g., `a{2}*',
2385 which should only match an even number of `a's. */
2386
2387 for (;;)
2388 {
2389 if ((syntax & RE_FRUGAL)
2390 && c == '?' && (zero_times_ok || many_times_ok))
2391 greedy = 0;
2392 else
2393 {
2394 zero_times_ok |= c != '+';
2395 many_times_ok |= c != '?';
2396 }
2397
2398 if (p == pend)
2399 break;
2400 else if (*p == '*'
2401 || (!(syntax & RE_BK_PLUS_QM)
2402 && (*p == '+' || *p == '?')))
2403 ;
2404 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
2405 {
2406 if (p+1 == pend)
2407 FREE_STACK_RETURN (REG_EESCAPE);
2408 if (p[1] == '+' || p[1] == '?')
2409 PATFETCH_RAW (c); /* Gobble up the backslash. */
2410 else
2411 break;
2412 }
2413 else
2414 break;
2415 /* If we get here, we found another repeat character. */
2416 PATFETCH_RAW (c);
2417 }
2418
2419 /* Star, etc. applied to an empty pattern is equivalent
2420 to an empty pattern. */
2421 if (!laststart || laststart == b)
2422 break;
2423
2424 /* Now we know whether or not zero matches is allowed
2425 and also whether or not two or more matches is allowed. */
2426 if (greedy)
2427 {
2428 if (many_times_ok)
2429 {
2430 boolean simple = skip_one_char (laststart) == b;
2431 unsigned int startoffset = 0;
2432 re_opcode_t ofj =
2433 /* Check if the loop can match the empty string. */
2434 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2435 on_failure_jump : on_failure_jump_loop;
2436 assert (skip_one_char (laststart) <= b);
2437
2438 if (!zero_times_ok && simple)
2439 { /* Since simple * loops can be made faster by using
2440 on_failure_keep_string_jump, we turn simple P+
2441 into PP* if P is simple. */
2442 unsigned char *p1, *p2;
2443 startoffset = b - laststart;
2444 GET_BUFFER_SPACE (startoffset);
2445 p1 = b; p2 = laststart;
2446 while (p2 < p1)
2447 *b++ = *p2++;
2448 zero_times_ok = 1;
2449 }
2450
2451 GET_BUFFER_SPACE (6);
2452 if (!zero_times_ok)
2453 /* A + loop. */
2454 STORE_JUMP (ofj, b, b + 6);
2455 else
2456 /* Simple * loops can use on_failure_keep_string_jump
2457 depending on what follows. But since we don't know
2458 that yet, we leave the decision up to
2459 on_failure_jump_smart. */
2460 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2461 laststart + startoffset, b + 6);
2462 b += 3;
2463 STORE_JUMP (jump, b, laststart + startoffset);
2464 b += 3;
2465 }
2466 else
2467 {
2468 /* A simple ? pattern. */
2469 assert (zero_times_ok);
2470 GET_BUFFER_SPACE (3);
2471 INSERT_JUMP (on_failure_jump, laststart, b + 3);
2472 b += 3;
2473 }
2474 }
2475 else /* not greedy */
2476 { /* I wish the greedy and non-greedy cases could be merged. */
2477
2478 GET_BUFFER_SPACE (7); /* We might use less. */
2479 if (many_times_ok)
2480 {
2481 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2482
2483 /* The non-greedy multiple match looks like a repeat..until:
2484 we only need a conditional jump at the end of the loop */
2485 if (emptyp) BUF_PUSH (no_op);
2486 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2487 : on_failure_jump, b, laststart);
2488 b += 3;
2489 if (zero_times_ok)
2490 {
2491 /* The repeat...until naturally matches one or more.
2492 To also match zero times, we need to first jump to
2493 the end of the loop (its conditional jump). */
2494 INSERT_JUMP (jump, laststart, b);
2495 b += 3;
2496 }
2497 }
2498 else
2499 {
2500 /* non-greedy a?? */
2501 INSERT_JUMP (jump, laststart, b + 3);
2502 b += 3;
2503 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2504 b += 3;
2505 }
2506 }
2507 }
2508 pending_exact = 0;
2509 break;
2510
2511
2512 case '.':
2513 laststart = b;
2514 BUF_PUSH (anychar);
2515 break;
2516
2517
2518 case '[':
2519 {
2520 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2521
2522 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2523
2524 /* Ensure that we have enough space to push a charset: the
2525 opcode, the length count, and the bitset; 34 bytes in all. */
2526 GET_BUFFER_SPACE (34);
2527
2528 laststart = b;
2529
2530 /* We test `*p == '^' twice, instead of using an if
2531 statement, so we only need one BUF_PUSH. */
2532 BUF_PUSH (*p == '^' ? charset_not : charset);
2533 if (*p == '^')
2534 p++;
2535
2536 /* Remember the first position in the bracket expression. */
2537 p1 = p;
2538
2539 /* Push the number of bytes in the bitmap. */
2540 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2541
2542 /* Clear the whole map. */
2543 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2544
2545 /* charset_not matches newline according to a syntax bit. */
2546 if ((re_opcode_t) b[-2] == charset_not
2547 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2548 SET_LIST_BIT ('\n');
2549
2550 /* Read in characters and ranges, setting map bits. */
2551 for (;;)
2552 {
2553 boolean escaped_char = false;
2554 const unsigned char *p2 = p;
2555
2556 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2557
2558 PATFETCH_RAW (c);
2559
2560 /* \ might escape characters inside [...] and [^...]. */
2561 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2562 {
2563 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2564
2565 PATFETCH_RAW (c);
2566 escaped_char = true;
2567 }
2568 else
2569 {
2570 /* Could be the end of the bracket expression. If it's
2571 not (i.e., when the bracket expression is `[]' so
2572 far), the ']' character bit gets set way below. */
2573 if (c == ']' && p2 != p1)
2574 break;
2575 }
2576
2577 /* See if we're at the beginning of a possible character
2578 class. */
2579
2580 if (!escaped_char &&
2581 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2582 {
2583 /* Leave room for the null. */
2584 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2585 const unsigned char *class_beg;
2586
2587 PATFETCH_RAW (c);
2588 c1 = 0;
2589 class_beg = p;
2590
2591 /* If pattern is `[[:'. */
2592 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2593
2594 for (;;)
2595 {
2596 PATFETCH_RAW (c);
2597 if ((c == ':' && *p == ']') || p == pend)
2598 break;
2599 if (c1 < CHAR_CLASS_MAX_LENGTH)
2600 str[c1++] = c;
2601 else
2602 /* This is in any case an invalid class name. */
2603 str[0] = '\0';
2604 }
2605 str[c1] = '\0';
2606
2607 /* If isn't a word bracketed by `[:' and `:]':
2608 undo the ending character, the letters, and
2609 leave the leading `:' and `[' (but set bits for
2610 them). */
2611 if (c == ':' && *p == ']')
2612 {
2613 int ch;
2614 re_wctype_t cc;
2615
2616 cc = re_wctype (str);
2617
2618 if (cc == 0)
2619 FREE_STACK_RETURN (REG_ECTYPE);
2620
2621 /* Throw away the ] at the end of the character
2622 class. */
2623 PATFETCH_RAW (c);
2624
2625 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2626
2627 /* Most character classes in a multibyte match
2628 just set a flag. Exceptions are is_blank,
2629 is_digit, is_cntrl, and is_xdigit, since
2630 they can only match ASCII characters. We
2631 don't need to handle them for multibyte.
2632 They are distinguished by a negative wctype.
2633 We need this only for Emacs. */
2634 #ifdef emacs
2635 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2636 re_wctype_to_bit (cc));
2637 #endif
2638
2639 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2640 {
2641 MAKE_CHAR_MULTIBYTE (ch);
2642 ch = TRANSLATE (ch);
2643 if (IS_REAL_ASCII (ch)
2644 & re_iswctype (btowc (ch), cc))
2645 SET_LIST_BIT (ch);
2646 }
2647
2648 /* Repeat the loop. */
2649 continue;
2650 }
2651 else
2652 {
2653 /* Go back to right after the "[:". */
2654 p = class_beg;
2655 SET_LIST_BIT ('[');
2656
2657 /* Because the `:' may starts the range, we
2658 can't simply set bit and repeat the loop.
2659 Instead, just set it to C and handle below. */
2660 c = ':';
2661 }
2662 }
2663
2664 if (p < pend && p[0] == '-' && p[1] != ']')
2665 {
2666
2667 /* Discard the `-'. */
2668 PATFETCH_RAW (c1);
2669
2670 /* Fetch the character which ends the range. */
2671 PATFETCH_RAW (c1);
2672 if (c > c1)
2673 {
2674 if (syntax & RE_NO_EMPTY_RANGES)
2675 FREE_STACK_RETURN (REG_ERANGE);
2676 /* Else, repeat the loop. */
2677 }
2678 }
2679 else
2680 c1 = c;
2681 #ifndef emacs
2682 c = TRANSLATE (c);
2683 c1 = TRANSLATE (c1);
2684 #else /* not emacs */
2685 if (target_multibyte)
2686 {
2687 if (! IS_REAL_ASCII (c1))
2688 {
2689 re_wchar_t c0 = MAX (c, 128);
2690
2691 SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1);
2692 c1 = MIN (127, c1);
2693 }
2694 }
2695 else
2696 {
2697 if (multibyte)
2698 {
2699 MAKE_CHAR_UNIBYTE (c);
2700 MAKE_CHAR_UNIBYTE (c1);
2701 }
2702 }
2703 #endif /* not emacs */
2704 /* Set the range into bitmap */
2705 for (; c <= c1; c++)
2706 SET_LIST_BIT (TRANSLATE (c));
2707 }
2708
2709 /* Discard any (non)matching list bytes that are all 0 at the
2710 end of the map. Decrease the map-length byte too. */
2711 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2712 b[-1]--;
2713 b += b[-1];
2714
2715 /* Build real range table from work area. */
2716 if (RANGE_TABLE_WORK_USED (range_table_work)
2717 || RANGE_TABLE_WORK_BITS (range_table_work))
2718 {
2719 int i;
2720 int used = RANGE_TABLE_WORK_USED (range_table_work);
2721
2722 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2723 bytes for flags, two for COUNT, and three bytes for
2724 each character. */
2725 GET_BUFFER_SPACE (4 + used * 3);
2726
2727 /* Indicate the existence of range table. */
2728 laststart[1] |= 0x80;
2729
2730 /* Store the character class flag bits into the range table.
2731 If not in emacs, these flag bits are always 0. */
2732 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2733 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2734
2735 STORE_NUMBER_AND_INCR (b, used / 2);
2736 for (i = 0; i < used; i++)
2737 STORE_CHARACTER_AND_INCR
2738 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2739 }
2740 }
2741 break;
2742
2743
2744 case '(':
2745 if (syntax & RE_NO_BK_PARENS)
2746 goto handle_open;
2747 else
2748 goto normal_char;
2749
2750
2751 case ')':
2752 if (syntax & RE_NO_BK_PARENS)
2753 goto handle_close;
2754 else
2755 goto normal_char;
2756
2757
2758 case '\n':
2759 if (syntax & RE_NEWLINE_ALT)
2760 goto handle_alt;
2761 else
2762 goto normal_char;
2763
2764
2765 case '|':
2766 if (syntax & RE_NO_BK_VBAR)
2767 goto handle_alt;
2768 else
2769 goto normal_char;
2770
2771
2772 case '{':
2773 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2774 goto handle_interval;
2775 else
2776 goto normal_char;
2777
2778
2779 case '\\':
2780 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2781
2782 /* Do not translate the character after the \, so that we can
2783 distinguish, e.g., \B from \b, even if we normally would
2784 translate, e.g., B to b. */
2785 PATFETCH_RAW (c);
2786
2787 switch (c)
2788 {
2789 case '(':
2790 if (syntax & RE_NO_BK_PARENS)
2791 goto normal_backslash;
2792
2793 handle_open:
2794 {
2795 int shy = 0;
2796 if (p+1 < pend)
2797 {
2798 /* Look for a special (?...) construct */
2799 if ((syntax & RE_SHY_GROUPS) && *p == '?')
2800 {
2801 PATFETCH_RAW (c); /* Gobble up the '?'. */
2802 PATFETCH (c);
2803 switch (c)
2804 {
2805 case ':': shy = 1; break;
2806 default:
2807 /* Only (?:...) is supported right now. */
2808 FREE_STACK_RETURN (REG_BADPAT);
2809 }
2810 }
2811 }
2812
2813 if (!shy)
2814 {
2815 bufp->re_nsub++;
2816 regnum++;
2817 }
2818
2819 if (COMPILE_STACK_FULL)
2820 {
2821 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2822 compile_stack_elt_t);
2823 if (compile_stack.stack == NULL) return REG_ESPACE;
2824
2825 compile_stack.size <<= 1;
2826 }
2827
2828 /* These are the values to restore when we hit end of this
2829 group. They are all relative offsets, so that if the
2830 whole pattern moves because of realloc, they will still
2831 be valid. */
2832 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2833 COMPILE_STACK_TOP.fixup_alt_jump
2834 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2835 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2836 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2837
2838 /* Do not push a
2839 start_memory for groups beyond the last one we can
2840 represent in the compiled pattern. */
2841 if (regnum <= MAX_REGNUM && !shy)
2842 BUF_PUSH_2 (start_memory, regnum);
2843
2844 compile_stack.avail++;
2845
2846 fixup_alt_jump = 0;
2847 laststart = 0;
2848 begalt = b;
2849 /* If we've reached MAX_REGNUM groups, then this open
2850 won't actually generate any code, so we'll have to
2851 clear pending_exact explicitly. */
2852 pending_exact = 0;
2853 break;
2854 }
2855
2856 case ')':
2857 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2858
2859 if (COMPILE_STACK_EMPTY)
2860 {
2861 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2862 goto normal_backslash;
2863 else
2864 FREE_STACK_RETURN (REG_ERPAREN);
2865 }
2866
2867 handle_close:
2868 FIXUP_ALT_JUMP ();
2869
2870 /* See similar code for backslashed left paren above. */
2871 if (COMPILE_STACK_EMPTY)
2872 {
2873 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2874 goto normal_char;
2875 else
2876 FREE_STACK_RETURN (REG_ERPAREN);
2877 }
2878
2879 /* Since we just checked for an empty stack above, this
2880 ``can't happen''. */
2881 assert (compile_stack.avail != 0);
2882 {
2883 /* We don't just want to restore into `regnum', because
2884 later groups should continue to be numbered higher,
2885 as in `(ab)c(de)' -- the second group is #2. */
2886 regnum_t this_group_regnum;
2887
2888 compile_stack.avail--;
2889 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2890 fixup_alt_jump
2891 = COMPILE_STACK_TOP.fixup_alt_jump
2892 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2893 : 0;
2894 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2895 this_group_regnum = COMPILE_STACK_TOP.regnum;
2896 /* If we've reached MAX_REGNUM groups, then this open
2897 won't actually generate any code, so we'll have to
2898 clear pending_exact explicitly. */
2899 pending_exact = 0;
2900
2901 /* We're at the end of the group, so now we know how many
2902 groups were inside this one. */
2903 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2904 BUF_PUSH_2 (stop_memory, this_group_regnum);
2905 }
2906 break;
2907
2908
2909 case '|': /* `\|'. */
2910 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2911 goto normal_backslash;
2912 handle_alt:
2913 if (syntax & RE_LIMITED_OPS)
2914 goto normal_char;
2915
2916 /* Insert before the previous alternative a jump which
2917 jumps to this alternative if the former fails. */
2918 GET_BUFFER_SPACE (3);
2919 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2920 pending_exact = 0;
2921 b += 3;
2922
2923 /* The alternative before this one has a jump after it
2924 which gets executed if it gets matched. Adjust that
2925 jump so it will jump to this alternative's analogous
2926 jump (put in below, which in turn will jump to the next
2927 (if any) alternative's such jump, etc.). The last such
2928 jump jumps to the correct final destination. A picture:
2929 _____ _____
2930 | | | |
2931 | v | v
2932 a | b | c
2933
2934 If we are at `b', then fixup_alt_jump right now points to a
2935 three-byte space after `a'. We'll put in the jump, set
2936 fixup_alt_jump to right after `b', and leave behind three
2937 bytes which we'll fill in when we get to after `c'. */
2938
2939 FIXUP_ALT_JUMP ();
2940
2941 /* Mark and leave space for a jump after this alternative,
2942 to be filled in later either by next alternative or
2943 when know we're at the end of a series of alternatives. */
2944 fixup_alt_jump = b;
2945 GET_BUFFER_SPACE (3);
2946 b += 3;
2947
2948 laststart = 0;
2949 begalt = b;
2950 break;
2951
2952
2953 case '{':
2954 /* If \{ is a literal. */
2955 if (!(syntax & RE_INTERVALS)
2956 /* If we're at `\{' and it's not the open-interval
2957 operator. */
2958 || (syntax & RE_NO_BK_BRACES))
2959 goto normal_backslash;
2960
2961 handle_interval:
2962 {
2963 /* If got here, then the syntax allows intervals. */
2964
2965 /* At least (most) this many matches must be made. */
2966 int lower_bound = 0, upper_bound = -1;
2967
2968 beg_interval = p;
2969
2970 if (p == pend)
2971 FREE_STACK_RETURN (REG_EBRACE);
2972
2973 GET_UNSIGNED_NUMBER (lower_bound);
2974
2975 if (c == ',')
2976 GET_UNSIGNED_NUMBER (upper_bound);
2977 else
2978 /* Interval such as `{1}' => match exactly once. */
2979 upper_bound = lower_bound;
2980
2981 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2982 || (upper_bound >= 0 && lower_bound > upper_bound))
2983 FREE_STACK_RETURN (REG_BADBR);
2984
2985 if (!(syntax & RE_NO_BK_BRACES))
2986 {
2987 if (c != '\\')
2988 FREE_STACK_RETURN (REG_BADBR);
2989
2990 PATFETCH (c);
2991 }
2992
2993 if (c != '}')
2994 FREE_STACK_RETURN (REG_BADBR);
2995
2996 /* We just parsed a valid interval. */
2997
2998 /* If it's invalid to have no preceding re. */
2999 if (!laststart)
3000 {
3001 if (syntax & RE_CONTEXT_INVALID_OPS)
3002 FREE_STACK_RETURN (REG_BADRPT);
3003 else if (syntax & RE_CONTEXT_INDEP_OPS)
3004 laststart = b;
3005 else
3006 goto unfetch_interval;
3007 }
3008
3009 if (upper_bound == 0)
3010 /* If the upper bound is zero, just drop the sub pattern
3011 altogether. */
3012 b = laststart;
3013 else if (lower_bound == 1 && upper_bound == 1)
3014 /* Just match it once: nothing to do here. */
3015 ;
3016
3017 /* Otherwise, we have a nontrivial interval. When
3018 we're all done, the pattern will look like:
3019 set_number_at <jump count> <upper bound>
3020 set_number_at <succeed_n count> <lower bound>
3021 succeed_n <after jump addr> <succeed_n count>
3022 <body of loop>
3023 jump_n <succeed_n addr> <jump count>
3024 (The upper bound and `jump_n' are omitted if
3025 `upper_bound' is 1, though.) */
3026 else
3027 { /* If the upper bound is > 1, we need to insert
3028 more at the end of the loop. */
3029 unsigned int nbytes = (upper_bound < 0 ? 3
3030 : upper_bound > 1 ? 5 : 0);
3031 unsigned int startoffset = 0;
3032
3033 GET_BUFFER_SPACE (20); /* We might use less. */
3034
3035 if (lower_bound == 0)
3036 {
3037 /* A succeed_n that starts with 0 is really a
3038 a simple on_failure_jump_loop. */
3039 INSERT_JUMP (on_failure_jump_loop, laststart,
3040 b + 3 + nbytes);
3041 b += 3;
3042 }
3043 else
3044 {
3045 /* Initialize lower bound of the `succeed_n', even
3046 though it will be set during matching by its
3047 attendant `set_number_at' (inserted next),
3048 because `re_compile_fastmap' needs to know.
3049 Jump to the `jump_n' we might insert below. */
3050 INSERT_JUMP2 (succeed_n, laststart,
3051 b + 5 + nbytes,
3052 lower_bound);
3053 b += 5;
3054
3055 /* Code to initialize the lower bound. Insert
3056 before the `succeed_n'. The `5' is the last two
3057 bytes of this `set_number_at', plus 3 bytes of
3058 the following `succeed_n'. */
3059 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3060 b += 5;
3061 startoffset += 5;
3062 }
3063
3064 if (upper_bound < 0)
3065 {
3066 /* A negative upper bound stands for infinity,
3067 in which case it degenerates to a plain jump. */
3068 STORE_JUMP (jump, b, laststart + startoffset);
3069 b += 3;
3070 }
3071 else if (upper_bound > 1)
3072 { /* More than one repetition is allowed, so
3073 append a backward jump to the `succeed_n'
3074 that starts this interval.
3075
3076 When we've reached this during matching,
3077 we'll have matched the interval once, so
3078 jump back only `upper_bound - 1' times. */
3079 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3080 upper_bound - 1);
3081 b += 5;
3082
3083 /* The location we want to set is the second
3084 parameter of the `jump_n'; that is `b-2' as
3085 an absolute address. `laststart' will be
3086 the `set_number_at' we're about to insert;
3087 `laststart+3' the number to set, the source
3088 for the relative address. But we are
3089 inserting into the middle of the pattern --
3090 so everything is getting moved up by 5.
3091 Conclusion: (b - 2) - (laststart + 3) + 5,
3092 i.e., b - laststart.
3093
3094 We insert this at the beginning of the loop
3095 so that if we fail during matching, we'll
3096 reinitialize the bounds. */
3097 insert_op2 (set_number_at, laststart, b - laststart,
3098 upper_bound - 1, b);
3099 b += 5;
3100 }
3101 }
3102 pending_exact = 0;
3103 beg_interval = NULL;
3104 }
3105 break;
3106
3107 unfetch_interval:
3108 /* If an invalid interval, match the characters as literals. */
3109 assert (beg_interval);
3110 p = beg_interval;
3111 beg_interval = NULL;
3112
3113 /* normal_char and normal_backslash need `c'. */
3114 c = '{';
3115
3116 if (!(syntax & RE_NO_BK_BRACES))
3117 {
3118 assert (p > pattern && p[-1] == '\\');
3119 goto normal_backslash;
3120 }
3121 else
3122 goto normal_char;
3123
3124 #ifdef emacs
3125 /* There is no way to specify the before_dot and after_dot
3126 operators. rms says this is ok. --karl */
3127 case '=':
3128 BUF_PUSH (at_dot);
3129 break;
3130
3131 case 's':
3132 laststart = b;
3133 PATFETCH (c);
3134 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3135 break;
3136
3137 case 'S':
3138 laststart = b;
3139 PATFETCH (c);
3140 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3141 break;
3142
3143 case 'c':
3144 laststart = b;
3145 PATFETCH_RAW (c);
3146 BUF_PUSH_2 (categoryspec, c);
3147 break;
3148
3149 case 'C':
3150 laststart = b;
3151 PATFETCH_RAW (c);
3152 BUF_PUSH_2 (notcategoryspec, c);
3153 break;
3154 #endif /* emacs */
3155
3156
3157 case 'w':
3158 if (syntax & RE_NO_GNU_OPS)
3159 goto normal_char;
3160 laststart = b;
3161 BUF_PUSH_2 (syntaxspec, Sword);
3162 break;
3163
3164
3165 case 'W':
3166 if (syntax & RE_NO_GNU_OPS)
3167 goto normal_char;
3168 laststart = b;
3169 BUF_PUSH_2 (notsyntaxspec, Sword);
3170 break;
3171
3172
3173 case '<':
3174 if (syntax & RE_NO_GNU_OPS)
3175 goto normal_char;
3176 BUF_PUSH (wordbeg);
3177 break;
3178
3179 case '>':
3180 if (syntax & RE_NO_GNU_OPS)
3181 goto normal_char;
3182 BUF_PUSH (wordend);
3183 break;
3184
3185 case 'b':
3186 if (syntax & RE_NO_GNU_OPS)
3187 goto normal_char;
3188 BUF_PUSH (wordbound);
3189 break;
3190
3191 case 'B':
3192 if (syntax & RE_NO_GNU_OPS)
3193 goto normal_char;
3194 BUF_PUSH (notwordbound);
3195 break;
3196
3197 case '`':
3198 if (syntax & RE_NO_GNU_OPS)
3199 goto normal_char;
3200 BUF_PUSH (begbuf);
3201 break;
3202
3203 case '\'':
3204 if (syntax & RE_NO_GNU_OPS)
3205 goto normal_char;
3206 BUF_PUSH (endbuf);
3207 break;
3208
3209 case '1': case '2': case '3': case '4': case '5':
3210 case '6': case '7': case '8': case '9':
3211 {
3212 regnum_t reg;
3213
3214 if (syntax & RE_NO_BK_REFS)
3215 goto normal_backslash;
3216
3217 reg = c - '0';
3218
3219 /* Can't back reference to a subexpression before its end. */
3220 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3221 FREE_STACK_RETURN (REG_ESUBREG);
3222
3223 laststart = b;
3224 BUF_PUSH_2 (duplicate, reg);
3225 }
3226 break;
3227
3228
3229 case '+':
3230 case '?':
3231 if (syntax & RE_BK_PLUS_QM)
3232 goto handle_plus;
3233 else
3234 goto normal_backslash;
3235
3236 default:
3237 normal_backslash:
3238 /* You might think it would be useful for \ to mean
3239 not to translate; but if we don't translate it
3240 it will never match anything. */
3241 /* Actually we don't have to translate it now, because
3242 it is anyway translated later. */
3243 #if 0
3244 c = TRANSLATE (c);
3245 #endif
3246 goto normal_char;
3247 }
3248 break;
3249
3250
3251 default:
3252 /* Expects the character in `c'. */
3253 normal_char:
3254 /* If no exactn currently being built. */
3255 if (!pending_exact
3256
3257 /* If last exactn not at current position. */
3258 || pending_exact + *pending_exact + 1 != b
3259
3260 /* We have only one byte following the exactn for the count. */
3261 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3262
3263 /* If followed by a repetition operator. */
3264 || (p != pend && (*p == '*' || *p == '^'))
3265 || ((syntax & RE_BK_PLUS_QM)
3266 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3267 : p != pend && (*p == '+' || *p == '?'))
3268 || ((syntax & RE_INTERVALS)
3269 && ((syntax & RE_NO_BK_BRACES)
3270 ? p != pend && *p == '{'
3271 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3272 {
3273 /* Start building a new exactn. */
3274
3275 laststart = b;
3276
3277 BUF_PUSH_2 (exactn, 0);
3278 pending_exact = b - 1;
3279 }
3280
3281 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3282 {
3283 int len;
3284
3285 if (! multibyte)
3286 MAKE_CHAR_MULTIBYTE (c);
3287 c = TRANSLATE (c);
3288 if (target_multibyte)
3289 {
3290 len = CHAR_STRING (c, b);
3291 b += len;
3292 }
3293 else
3294 {
3295 MAKE_CHAR_UNIBYTE (c);
3296 *b++ = c;
3297 len = 1;
3298 }
3299 (*pending_exact) += len;
3300 }
3301
3302 break;
3303 } /* switch (c) */
3304 } /* while p != pend */
3305
3306
3307 /* Through the pattern now. */
3308
3309 FIXUP_ALT_JUMP ();
3310
3311 if (!COMPILE_STACK_EMPTY)
3312 FREE_STACK_RETURN (REG_EPAREN);
3313
3314 /* If we don't want backtracking, force success
3315 the first time we reach the end of the compiled pattern. */
3316 if (syntax & RE_NO_POSIX_BACKTRACKING)
3317 BUF_PUSH (succeed);
3318
3319 free (compile_stack.stack);
3320
3321 /* We have succeeded; set the length of the buffer. */
3322 bufp->used = b - bufp->buffer;
3323
3324 #ifdef emacs
3325 /* Now the buffer is adjusted for the multibyteness of a target. */
3326 bufp->multibyte = bufp->target_multibyte;
3327 #endif
3328
3329 #ifdef DEBUG
3330 if (debug > 0)
3331 {
3332 re_compile_fastmap (bufp);
3333 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3334 print_compiled_pattern (bufp);
3335 }
3336 debug--;
3337 #endif /* DEBUG */
3338
3339 #ifndef MATCH_MAY_ALLOCATE
3340 /* Initialize the failure stack to the largest possible stack. This
3341 isn't necessary unless we're trying to avoid calling alloca in
3342 the search and match routines. */
3343 {
3344 int num_regs = bufp->re_nsub + 1;
3345
3346 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3347 {
3348 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3349
3350 if (! fail_stack.stack)
3351 fail_stack.stack
3352 = (fail_stack_elt_t *) malloc (fail_stack.size
3353 * sizeof (fail_stack_elt_t));
3354 else
3355 fail_stack.stack
3356 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3357 (fail_stack.size
3358 * sizeof (fail_stack_elt_t)));
3359 }
3360
3361 regex_grow_registers (num_regs);
3362 }
3363 #endif /* not MATCH_MAY_ALLOCATE */
3364
3365 return REG_NOERROR;
3366 } /* regex_compile */
3367 \f
3368 /* Subroutines for `regex_compile'. */
3369
3370 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3371
3372 static void
3373 store_op1 (op, loc, arg)
3374 re_opcode_t op;
3375 unsigned char *loc;
3376 int arg;
3377 {
3378 *loc = (unsigned char) op;
3379 STORE_NUMBER (loc + 1, arg);
3380 }
3381
3382
3383 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3384
3385 static void
3386 store_op2 (op, loc, arg1, arg2)
3387 re_opcode_t op;
3388 unsigned char *loc;
3389 int arg1, arg2;
3390 {
3391 *loc = (unsigned char) op;
3392 STORE_NUMBER (loc + 1, arg1);
3393 STORE_NUMBER (loc + 3, arg2);
3394 }
3395
3396
3397 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3398 for OP followed by two-byte integer parameter ARG. */
3399
3400 static void
3401 insert_op1 (op, loc, arg, end)
3402 re_opcode_t op;
3403 unsigned char *loc;
3404 int arg;
3405 unsigned char *end;
3406 {
3407 register unsigned char *pfrom = end;
3408 register unsigned char *pto = end + 3;
3409
3410 while (pfrom != loc)
3411 *--pto = *--pfrom;
3412
3413 store_op1 (op, loc, arg);
3414 }
3415
3416
3417 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3418
3419 static void
3420 insert_op2 (op, loc, arg1, arg2, end)
3421 re_opcode_t op;
3422 unsigned char *loc;
3423 int arg1, arg2;
3424 unsigned char *end;
3425 {
3426 register unsigned char *pfrom = end;
3427 register unsigned char *pto = end + 5;
3428
3429 while (pfrom != loc)
3430 *--pto = *--pfrom;
3431
3432 store_op2 (op, loc, arg1, arg2);
3433 }
3434
3435
3436 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3437 after an alternative or a begin-subexpression. We assume there is at
3438 least one character before the ^. */
3439
3440 static boolean
3441 at_begline_loc_p (pattern, p, syntax)
3442 re_char *pattern, *p;
3443 reg_syntax_t syntax;
3444 {
3445 re_char *prev = p - 2;
3446 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3447
3448 return
3449 /* After a subexpression? */
3450 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3451 /* After an alternative? */
3452 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3453 /* After a shy subexpression? */
3454 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3455 && prev[-1] == '?' && prev[-2] == '('
3456 && (syntax & RE_NO_BK_PARENS
3457 || (prev - 3 >= pattern && prev[-3] == '\\')));
3458 }
3459
3460
3461 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3462 at least one character after the $, i.e., `P < PEND'. */
3463
3464 static boolean
3465 at_endline_loc_p (p, pend, syntax)
3466 re_char *p, *pend;
3467 reg_syntax_t syntax;
3468 {
3469 re_char *next = p;
3470 boolean next_backslash = *next == '\\';
3471 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3472
3473 return
3474 /* Before a subexpression? */
3475 (syntax & RE_NO_BK_PARENS ? *next == ')'
3476 : next_backslash && next_next && *next_next == ')')
3477 /* Before an alternative? */
3478 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3479 : next_backslash && next_next && *next_next == '|');
3480 }
3481
3482
3483 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3484 false if it's not. */
3485
3486 static boolean
3487 group_in_compile_stack (compile_stack, regnum)
3488 compile_stack_type compile_stack;
3489 regnum_t regnum;
3490 {
3491 int this_element;
3492
3493 for (this_element = compile_stack.avail - 1;
3494 this_element >= 0;
3495 this_element--)
3496 if (compile_stack.stack[this_element].regnum == regnum)
3497 return true;
3498
3499 return false;
3500 }
3501 \f
3502 /* analyse_first.
3503 If fastmap is non-NULL, go through the pattern and fill fastmap
3504 with all the possible leading chars. If fastmap is NULL, don't
3505 bother filling it up (obviously) and only return whether the
3506 pattern could potentially match the empty string.
3507
3508 Return 1 if p..pend might match the empty string.
3509 Return 0 if p..pend matches at least one char.
3510 Return -1 if fastmap was not updated accurately. */
3511
3512 static int
3513 analyse_first (p, pend, fastmap, multibyte)
3514 re_char *p, *pend;
3515 char *fastmap;
3516 const int multibyte;
3517 {
3518 int j, k;
3519 boolean not;
3520
3521 /* If all elements for base leading-codes in fastmap is set, this
3522 flag is set true. */
3523 boolean match_any_multibyte_characters = false;
3524
3525 assert (p);
3526
3527 /* The loop below works as follows:
3528 - It has a working-list kept in the PATTERN_STACK and which basically
3529 starts by only containing a pointer to the first operation.
3530 - If the opcode we're looking at is a match against some set of
3531 chars, then we add those chars to the fastmap and go on to the
3532 next work element from the worklist (done via `break').
3533 - If the opcode is a control operator on the other hand, we either
3534 ignore it (if it's meaningless at this point, such as `start_memory')
3535 or execute it (if it's a jump). If the jump has several destinations
3536 (i.e. `on_failure_jump'), then we push the other destination onto the
3537 worklist.
3538 We guarantee termination by ignoring backward jumps (more or less),
3539 so that `p' is monotonically increasing. More to the point, we
3540 never set `p' (or push) anything `<= p1'. */
3541
3542 while (p < pend)
3543 {
3544 /* `p1' is used as a marker of how far back a `on_failure_jump'
3545 can go without being ignored. It is normally equal to `p'
3546 (which prevents any backward `on_failure_jump') except right
3547 after a plain `jump', to allow patterns such as:
3548 0: jump 10
3549 3..9: <body>
3550 10: on_failure_jump 3
3551 as used for the *? operator. */
3552 re_char *p1 = p;
3553
3554 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3555 {
3556 case succeed:
3557 return 1;
3558 continue;
3559
3560 case duplicate:
3561 /* If the first character has to match a backreference, that means
3562 that the group was empty (since it already matched). Since this
3563 is the only case that interests us here, we can assume that the
3564 backreference must match the empty string. */
3565 p++;
3566 continue;
3567
3568
3569 /* Following are the cases which match a character. These end
3570 with `break'. */
3571
3572 case exactn:
3573 if (fastmap)
3574 /* If multibyte is nonzero, the first byte of each
3575 character is an ASCII or a leading code. Otherwise,
3576 each byte is a character. Thus, this works in both
3577 cases. */
3578 fastmap[p[1]] = 1;
3579 break;
3580
3581
3582 case anychar:
3583 /* We could put all the chars except for \n (and maybe \0)
3584 but we don't bother since it is generally not worth it. */
3585 if (!fastmap) break;
3586 return -1;
3587
3588
3589 case charset_not:
3590 if (!fastmap) break;
3591 {
3592 /* Chars beyond end of bitmap are possible matches. */
3593 /* In a multibyte case, the bitmap is used only for ASCII
3594 characters. */
3595 int limit = multibyte ? 128 : (1 << BYTEWIDTH);
3596
3597 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3598 j < limit; j++)
3599 fastmap[j] = 1;
3600 }
3601
3602 /* Fallthrough */
3603 case charset:
3604 if (!fastmap) break;
3605 not = (re_opcode_t) *(p - 1) == charset_not;
3606 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3607 j >= 0; j--)
3608 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3609 fastmap[j] = 1;
3610
3611 if ((not && multibyte)
3612 /* Any leading code can possibly start a character
3613 which doesn't match the specified set of characters. */
3614 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3615 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3616 /* If we can match a character class, we can match
3617 any multibyte characters. */
3618 {
3619 if (match_any_multibyte_characters == false)
3620 {
3621 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3622 fastmap[j] = 1;
3623 match_any_multibyte_characters = true;
3624 }
3625 }
3626
3627 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3628 && match_any_multibyte_characters == false)
3629 {
3630 /* Set fastmap[I] to 1 where I is a leading code of each
3631 multibyte characer in the range table. */
3632 int c, count;
3633 unsigned char lc1, lc2;
3634
3635 /* Make P points the range table. `+ 2' is to skip flag
3636 bits for a character class. */
3637 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
3638
3639 /* Extract the number of ranges in range table into COUNT. */
3640 EXTRACT_NUMBER_AND_INCR (count, p);
3641 for (; count > 0; count--, p += 2 * 3) /* XXX */
3642 {
3643 /* Extract the start and end of each range. */
3644 EXTRACT_CHARACTER (c, p);
3645 lc1 = CHAR_LEADING_CODE (c);
3646 p += 3;
3647 EXTRACT_CHARACTER (c, p);
3648 lc2 = CHAR_LEADING_CODE (c);
3649 for (j = lc1; j <= lc2; j++)
3650 fastmap[j] = 1;
3651 }
3652 }
3653 break;
3654
3655 case syntaxspec:
3656 case notsyntaxspec:
3657 if (!fastmap) break;
3658 #ifndef emacs
3659 not = (re_opcode_t)p[-1] == notsyntaxspec;
3660 k = *p++;
3661 for (j = 0; j < (1 << BYTEWIDTH); j++)
3662 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
3663 fastmap[j] = 1;
3664 break;
3665 #else /* emacs */
3666 /* This match depends on text properties. These end with
3667 aborting optimizations. */
3668 return -1;
3669
3670 case categoryspec:
3671 case notcategoryspec:
3672 if (!fastmap) break;
3673 not = (re_opcode_t)p[-1] == notcategoryspec;
3674 k = *p++;
3675 for (j = (multibyte ? 127 : (1 << BYTEWIDTH)); j >= 0; j--)
3676 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
3677 fastmap[j] = 1;
3678
3679 if (multibyte)
3680 {
3681 /* Any character set can possibly contain a character
3682 whose category is K (or not). */
3683 if (match_any_multibyte_characters == false)
3684 {
3685 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3686 fastmap[j] = 1;
3687 match_any_multibyte_characters = true;
3688 }
3689 }
3690 break;
3691
3692 /* All cases after this match the empty string. These end with
3693 `continue'. */
3694
3695 case before_dot:
3696 case at_dot:
3697 case after_dot:
3698 #endif /* !emacs */
3699 case no_op:
3700 case begline:
3701 case endline:
3702 case begbuf:
3703 case endbuf:
3704 case wordbound:
3705 case notwordbound:
3706 case wordbeg:
3707 case wordend:
3708 continue;
3709
3710
3711 case jump:
3712 EXTRACT_NUMBER_AND_INCR (j, p);
3713 if (j < 0)
3714 /* Backward jumps can only go back to code that we've already
3715 visited. `re_compile' should make sure this is true. */
3716 break;
3717 p += j;
3718 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3719 {
3720 case on_failure_jump:
3721 case on_failure_keep_string_jump:
3722 case on_failure_jump_loop:
3723 case on_failure_jump_nastyloop:
3724 case on_failure_jump_smart:
3725 p++;
3726 break;
3727 default:
3728 continue;
3729 };
3730 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3731 to jump back to "just after here". */
3732 /* Fallthrough */
3733
3734 case on_failure_jump:
3735 case on_failure_keep_string_jump:
3736 case on_failure_jump_nastyloop:
3737 case on_failure_jump_loop:
3738 case on_failure_jump_smart:
3739 EXTRACT_NUMBER_AND_INCR (j, p);
3740 if (p + j <= p1)
3741 ; /* Backward jump to be ignored. */
3742 else
3743 { /* We have to look down both arms.
3744 We first go down the "straight" path so as to minimize
3745 stack usage when going through alternatives. */
3746 int r = analyse_first (p, pend, fastmap, multibyte);
3747 if (r) return r;
3748 p += j;
3749 }
3750 continue;
3751
3752
3753 case jump_n:
3754 /* This code simply does not properly handle forward jump_n. */
3755 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3756 p += 4;
3757 /* jump_n can either jump or fall through. The (backward) jump
3758 case has already been handled, so we only need to look at the
3759 fallthrough case. */
3760 continue;
3761
3762 case succeed_n:
3763 /* If N == 0, it should be an on_failure_jump_loop instead. */
3764 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3765 p += 4;
3766 /* We only care about one iteration of the loop, so we don't
3767 need to consider the case where this behaves like an
3768 on_failure_jump. */
3769 continue;
3770
3771
3772 case set_number_at:
3773 p += 4;
3774 continue;
3775
3776
3777 case start_memory:
3778 case stop_memory:
3779 p += 1;
3780 continue;
3781
3782
3783 default:
3784 abort (); /* We have listed all the cases. */
3785 } /* switch *p++ */
3786
3787 /* Getting here means we have found the possible starting
3788 characters for one path of the pattern -- and that the empty
3789 string does not match. We need not follow this path further. */
3790 return 0;
3791 } /* while p */
3792
3793 /* We reached the end without matching anything. */
3794 return 1;
3795
3796 } /* analyse_first */
3797 \f
3798 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3799 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3800 characters can start a string that matches the pattern. This fastmap
3801 is used by re_search to skip quickly over impossible starting points.
3802
3803 Character codes above (1 << BYTEWIDTH) are not represented in the
3804 fastmap, but the leading codes are represented. Thus, the fastmap
3805 indicates which character sets could start a match.
3806
3807 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3808 area as BUFP->fastmap.
3809
3810 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3811 the pattern buffer.
3812
3813 Returns 0 if we succeed, -2 if an internal error. */
3814
3815 int
3816 re_compile_fastmap (bufp)
3817 struct re_pattern_buffer *bufp;
3818 {
3819 char *fastmap = bufp->fastmap;
3820 int analysis;
3821
3822 assert (fastmap && bufp->buffer);
3823
3824 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3825 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3826
3827 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3828 fastmap,
3829 #ifdef emacs
3830 /* The compiled pattern buffer is always
3831 setup for multibyte characters. */
3832 1
3833 #else
3834 0
3835 #endif
3836 );
3837 bufp->can_be_null = (analysis != 0);
3838 return 0;
3839 } /* re_compile_fastmap */
3840 \f
3841 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3842 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3843 this memory for recording register information. STARTS and ENDS
3844 must be allocated using the malloc library routine, and must each
3845 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3846
3847 If NUM_REGS == 0, then subsequent matches should allocate their own
3848 register data.
3849
3850 Unless this function is called, the first search or match using
3851 PATTERN_BUFFER will allocate its own register data, without
3852 freeing the old data. */
3853
3854 void
3855 re_set_registers (bufp, regs, num_regs, starts, ends)
3856 struct re_pattern_buffer *bufp;
3857 struct re_registers *regs;
3858 unsigned num_regs;
3859 regoff_t *starts, *ends;
3860 {
3861 if (num_regs)
3862 {
3863 bufp->regs_allocated = REGS_REALLOCATE;
3864 regs->num_regs = num_regs;
3865 regs->start = starts;
3866 regs->end = ends;
3867 }
3868 else
3869 {
3870 bufp->regs_allocated = REGS_UNALLOCATED;
3871 regs->num_regs = 0;
3872 regs->start = regs->end = (regoff_t *) 0;
3873 }
3874 }
3875 WEAK_ALIAS (__re_set_registers, re_set_registers)
3876 \f
3877 /* Searching routines. */
3878
3879 /* Like re_search_2, below, but only one string is specified, and
3880 doesn't let you say where to stop matching. */
3881
3882 int
3883 re_search (bufp, string, size, startpos, range, regs)
3884 struct re_pattern_buffer *bufp;
3885 const char *string;
3886 int size, startpos, range;
3887 struct re_registers *regs;
3888 {
3889 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3890 regs, size);
3891 }
3892 WEAK_ALIAS (__re_search, re_search)
3893
3894 /* End address of virtual concatenation of string. */
3895 #define STOP_ADDR_VSTRING(P) \
3896 (((P) >= size1 ? string2 + size2 : string1 + size1))
3897
3898 /* Address of POS in the concatenation of virtual string. */
3899 #define POS_ADDR_VSTRING(POS) \
3900 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3901
3902 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3903 virtual concatenation of STRING1 and STRING2, starting first at index
3904 STARTPOS, then at STARTPOS + 1, and so on.
3905
3906 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3907
3908 RANGE is how far to scan while trying to match. RANGE = 0 means try
3909 only at STARTPOS; in general, the last start tried is STARTPOS +
3910 RANGE.
3911
3912 In REGS, return the indices of the virtual concatenation of STRING1
3913 and STRING2 that matched the entire BUFP->buffer and its contained
3914 subexpressions.
3915
3916 Do not consider matching one past the index STOP in the virtual
3917 concatenation of STRING1 and STRING2.
3918
3919 We return either the position in the strings at which the match was
3920 found, -1 if no match, or -2 if error (such as failure
3921 stack overflow). */
3922
3923 int
3924 re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3925 struct re_pattern_buffer *bufp;
3926 const char *str1, *str2;
3927 int size1, size2;
3928 int startpos;
3929 int range;
3930 struct re_registers *regs;
3931 int stop;
3932 {
3933 int val;
3934 re_char *string1 = (re_char*) str1;
3935 re_char *string2 = (re_char*) str2;
3936 register char *fastmap = bufp->fastmap;
3937 register RE_TRANSLATE_TYPE translate = bufp->translate;
3938 int total_size = size1 + size2;
3939 int endpos = startpos + range;
3940 boolean anchored_start;
3941 /* Nonzero if BUFP is setup for multibyte characters. We are sure
3942 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
3943 const boolean multibyte = RE_MULTIBYTE_P (bufp);
3944
3945 /* Check for out-of-range STARTPOS. */
3946 if (startpos < 0 || startpos > total_size)
3947 return -1;
3948
3949 /* Fix up RANGE if it might eventually take us outside
3950 the virtual concatenation of STRING1 and STRING2.
3951 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3952 if (endpos < 0)
3953 range = 0 - startpos;
3954 else if (endpos > total_size)
3955 range = total_size - startpos;
3956
3957 /* If the search isn't to be a backwards one, don't waste time in a
3958 search for a pattern anchored at beginning of buffer. */
3959 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3960 {
3961 if (startpos > 0)
3962 return -1;
3963 else
3964 range = 0;
3965 }
3966
3967 #ifdef emacs
3968 /* In a forward search for something that starts with \=.
3969 don't keep searching past point. */
3970 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3971 {
3972 range = PT_BYTE - BEGV_BYTE - startpos;
3973 if (range < 0)
3974 return -1;
3975 }
3976 #endif /* emacs */
3977
3978 /* Update the fastmap now if not correct already. */
3979 if (fastmap && !bufp->fastmap_accurate)
3980 re_compile_fastmap (bufp);
3981
3982 /* See whether the pattern is anchored. */
3983 anchored_start = (bufp->buffer[0] == begline);
3984
3985 #ifdef emacs
3986 gl_state.object = re_match_object;
3987 {
3988 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3989
3990 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3991 }
3992 #endif
3993
3994 /* Loop through the string, looking for a place to start matching. */
3995 for (;;)
3996 {
3997 /* If the pattern is anchored,
3998 skip quickly past places we cannot match.
3999 We don't bother to treat startpos == 0 specially
4000 because that case doesn't repeat. */
4001 if (anchored_start && startpos > 0)
4002 {
4003 if (! ((startpos <= size1 ? string1[startpos - 1]
4004 : string2[startpos - size1 - 1])
4005 == '\n'))
4006 goto advance;
4007 }
4008
4009 /* If a fastmap is supplied, skip quickly over characters that
4010 cannot be the start of a match. If the pattern can match the
4011 null string, however, we don't need to skip characters; we want
4012 the first null string. */
4013 if (fastmap && startpos < total_size && !bufp->can_be_null)
4014 {
4015 register re_char *d;
4016 register re_wchar_t buf_ch;
4017
4018 d = POS_ADDR_VSTRING (startpos);
4019
4020 if (range > 0) /* Searching forwards. */
4021 {
4022 register int lim = 0;
4023 int irange = range;
4024
4025 if (startpos < size1 && startpos + range >= size1)
4026 lim = range - (size1 - startpos);
4027
4028 /* Written out as an if-else to avoid testing `translate'
4029 inside the loop. */
4030 if (RE_TRANSLATE_P (translate))
4031 {
4032 if (multibyte)
4033 while (range > lim)
4034 {
4035 int buf_charlen;
4036
4037 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4038 buf_charlen);
4039 buf_ch = RE_TRANSLATE (translate, buf_ch);
4040 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4041 break;
4042 range -= buf_charlen;
4043 d += buf_charlen;
4044 }
4045 else
4046 while (range > lim)
4047 {
4048 buf_ch = *d;
4049 #ifdef emacs
4050 MAKE_CHAR_MULTIBYTE (buf_ch);
4051 #endif
4052 buf_ch = RE_TRANSLATE (translate, buf_ch);
4053 #ifdef emacs
4054 MAKE_CHAR_UNIBYTE (buf_ch);
4055 #endif
4056 if (fastmap[buf_ch])
4057 break;
4058 d++;
4059 range--;
4060 }
4061 }
4062 else
4063 {
4064 if (multibyte)
4065 while (range > lim)
4066 {
4067 int buf_charlen;
4068
4069 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4070 buf_charlen);
4071 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4072 break;
4073 range -= buf_charlen;
4074 d += buf_charlen;
4075 }
4076 else
4077 while (range > lim && !fastmap[*d])
4078 {
4079 d++;
4080 range--;
4081 }
4082 }
4083 startpos += irange - range;
4084 }
4085 else /* Searching backwards. */
4086 {
4087 int room = (startpos >= size1
4088 ? size2 + size1 - startpos
4089 : size1 - startpos);
4090
4091 if (multibyte)
4092 {
4093 buf_ch = STRING_CHAR (d, room);
4094 buf_ch = TRANSLATE (buf_ch);
4095 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4096 goto advance;
4097 }
4098 else
4099 {
4100 if (! fastmap[TRANSLATE (*d)])
4101 goto advance;
4102 }
4103 }
4104 }
4105
4106 /* If can't match the null string, and that's all we have left, fail. */
4107 if (range >= 0 && startpos == total_size && fastmap
4108 && !bufp->can_be_null)
4109 return -1;
4110
4111 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4112 startpos, regs, stop);
4113 #ifndef REGEX_MALLOC
4114 # ifdef C_ALLOCA
4115 alloca (0);
4116 # endif
4117 #endif
4118
4119 if (val >= 0)
4120 return startpos;
4121
4122 if (val == -2)
4123 return -2;
4124
4125 advance:
4126 if (!range)
4127 break;
4128 else if (range > 0)
4129 {
4130 /* Update STARTPOS to the next character boundary. */
4131 if (multibyte)
4132 {
4133 re_char *p = POS_ADDR_VSTRING (startpos);
4134 re_char *pend = STOP_ADDR_VSTRING (startpos);
4135 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4136
4137 range -= len;
4138 if (range < 0)
4139 break;
4140 startpos += len;
4141 }
4142 else
4143 {
4144 range--;
4145 startpos++;
4146 }
4147 }
4148 else
4149 {
4150 range++;
4151 startpos--;
4152
4153 /* Update STARTPOS to the previous character boundary. */
4154 if (multibyte)
4155 {
4156 re_char *p = POS_ADDR_VSTRING (startpos);
4157 int len = 0;
4158
4159 /* Find the head of multibyte form. */
4160 while (!CHAR_HEAD_P (*p))
4161 p--, len++;
4162 range += len;
4163 if (range > 0)
4164 break;
4165 startpos -= len;
4166 }
4167 }
4168 }
4169 return -1;
4170 } /* re_search_2 */
4171 WEAK_ALIAS (__re_search_2, re_search_2)
4172 \f
4173 /* Declarations and macros for re_match_2. */
4174
4175 static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4176 register int len,
4177 RE_TRANSLATE_TYPE translate,
4178 const int multibyte));
4179
4180 /* This converts PTR, a pointer into one of the search strings `string1'
4181 and `string2' into an offset from the beginning of that string. */
4182 #define POINTER_TO_OFFSET(ptr) \
4183 (FIRST_STRING_P (ptr) \
4184 ? ((regoff_t) ((ptr) - string1)) \
4185 : ((regoff_t) ((ptr) - string2 + size1)))
4186
4187 /* Call before fetching a character with *d. This switches over to
4188 string2 if necessary.
4189 Check re_match_2_internal for a discussion of why end_match_2 might
4190 not be within string2 (but be equal to end_match_1 instead). */
4191 #define PREFETCH() \
4192 while (d == dend) \
4193 { \
4194 /* End of string2 => fail. */ \
4195 if (dend == end_match_2) \
4196 goto fail; \
4197 /* End of string1 => advance to string2. */ \
4198 d = string2; \
4199 dend = end_match_2; \
4200 }
4201
4202 /* Call before fetching a char with *d if you already checked other limits.
4203 This is meant for use in lookahead operations like wordend, etc..
4204 where we might need to look at parts of the string that might be
4205 outside of the LIMITs (i.e past `stop'). */
4206 #define PREFETCH_NOLIMIT() \
4207 if (d == end1) \
4208 { \
4209 d = string2; \
4210 dend = end_match_2; \
4211 } \
4212
4213 /* Test if at very beginning or at very end of the virtual concatenation
4214 of `string1' and `string2'. If only one string, it's `string2'. */
4215 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4216 #define AT_STRINGS_END(d) ((d) == end2)
4217
4218
4219 /* Test if D points to a character which is word-constituent. We have
4220 two special cases to check for: if past the end of string1, look at
4221 the first character in string2; and if before the beginning of
4222 string2, look at the last character in string1. */
4223 #define WORDCHAR_P(d) \
4224 (SYNTAX ((d) == end1 ? *string2 \
4225 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4226 == Sword)
4227
4228 /* Disabled due to a compiler bug -- see comment at case wordbound */
4229
4230 /* The comment at case wordbound is following one, but we don't use
4231 AT_WORD_BOUNDARY anymore to support multibyte form.
4232
4233 The DEC Alpha C compiler 3.x generates incorrect code for the
4234 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4235 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4236 macro and introducing temporary variables works around the bug. */
4237
4238 #if 0
4239 /* Test if the character before D and the one at D differ with respect
4240 to being word-constituent. */
4241 #define AT_WORD_BOUNDARY(d) \
4242 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4243 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4244 #endif
4245
4246 /* Free everything we malloc. */
4247 #ifdef MATCH_MAY_ALLOCATE
4248 # define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4249 # define FREE_VARIABLES() \
4250 do { \
4251 REGEX_FREE_STACK (fail_stack.stack); \
4252 FREE_VAR (regstart); \
4253 FREE_VAR (regend); \
4254 FREE_VAR (best_regstart); \
4255 FREE_VAR (best_regend); \
4256 } while (0)
4257 #else
4258 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4259 #endif /* not MATCH_MAY_ALLOCATE */
4260
4261 \f
4262 /* Optimization routines. */
4263
4264 /* If the operation is a match against one or more chars,
4265 return a pointer to the next operation, else return NULL. */
4266 static re_char *
4267 skip_one_char (p)
4268 re_char *p;
4269 {
4270 switch (SWITCH_ENUM_CAST (*p++))
4271 {
4272 case anychar:
4273 break;
4274
4275 case exactn:
4276 p += *p + 1;
4277 break;
4278
4279 case charset_not:
4280 case charset:
4281 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4282 {
4283 int mcnt;
4284 p = CHARSET_RANGE_TABLE (p - 1);
4285 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4286 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4287 }
4288 else
4289 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4290 break;
4291
4292 case syntaxspec:
4293 case notsyntaxspec:
4294 #ifdef emacs
4295 case categoryspec:
4296 case notcategoryspec:
4297 #endif /* emacs */
4298 p++;
4299 break;
4300
4301 default:
4302 p = NULL;
4303 }
4304 return p;
4305 }
4306
4307
4308 /* Jump over non-matching operations. */
4309 static unsigned char *
4310 skip_noops (p, pend)
4311 unsigned char *p, *pend;
4312 {
4313 int mcnt;
4314 while (p < pend)
4315 {
4316 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4317 {
4318 case start_memory:
4319 case stop_memory:
4320 p += 2; break;
4321 case no_op:
4322 p += 1; break;
4323 case jump:
4324 p += 1;
4325 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4326 p += mcnt;
4327 break;
4328 default:
4329 return p;
4330 }
4331 }
4332 assert (p == pend);
4333 return p;
4334 }
4335
4336 /* Non-zero if "p1 matches something" implies "p2 fails". */
4337 static int
4338 mutually_exclusive_p (bufp, p1, p2)
4339 struct re_pattern_buffer *bufp;
4340 unsigned char *p1, *p2;
4341 {
4342 re_opcode_t op2;
4343 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4344 unsigned char *pend = bufp->buffer + bufp->used;
4345
4346 assert (p1 >= bufp->buffer && p1 < pend
4347 && p2 >= bufp->buffer && p2 <= pend);
4348
4349 /* Skip over open/close-group commands.
4350 If what follows this loop is a ...+ construct,
4351 look at what begins its body, since we will have to
4352 match at least one of that. */
4353 p2 = skip_noops (p2, pend);
4354 /* The same skip can be done for p1, except that this function
4355 is only used in the case where p1 is a simple match operator. */
4356 /* p1 = skip_noops (p1, pend); */
4357
4358 assert (p1 >= bufp->buffer && p1 < pend
4359 && p2 >= bufp->buffer && p2 <= pend);
4360
4361 op2 = p2 == pend ? succeed : *p2;
4362
4363 switch (SWITCH_ENUM_CAST (op2))
4364 {
4365 case succeed:
4366 case endbuf:
4367 /* If we're at the end of the pattern, we can change. */
4368 if (skip_one_char (p1))
4369 {
4370 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4371 return 1;
4372 }
4373 break;
4374
4375 case endline:
4376 case exactn:
4377 {
4378 register re_wchar_t c
4379 = (re_opcode_t) *p2 == endline ? '\n'
4380 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
4381
4382 if ((re_opcode_t) *p1 == exactn)
4383 {
4384 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4385 {
4386 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4387 return 1;
4388 }
4389 }
4390
4391 else if ((re_opcode_t) *p1 == charset
4392 || (re_opcode_t) *p1 == charset_not)
4393 {
4394 int not = (re_opcode_t) *p1 == charset_not;
4395
4396 /* Test if C is listed in charset (or charset_not)
4397 at `p1'. */
4398 if (! multibyte || IS_REAL_ASCII (c))
4399 {
4400 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4401 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4402 not = !not;
4403 }
4404 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4405 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4406
4407 /* `not' is equal to 1 if c would match, which means
4408 that we can't change to pop_failure_jump. */
4409 if (!not)
4410 {
4411 DEBUG_PRINT1 (" No match => fast loop.\n");
4412 return 1;
4413 }
4414 }
4415 else if ((re_opcode_t) *p1 == anychar
4416 && c == '\n')
4417 {
4418 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4419 return 1;
4420 }
4421 }
4422 break;
4423
4424 case charset:
4425 {
4426 if ((re_opcode_t) *p1 == exactn)
4427 /* Reuse the code above. */
4428 return mutually_exclusive_p (bufp, p2, p1);
4429
4430 /* It is hard to list up all the character in charset
4431 P2 if it includes multibyte character. Give up in
4432 such case. */
4433 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4434 {
4435 /* Now, we are sure that P2 has no range table.
4436 So, for the size of bitmap in P2, `p2[1]' is
4437 enough. But P1 may have range table, so the
4438 size of bitmap table of P1 is extracted by
4439 using macro `CHARSET_BITMAP_SIZE'.
4440
4441 In a multibyte case, we know that all the character
4442 listed in P2 is ASCII. In a unibyte case, P1 has only a
4443 bitmap table. So, in both cases, it is enough to test
4444 only the bitmap table of P1. */
4445
4446 if ((re_opcode_t) *p1 == charset)
4447 {
4448 int idx;
4449 /* We win if the charset inside the loop
4450 has no overlap with the one after the loop. */
4451 for (idx = 0;
4452 (idx < (int) p2[1]
4453 && idx < CHARSET_BITMAP_SIZE (p1));
4454 idx++)
4455 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4456 break;
4457
4458 if (idx == p2[1]
4459 || idx == CHARSET_BITMAP_SIZE (p1))
4460 {
4461 DEBUG_PRINT1 (" No match => fast loop.\n");
4462 return 1;
4463 }
4464 }
4465 else if ((re_opcode_t) *p1 == charset_not)
4466 {
4467 int idx;
4468 /* We win if the charset_not inside the loop lists
4469 every character listed in the charset after. */
4470 for (idx = 0; idx < (int) p2[1]; idx++)
4471 if (! (p2[2 + idx] == 0
4472 || (idx < CHARSET_BITMAP_SIZE (p1)
4473 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4474 break;
4475
4476 if (idx == p2[1])
4477 {
4478 DEBUG_PRINT1 (" No match => fast loop.\n");
4479 return 1;
4480 }
4481 }
4482 }
4483 }
4484 break;
4485
4486 case charset_not:
4487 switch (SWITCH_ENUM_CAST (*p1))
4488 {
4489 case exactn:
4490 case charset:
4491 /* Reuse the code above. */
4492 return mutually_exclusive_p (bufp, p2, p1);
4493 case charset_not:
4494 /* When we have two charset_not, it's very unlikely that
4495 they don't overlap. The union of the two sets of excluded
4496 chars should cover all possible chars, which, as a matter of
4497 fact, is virtually impossible in multibyte buffers. */
4498 ;
4499 }
4500 break;
4501
4502 case wordend:
4503 case notsyntaxspec:
4504 return ((re_opcode_t) *p1 == syntaxspec
4505 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4506
4507 case wordbeg:
4508 case syntaxspec:
4509 return ((re_opcode_t) *p1 == notsyntaxspec
4510 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4511
4512 case wordbound:
4513 return (((re_opcode_t) *p1 == notsyntaxspec
4514 || (re_opcode_t) *p1 == syntaxspec)
4515 && p1[1] == Sword);
4516
4517 #ifdef emacs
4518 case categoryspec:
4519 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4520 case notcategoryspec:
4521 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4522 #endif /* emacs */
4523
4524 default:
4525 ;
4526 }
4527
4528 /* Safe default. */
4529 return 0;
4530 }
4531
4532 \f
4533 /* Matching routines. */
4534
4535 #ifndef emacs /* Emacs never uses this. */
4536 /* re_match is like re_match_2 except it takes only a single string. */
4537
4538 int
4539 re_match (bufp, string, size, pos, regs)
4540 struct re_pattern_buffer *bufp;
4541 const char *string;
4542 int size, pos;
4543 struct re_registers *regs;
4544 {
4545 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
4546 pos, regs, size);
4547 # if defined C_ALLOCA && !defined REGEX_MALLOC
4548 alloca (0);
4549 # endif
4550 return result;
4551 }
4552 WEAK_ALIAS (__re_match, re_match)
4553 #endif /* not emacs */
4554
4555 #ifdef emacs
4556 /* In Emacs, this is the string or buffer in which we
4557 are matching. It is used for looking up syntax properties. */
4558 Lisp_Object re_match_object;
4559 #endif
4560
4561 /* re_match_2 matches the compiled pattern in BUFP against the
4562 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4563 and SIZE2, respectively). We start matching at POS, and stop
4564 matching at STOP.
4565
4566 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4567 store offsets for the substring each group matched in REGS. See the
4568 documentation for exactly how many groups we fill.
4569
4570 We return -1 if no match, -2 if an internal error (such as the
4571 failure stack overflowing). Otherwise, we return the length of the
4572 matched substring. */
4573
4574 int
4575 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4576 struct re_pattern_buffer *bufp;
4577 const char *string1, *string2;
4578 int size1, size2;
4579 int pos;
4580 struct re_registers *regs;
4581 int stop;
4582 {
4583 int result;
4584
4585 #ifdef emacs
4586 int charpos;
4587 gl_state.object = re_match_object;
4588 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4589 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4590 #endif
4591
4592 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4593 (re_char*) string2, size2,
4594 pos, regs, stop);
4595 #if defined C_ALLOCA && !defined REGEX_MALLOC
4596 alloca (0);
4597 #endif
4598 return result;
4599 }
4600 WEAK_ALIAS (__re_match_2, re_match_2)
4601
4602 #ifdef emacs
4603 #define TRANSLATE_VIA_MULTIBYTE(c) \
4604 do { \
4605 if (multibyte) \
4606 (c) = TRANSLATE (c); \
4607 else \
4608 { \
4609 MAKE_CHAR_MULTIBYTE (c); \
4610 (c) = TRANSLATE (c); \
4611 MAKE_CHAR_UNIBYTE (c); \
4612 } \
4613 } while (0)
4614
4615 #else
4616 #define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c))
4617 #endif
4618
4619
4620 /* This is a separate function so that we can force an alloca cleanup
4621 afterwards. */
4622 static int
4623 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4624 struct re_pattern_buffer *bufp;
4625 re_char *string1, *string2;
4626 int size1, size2;
4627 int pos;
4628 struct re_registers *regs;
4629 int stop;
4630 {
4631 /* General temporaries. */
4632 int mcnt;
4633 size_t reg;
4634 boolean not;
4635
4636 /* Just past the end of the corresponding string. */
4637 re_char *end1, *end2;
4638
4639 /* Pointers into string1 and string2, just past the last characters in
4640 each to consider matching. */
4641 re_char *end_match_1, *end_match_2;
4642
4643 /* Where we are in the data, and the end of the current string. */
4644 re_char *d, *dend;
4645
4646 /* Used sometimes to remember where we were before starting matching
4647 an operator so that we can go back in case of failure. This "atomic"
4648 behavior of matching opcodes is indispensable to the correctness
4649 of the on_failure_keep_string_jump optimization. */
4650 re_char *dfail;
4651
4652 /* Where we are in the pattern, and the end of the pattern. */
4653 re_char *p = bufp->buffer;
4654 re_char *pend = p + bufp->used;
4655
4656 /* We use this to map every character in the string. */
4657 RE_TRANSLATE_TYPE translate = bufp->translate;
4658
4659 /* Nonzero if BUFP is setup for multibyte characters. We are sure
4660 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
4661 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4662
4663 /* Failure point stack. Each place that can handle a failure further
4664 down the line pushes a failure point on this stack. It consists of
4665 regstart, and regend for all registers corresponding to
4666 the subexpressions we're currently inside, plus the number of such
4667 registers, and, finally, two char *'s. The first char * is where
4668 to resume scanning the pattern; the second one is where to resume
4669 scanning the strings. */
4670 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4671 fail_stack_type fail_stack;
4672 #endif
4673 #ifdef DEBUG
4674 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4675 #endif
4676
4677 #if defined REL_ALLOC && defined REGEX_MALLOC
4678 /* This holds the pointer to the failure stack, when
4679 it is allocated relocatably. */
4680 fail_stack_elt_t *failure_stack_ptr;
4681 #endif
4682
4683 /* We fill all the registers internally, independent of what we
4684 return, for use in backreferences. The number here includes
4685 an element for register zero. */
4686 size_t num_regs = bufp->re_nsub + 1;
4687
4688 /* Information on the contents of registers. These are pointers into
4689 the input strings; they record just what was matched (on this
4690 attempt) by a subexpression part of the pattern, that is, the
4691 regnum-th regstart pointer points to where in the pattern we began
4692 matching and the regnum-th regend points to right after where we
4693 stopped matching the regnum-th subexpression. (The zeroth register
4694 keeps track of what the whole pattern matches.) */
4695 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4696 re_char **regstart, **regend;
4697 #endif
4698
4699 /* The following record the register info as found in the above
4700 variables when we find a match better than any we've seen before.
4701 This happens as we backtrack through the failure points, which in
4702 turn happens only if we have not yet matched the entire string. */
4703 unsigned best_regs_set = false;
4704 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4705 re_char **best_regstart, **best_regend;
4706 #endif
4707
4708 /* Logically, this is `best_regend[0]'. But we don't want to have to
4709 allocate space for that if we're not allocating space for anything
4710 else (see below). Also, we never need info about register 0 for
4711 any of the other register vectors, and it seems rather a kludge to
4712 treat `best_regend' differently than the rest. So we keep track of
4713 the end of the best match so far in a separate variable. We
4714 initialize this to NULL so that when we backtrack the first time
4715 and need to test it, it's not garbage. */
4716 re_char *match_end = NULL;
4717
4718 #ifdef DEBUG
4719 /* Counts the total number of registers pushed. */
4720 unsigned num_regs_pushed = 0;
4721 #endif
4722
4723 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4724
4725 INIT_FAIL_STACK ();
4726
4727 #ifdef MATCH_MAY_ALLOCATE
4728 /* Do not bother to initialize all the register variables if there are
4729 no groups in the pattern, as it takes a fair amount of time. If
4730 there are groups, we include space for register 0 (the whole
4731 pattern), even though we never use it, since it simplifies the
4732 array indexing. We should fix this. */
4733 if (bufp->re_nsub)
4734 {
4735 regstart = REGEX_TALLOC (num_regs, re_char *);
4736 regend = REGEX_TALLOC (num_regs, re_char *);
4737 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4738 best_regend = REGEX_TALLOC (num_regs, re_char *);
4739
4740 if (!(regstart && regend && best_regstart && best_regend))
4741 {
4742 FREE_VARIABLES ();
4743 return -2;
4744 }
4745 }
4746 else
4747 {
4748 /* We must initialize all our variables to NULL, so that
4749 `FREE_VARIABLES' doesn't try to free them. */
4750 regstart = regend = best_regstart = best_regend = NULL;
4751 }
4752 #endif /* MATCH_MAY_ALLOCATE */
4753
4754 /* The starting position is bogus. */
4755 if (pos < 0 || pos > size1 + size2)
4756 {
4757 FREE_VARIABLES ();
4758 return -1;
4759 }
4760
4761 /* Initialize subexpression text positions to -1 to mark ones that no
4762 start_memory/stop_memory has been seen for. Also initialize the
4763 register information struct. */
4764 for (reg = 1; reg < num_regs; reg++)
4765 regstart[reg] = regend[reg] = NULL;
4766
4767 /* We move `string1' into `string2' if the latter's empty -- but not if
4768 `string1' is null. */
4769 if (size2 == 0 && string1 != NULL)
4770 {
4771 string2 = string1;
4772 size2 = size1;
4773 string1 = 0;
4774 size1 = 0;
4775 }
4776 end1 = string1 + size1;
4777 end2 = string2 + size2;
4778
4779 /* `p' scans through the pattern as `d' scans through the data.
4780 `dend' is the end of the input string that `d' points within. `d'
4781 is advanced into the following input string whenever necessary, but
4782 this happens before fetching; therefore, at the beginning of the
4783 loop, `d' can be pointing at the end of a string, but it cannot
4784 equal `string2'. */
4785 if (pos >= size1)
4786 {
4787 /* Only match within string2. */
4788 d = string2 + pos - size1;
4789 dend = end_match_2 = string2 + stop - size1;
4790 end_match_1 = end1; /* Just to give it a value. */
4791 }
4792 else
4793 {
4794 if (stop < size1)
4795 {
4796 /* Only match within string1. */
4797 end_match_1 = string1 + stop;
4798 /* BEWARE!
4799 When we reach end_match_1, PREFETCH normally switches to string2.
4800 But in the present case, this means that just doing a PREFETCH
4801 makes us jump from `stop' to `gap' within the string.
4802 What we really want here is for the search to stop as
4803 soon as we hit end_match_1. That's why we set end_match_2
4804 to end_match_1 (since PREFETCH fails as soon as we hit
4805 end_match_2). */
4806 end_match_2 = end_match_1;
4807 }
4808 else
4809 { /* It's important to use this code when stop == size so that
4810 moving `d' from end1 to string2 will not prevent the d == dend
4811 check from catching the end of string. */
4812 end_match_1 = end1;
4813 end_match_2 = string2 + stop - size1;
4814 }
4815 d = string1 + pos;
4816 dend = end_match_1;
4817 }
4818
4819 DEBUG_PRINT1 ("The compiled pattern is: ");
4820 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4821 DEBUG_PRINT1 ("The string to match is: `");
4822 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4823 DEBUG_PRINT1 ("'\n");
4824
4825 /* This loops over pattern commands. It exits by returning from the
4826 function if the match is complete, or it drops through if the match
4827 fails at this starting point in the input data. */
4828 for (;;)
4829 {
4830 DEBUG_PRINT2 ("\n%p: ", p);
4831
4832 if (p == pend)
4833 { /* End of pattern means we might have succeeded. */
4834 DEBUG_PRINT1 ("end of pattern ... ");
4835
4836 /* If we haven't matched the entire string, and we want the
4837 longest match, try backtracking. */
4838 if (d != end_match_2)
4839 {
4840 /* 1 if this match ends in the same string (string1 or string2)
4841 as the best previous match. */
4842 boolean same_str_p = (FIRST_STRING_P (match_end)
4843 == FIRST_STRING_P (d));
4844 /* 1 if this match is the best seen so far. */
4845 boolean best_match_p;
4846
4847 /* AIX compiler got confused when this was combined
4848 with the previous declaration. */
4849 if (same_str_p)
4850 best_match_p = d > match_end;
4851 else
4852 best_match_p = !FIRST_STRING_P (d);
4853
4854 DEBUG_PRINT1 ("backtracking.\n");
4855
4856 if (!FAIL_STACK_EMPTY ())
4857 { /* More failure points to try. */
4858
4859 /* If exceeds best match so far, save it. */
4860 if (!best_regs_set || best_match_p)
4861 {
4862 best_regs_set = true;
4863 match_end = d;
4864
4865 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4866
4867 for (reg = 1; reg < num_regs; reg++)
4868 {
4869 best_regstart[reg] = regstart[reg];
4870 best_regend[reg] = regend[reg];
4871 }
4872 }
4873 goto fail;
4874 }
4875
4876 /* If no failure points, don't restore garbage. And if
4877 last match is real best match, don't restore second
4878 best one. */
4879 else if (best_regs_set && !best_match_p)
4880 {
4881 restore_best_regs:
4882 /* Restore best match. It may happen that `dend ==
4883 end_match_1' while the restored d is in string2.
4884 For example, the pattern `x.*y.*z' against the
4885 strings `x-' and `y-z-', if the two strings are
4886 not consecutive in memory. */
4887 DEBUG_PRINT1 ("Restoring best registers.\n");
4888
4889 d = match_end;
4890 dend = ((d >= string1 && d <= end1)
4891 ? end_match_1 : end_match_2);
4892
4893 for (reg = 1; reg < num_regs; reg++)
4894 {
4895 regstart[reg] = best_regstart[reg];
4896 regend[reg] = best_regend[reg];
4897 }
4898 }
4899 } /* d != end_match_2 */
4900
4901 succeed_label:
4902 DEBUG_PRINT1 ("Accepting match.\n");
4903
4904 /* If caller wants register contents data back, do it. */
4905 if (regs && !bufp->no_sub)
4906 {
4907 /* Have the register data arrays been allocated? */
4908 if (bufp->regs_allocated == REGS_UNALLOCATED)
4909 { /* No. So allocate them with malloc. We need one
4910 extra element beyond `num_regs' for the `-1' marker
4911 GNU code uses. */
4912 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4913 regs->start = TALLOC (regs->num_regs, regoff_t);
4914 regs->end = TALLOC (regs->num_regs, regoff_t);
4915 if (regs->start == NULL || regs->end == NULL)
4916 {
4917 FREE_VARIABLES ();
4918 return -2;
4919 }
4920 bufp->regs_allocated = REGS_REALLOCATE;
4921 }
4922 else if (bufp->regs_allocated == REGS_REALLOCATE)
4923 { /* Yes. If we need more elements than were already
4924 allocated, reallocate them. If we need fewer, just
4925 leave it alone. */
4926 if (regs->num_regs < num_regs + 1)
4927 {
4928 regs->num_regs = num_regs + 1;
4929 RETALLOC (regs->start, regs->num_regs, regoff_t);
4930 RETALLOC (regs->end, regs->num_regs, regoff_t);
4931 if (regs->start == NULL || regs->end == NULL)
4932 {
4933 FREE_VARIABLES ();
4934 return -2;
4935 }
4936 }
4937 }
4938 else
4939 {
4940 /* These braces fend off a "empty body in an else-statement"
4941 warning under GCC when assert expands to nothing. */
4942 assert (bufp->regs_allocated == REGS_FIXED);
4943 }
4944
4945 /* Convert the pointer data in `regstart' and `regend' to
4946 indices. Register zero has to be set differently,
4947 since we haven't kept track of any info for it. */
4948 if (regs->num_regs > 0)
4949 {
4950 regs->start[0] = pos;
4951 regs->end[0] = POINTER_TO_OFFSET (d);
4952 }
4953
4954 /* Go through the first `min (num_regs, regs->num_regs)'
4955 registers, since that is all we initialized. */
4956 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
4957 {
4958 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
4959 regs->start[reg] = regs->end[reg] = -1;
4960 else
4961 {
4962 regs->start[reg]
4963 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
4964 regs->end[reg]
4965 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
4966 }
4967 }
4968
4969 /* If the regs structure we return has more elements than
4970 were in the pattern, set the extra elements to -1. If
4971 we (re)allocated the registers, this is the case,
4972 because we always allocate enough to have at least one
4973 -1 at the end. */
4974 for (reg = num_regs; reg < regs->num_regs; reg++)
4975 regs->start[reg] = regs->end[reg] = -1;
4976 } /* regs && !bufp->no_sub */
4977
4978 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4979 nfailure_points_pushed, nfailure_points_popped,
4980 nfailure_points_pushed - nfailure_points_popped);
4981 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4982
4983 mcnt = POINTER_TO_OFFSET (d) - pos;
4984
4985 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4986
4987 FREE_VARIABLES ();
4988 return mcnt;
4989 }
4990
4991 /* Otherwise match next pattern command. */
4992 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4993 {
4994 /* Ignore these. Used to ignore the n of succeed_n's which
4995 currently have n == 0. */
4996 case no_op:
4997 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4998 break;
4999
5000 case succeed:
5001 DEBUG_PRINT1 ("EXECUTING succeed.\n");
5002 goto succeed_label;
5003
5004 /* Match the next n pattern characters exactly. The following
5005 byte in the pattern defines n, and the n bytes after that
5006 are the characters to match. */
5007 case exactn:
5008 mcnt = *p++;
5009 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
5010
5011 /* Remember the start point to rollback upon failure. */
5012 dfail = d;
5013
5014 #ifndef emacs
5015 /* This is written out as an if-else so we don't waste time
5016 testing `translate' inside the loop. */
5017 if (RE_TRANSLATE_P (translate))
5018 do
5019 {
5020 PREFETCH ();
5021 if (RE_TRANSLATE (translate, *d) != *p++)
5022 {
5023 d = dfail;
5024 goto fail;
5025 }
5026 d++;
5027 }
5028 while (--mcnt);
5029 else
5030 do
5031 {
5032 PREFETCH ();
5033 if (*d++ != *p++)
5034 {
5035 d = dfail;
5036 goto fail;
5037 }
5038 }
5039 while (--mcnt);
5040 #else /* emacs */
5041 /* The cost of testing `translate' is comparatively small. */
5042 if (multibyte)
5043 do
5044 {
5045 int pat_charlen, buf_charlen;
5046 unsigned int pat_ch, buf_ch;
5047
5048 PREFETCH ();
5049 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5050 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5051
5052 if (TRANSLATE (buf_ch) != pat_ch)
5053 {
5054 d = dfail;
5055 goto fail;
5056 }
5057
5058 p += pat_charlen;
5059 d += buf_charlen;
5060 mcnt -= pat_charlen;
5061 }
5062 while (mcnt > 0);
5063 else
5064 do
5065 {
5066 unsigned int buf_ch;
5067
5068 PREFETCH ();
5069 buf_ch = *d++;
5070 TRANSLATE_VIA_MULTIBYTE (buf_ch);
5071 if (buf_ch != *p++)
5072 {
5073 d = dfail;
5074 goto fail;
5075 }
5076 }
5077 while (--mcnt);
5078 #endif
5079 break;
5080
5081 /* Match any character except possibly a newline or a null. */
5082 case anychar:
5083 {
5084 int buf_charlen;
5085 re_wchar_t buf_ch;
5086
5087 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5088
5089 PREFETCH ();
5090 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5091
5092 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5093 && buf_ch == '\n')
5094 || ((bufp->syntax & RE_DOT_NOT_NULL)
5095 && buf_ch == '\000'))
5096 goto fail;
5097
5098 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5099 d += buf_charlen;
5100 }
5101 break;
5102
5103
5104 case charset:
5105 case charset_not:
5106 {
5107 register unsigned int c;
5108 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5109 int len;
5110
5111 /* Start of actual range_table, or end of bitmap if there is no
5112 range table. */
5113 re_char *range_table;
5114
5115 /* Nonzero if there is a range table. */
5116 int range_table_exists;
5117
5118 /* Number of ranges of range table. This is not included
5119 in the initial byte-length of the command. */
5120 int count = 0;
5121
5122 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
5123
5124 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5125
5126 if (range_table_exists)
5127 {
5128 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5129 EXTRACT_NUMBER_AND_INCR (count, range_table);
5130 }
5131
5132 PREFETCH ();
5133 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5134 TRANSLATE_VIA_MULTIBYTE (c); /* The character to match. */
5135
5136 if (! multibyte || IS_REAL_ASCII (c))
5137 { /* Lookup bitmap. */
5138 /* Cast to `unsigned' instead of `unsigned char' in
5139 case the bit list is a full 32 bytes long. */
5140 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5141 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5142 not = !not;
5143 }
5144 #ifdef emacs
5145 else if (range_table_exists)
5146 {
5147 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5148
5149 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5150 | (class_bits & BIT_MULTIBYTE)
5151 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5152 | (class_bits & BIT_SPACE && ISSPACE (c))
5153 | (class_bits & BIT_UPPER && ISUPPER (c))
5154 | (class_bits & BIT_WORD && ISWORD (c)))
5155 not = !not;
5156 else
5157 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5158 }
5159 #endif /* emacs */
5160
5161 if (range_table_exists)
5162 p = CHARSET_RANGE_TABLE_END (range_table, count);
5163 else
5164 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5165
5166 if (!not) goto fail;
5167
5168 d += len;
5169 break;
5170 }
5171
5172
5173 /* The beginning of a group is represented by start_memory.
5174 The argument is the register number. The text
5175 matched within the group is recorded (in the internal
5176 registers data structure) under the register number. */
5177 case start_memory:
5178 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5179
5180 /* In case we need to undo this operation (via backtracking). */
5181 PUSH_FAILURE_REG ((unsigned int)*p);
5182
5183 regstart[*p] = d;
5184 regend[*p] = NULL; /* probably unnecessary. -sm */
5185 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5186
5187 /* Move past the register number and inner group count. */
5188 p += 1;
5189 break;
5190
5191
5192 /* The stop_memory opcode represents the end of a group. Its
5193 argument is the same as start_memory's: the register number. */
5194 case stop_memory:
5195 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5196
5197 assert (!REG_UNSET (regstart[*p]));
5198 /* Strictly speaking, there should be code such as:
5199
5200 assert (REG_UNSET (regend[*p]));
5201 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5202
5203 But the only info to be pushed is regend[*p] and it is known to
5204 be UNSET, so there really isn't anything to push.
5205 Not pushing anything, on the other hand deprives us from the
5206 guarantee that regend[*p] is UNSET since undoing this operation
5207 will not reset its value properly. This is not important since
5208 the value will only be read on the next start_memory or at
5209 the very end and both events can only happen if this stop_memory
5210 is *not* undone. */
5211
5212 regend[*p] = d;
5213 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5214
5215 /* Move past the register number and the inner group count. */
5216 p += 1;
5217 break;
5218
5219
5220 /* \<digit> has been turned into a `duplicate' command which is
5221 followed by the numeric value of <digit> as the register number. */
5222 case duplicate:
5223 {
5224 register re_char *d2, *dend2;
5225 int regno = *p++; /* Get which register to match against. */
5226 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5227
5228 /* Can't back reference a group which we've never matched. */
5229 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5230 goto fail;
5231
5232 /* Where in input to try to start matching. */
5233 d2 = regstart[regno];
5234
5235 /* Remember the start point to rollback upon failure. */
5236 dfail = d;
5237
5238 /* Where to stop matching; if both the place to start and
5239 the place to stop matching are in the same string, then
5240 set to the place to stop, otherwise, for now have to use
5241 the end of the first string. */
5242
5243 dend2 = ((FIRST_STRING_P (regstart[regno])
5244 == FIRST_STRING_P (regend[regno]))
5245 ? regend[regno] : end_match_1);
5246 for (;;)
5247 {
5248 /* If necessary, advance to next segment in register
5249 contents. */
5250 while (d2 == dend2)
5251 {
5252 if (dend2 == end_match_2) break;
5253 if (dend2 == regend[regno]) break;
5254
5255 /* End of string1 => advance to string2. */
5256 d2 = string2;
5257 dend2 = regend[regno];
5258 }
5259 /* At end of register contents => success */
5260 if (d2 == dend2) break;
5261
5262 /* If necessary, advance to next segment in data. */
5263 PREFETCH ();
5264
5265 /* How many characters left in this segment to match. */
5266 mcnt = dend - d;
5267
5268 /* Want how many consecutive characters we can match in
5269 one shot, so, if necessary, adjust the count. */
5270 if (mcnt > dend2 - d2)
5271 mcnt = dend2 - d2;
5272
5273 /* Compare that many; failure if mismatch, else move
5274 past them. */
5275 if (RE_TRANSLATE_P (translate)
5276 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
5277 : memcmp (d, d2, mcnt))
5278 {
5279 d = dfail;
5280 goto fail;
5281 }
5282 d += mcnt, d2 += mcnt;
5283 }
5284 }
5285 break;
5286
5287
5288 /* begline matches the empty string at the beginning of the string
5289 (unless `not_bol' is set in `bufp'), and after newlines. */
5290 case begline:
5291 DEBUG_PRINT1 ("EXECUTING begline.\n");
5292
5293 if (AT_STRINGS_BEG (d))
5294 {
5295 if (!bufp->not_bol) break;
5296 }
5297 else
5298 {
5299 unsigned c;
5300 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5301 if (c == '\n')
5302 break;
5303 }
5304 /* In all other cases, we fail. */
5305 goto fail;
5306
5307
5308 /* endline is the dual of begline. */
5309 case endline:
5310 DEBUG_PRINT1 ("EXECUTING endline.\n");
5311
5312 if (AT_STRINGS_END (d))
5313 {
5314 if (!bufp->not_eol) break;
5315 }
5316 else
5317 {
5318 PREFETCH_NOLIMIT ();
5319 if (*d == '\n')
5320 break;
5321 }
5322 goto fail;
5323
5324
5325 /* Match at the very beginning of the data. */
5326 case begbuf:
5327 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5328 if (AT_STRINGS_BEG (d))
5329 break;
5330 goto fail;
5331
5332
5333 /* Match at the very end of the data. */
5334 case endbuf:
5335 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5336 if (AT_STRINGS_END (d))
5337 break;
5338 goto fail;
5339
5340
5341 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5342 pushes NULL as the value for the string on the stack. Then
5343 `POP_FAILURE_POINT' will keep the current value for the
5344 string, instead of restoring it. To see why, consider
5345 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5346 then the . fails against the \n. But the next thing we want
5347 to do is match the \n against the \n; if we restored the
5348 string value, we would be back at the foo.
5349
5350 Because this is used only in specific cases, we don't need to
5351 check all the things that `on_failure_jump' does, to make
5352 sure the right things get saved on the stack. Hence we don't
5353 share its code. The only reason to push anything on the
5354 stack at all is that otherwise we would have to change
5355 `anychar's code to do something besides goto fail in this
5356 case; that seems worse than this. */
5357 case on_failure_keep_string_jump:
5358 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5359 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5360 mcnt, p + mcnt);
5361
5362 PUSH_FAILURE_POINT (p - 3, NULL);
5363 break;
5364
5365 /* A nasty loop is introduced by the non-greedy *? and +?.
5366 With such loops, the stack only ever contains one failure point
5367 at a time, so that a plain on_failure_jump_loop kind of
5368 cycle detection cannot work. Worse yet, such a detection
5369 can not only fail to detect a cycle, but it can also wrongly
5370 detect a cycle (between different instantiations of the same
5371 loop.
5372 So the method used for those nasty loops is a little different:
5373 We use a special cycle-detection-stack-frame which is pushed
5374 when the on_failure_jump_nastyloop failure-point is *popped*.
5375 This special frame thus marks the beginning of one iteration
5376 through the loop and we can hence easily check right here
5377 whether something matched between the beginning and the end of
5378 the loop. */
5379 case on_failure_jump_nastyloop:
5380 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5381 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5382 mcnt, p + mcnt);
5383
5384 assert ((re_opcode_t)p[-4] == no_op);
5385 CHECK_INFINITE_LOOP (p - 4, d);
5386 PUSH_FAILURE_POINT (p - 3, d);
5387 break;
5388
5389
5390 /* Simple loop detecting on_failure_jump: just check on the
5391 failure stack if the same spot was already hit earlier. */
5392 case on_failure_jump_loop:
5393 on_failure:
5394 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5395 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5396 mcnt, p + mcnt);
5397
5398 CHECK_INFINITE_LOOP (p - 3, d);
5399 PUSH_FAILURE_POINT (p - 3, d);
5400 break;
5401
5402
5403 /* Uses of on_failure_jump:
5404
5405 Each alternative starts with an on_failure_jump that points
5406 to the beginning of the next alternative. Each alternative
5407 except the last ends with a jump that in effect jumps past
5408 the rest of the alternatives. (They really jump to the
5409 ending jump of the following alternative, because tensioning
5410 these jumps is a hassle.)
5411
5412 Repeats start with an on_failure_jump that points past both
5413 the repetition text and either the following jump or
5414 pop_failure_jump back to this on_failure_jump. */
5415 case on_failure_jump:
5416 IMMEDIATE_QUIT_CHECK;
5417 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5418 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5419 mcnt, p + mcnt);
5420
5421 PUSH_FAILURE_POINT (p -3, d);
5422 break;
5423
5424 /* This operation is used for greedy *.
5425 Compare the beginning of the repeat with what in the
5426 pattern follows its end. If we can establish that there
5427 is nothing that they would both match, i.e., that we
5428 would have to backtrack because of (as in, e.g., `a*a')
5429 then we can use a non-backtracking loop based on
5430 on_failure_keep_string_jump instead of on_failure_jump. */
5431 case on_failure_jump_smart:
5432 IMMEDIATE_QUIT_CHECK;
5433 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5434 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5435 mcnt, p + mcnt);
5436 {
5437 re_char *p1 = p; /* Next operation. */
5438 /* Here, we discard `const', making re_match non-reentrant. */
5439 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5440 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
5441
5442 p -= 3; /* Reset so that we will re-execute the
5443 instruction once it's been changed. */
5444
5445 EXTRACT_NUMBER (mcnt, p2 - 2);
5446
5447 /* Ensure this is a indeed the trivial kind of loop
5448 we are expecting. */
5449 assert (skip_one_char (p1) == p2 - 3);
5450 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5451 DEBUG_STATEMENT (debug += 2);
5452 if (mutually_exclusive_p (bufp, p1, p2))
5453 {
5454 /* Use a fast `on_failure_keep_string_jump' loop. */
5455 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5456 *p3 = (unsigned char) on_failure_keep_string_jump;
5457 STORE_NUMBER (p2 - 2, mcnt + 3);
5458 }
5459 else
5460 {
5461 /* Default to a safe `on_failure_jump' loop. */
5462 DEBUG_PRINT1 (" smart default => slow loop.\n");
5463 *p3 = (unsigned char) on_failure_jump;
5464 }
5465 DEBUG_STATEMENT (debug -= 2);
5466 }
5467 break;
5468
5469 /* Unconditionally jump (without popping any failure points). */
5470 case jump:
5471 unconditional_jump:
5472 IMMEDIATE_QUIT_CHECK;
5473 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5474 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5475 p += mcnt; /* Do the jump. */
5476 DEBUG_PRINT2 ("(to %p).\n", p);
5477 break;
5478
5479
5480 /* Have to succeed matching what follows at least n times.
5481 After that, handle like `on_failure_jump'. */
5482 case succeed_n:
5483 /* Signedness doesn't matter since we only compare MCNT to 0. */
5484 EXTRACT_NUMBER (mcnt, p + 2);
5485 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5486
5487 /* Originally, mcnt is how many times we HAVE to succeed. */
5488 if (mcnt != 0)
5489 {
5490 /* Here, we discard `const', making re_match non-reentrant. */
5491 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5492 mcnt--;
5493 p += 4;
5494 PUSH_NUMBER (p2, mcnt);
5495 }
5496 else
5497 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5498 goto on_failure;
5499 break;
5500
5501 case jump_n:
5502 /* Signedness doesn't matter since we only compare MCNT to 0. */
5503 EXTRACT_NUMBER (mcnt, p + 2);
5504 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5505
5506 /* Originally, this is how many times we CAN jump. */
5507 if (mcnt != 0)
5508 {
5509 /* Here, we discard `const', making re_match non-reentrant. */
5510 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5511 mcnt--;
5512 PUSH_NUMBER (p2, mcnt);
5513 goto unconditional_jump;
5514 }
5515 /* If don't have to jump any more, skip over the rest of command. */
5516 else
5517 p += 4;
5518 break;
5519
5520 case set_number_at:
5521 {
5522 unsigned char *p2; /* Location of the counter. */
5523 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5524
5525 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5526 /* Here, we discard `const', making re_match non-reentrant. */
5527 p2 = (unsigned char*) p + mcnt;
5528 /* Signedness doesn't matter since we only copy MCNT's bits . */
5529 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5530 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5531 PUSH_NUMBER (p2, mcnt);
5532 break;
5533 }
5534
5535 case wordbound:
5536 case notwordbound:
5537 not = (re_opcode_t) *(p - 1) == notwordbound;
5538 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5539
5540 /* We SUCCEED (or FAIL) in one of the following cases: */
5541
5542 /* Case 1: D is at the beginning or the end of string. */
5543 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5544 not = !not;
5545 else
5546 {
5547 /* C1 is the character before D, S1 is the syntax of C1, C2
5548 is the character at D, and S2 is the syntax of C2. */
5549 re_wchar_t c1, c2;
5550 int s1, s2;
5551 int dummy;
5552 #ifdef emacs
5553 int offset = PTR_TO_OFFSET (d - 1);
5554 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5555 UPDATE_SYNTAX_TABLE (charpos);
5556 #endif
5557 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5558 s1 = SYNTAX (c1);
5559 #ifdef emacs
5560 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5561 #endif
5562 PREFETCH_NOLIMIT ();
5563 GET_CHAR_AFTER (c2, d, dummy);
5564 s2 = SYNTAX (c2);
5565
5566 if (/* Case 2: Only one of S1 and S2 is Sword. */
5567 ((s1 == Sword) != (s2 == Sword))
5568 /* Case 3: Both of S1 and S2 are Sword, and macro
5569 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5570 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5571 not = !not;
5572 }
5573 if (not)
5574 break;
5575 else
5576 goto fail;
5577
5578 case wordbeg:
5579 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5580
5581 /* We FAIL in one of the following cases: */
5582
5583 /* Case 1: D is at the end of string. */
5584 if (AT_STRINGS_END (d))
5585 goto fail;
5586 else
5587 {
5588 /* C1 is the character before D, S1 is the syntax of C1, C2
5589 is the character at D, and S2 is the syntax of C2. */
5590 re_wchar_t c1, c2;
5591 int s1, s2;
5592 int dummy;
5593 #ifdef emacs
5594 int offset = PTR_TO_OFFSET (d);
5595 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5596 UPDATE_SYNTAX_TABLE (charpos);
5597 #endif
5598 PREFETCH ();
5599 GET_CHAR_AFTER (c2, d, dummy);
5600 s2 = SYNTAX (c2);
5601
5602 /* Case 2: S2 is not Sword. */
5603 if (s2 != Sword)
5604 goto fail;
5605
5606 /* Case 3: D is not at the beginning of string ... */
5607 if (!AT_STRINGS_BEG (d))
5608 {
5609 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5610 #ifdef emacs
5611 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5612 #endif
5613 s1 = SYNTAX (c1);
5614
5615 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5616 returns 0. */
5617 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5618 goto fail;
5619 }
5620 }
5621 break;
5622
5623 case wordend:
5624 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5625
5626 /* We FAIL in one of the following cases: */
5627
5628 /* Case 1: D is at the beginning of string. */
5629 if (AT_STRINGS_BEG (d))
5630 goto fail;
5631 else
5632 {
5633 /* C1 is the character before D, S1 is the syntax of C1, C2
5634 is the character at D, and S2 is the syntax of C2. */
5635 re_wchar_t c1, c2;
5636 int s1, s2;
5637 int dummy;
5638 #ifdef emacs
5639 int offset = PTR_TO_OFFSET (d) - 1;
5640 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5641 UPDATE_SYNTAX_TABLE (charpos);
5642 #endif
5643 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5644 s1 = SYNTAX (c1);
5645
5646 /* Case 2: S1 is not Sword. */
5647 if (s1 != Sword)
5648 goto fail;
5649
5650 /* Case 3: D is not at the end of string ... */
5651 if (!AT_STRINGS_END (d))
5652 {
5653 PREFETCH_NOLIMIT ();
5654 GET_CHAR_AFTER (c2, d, dummy);
5655 #ifdef emacs
5656 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5657 #endif
5658 s2 = SYNTAX (c2);
5659
5660 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5661 returns 0. */
5662 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5663 goto fail;
5664 }
5665 }
5666 break;
5667
5668 case syntaxspec:
5669 case notsyntaxspec:
5670 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
5671 mcnt = *p++;
5672 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
5673 PREFETCH ();
5674 #ifdef emacs
5675 {
5676 int offset = PTR_TO_OFFSET (d);
5677 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5678 UPDATE_SYNTAX_TABLE (pos1);
5679 }
5680 #endif
5681 {
5682 int len;
5683 re_wchar_t c;
5684
5685 GET_CHAR_AFTER (c, d, len);
5686 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5687 goto fail;
5688 d += len;
5689 }
5690 break;
5691
5692 #ifdef emacs
5693 case before_dot:
5694 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5695 if (PTR_BYTE_POS (d) >= PT_BYTE)
5696 goto fail;
5697 break;
5698
5699 case at_dot:
5700 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5701 if (PTR_BYTE_POS (d) != PT_BYTE)
5702 goto fail;
5703 break;
5704
5705 case after_dot:
5706 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5707 if (PTR_BYTE_POS (d) <= PT_BYTE)
5708 goto fail;
5709 break;
5710
5711 case categoryspec:
5712 case notcategoryspec:
5713 not = (re_opcode_t) *(p - 1) == notcategoryspec;
5714 mcnt = *p++;
5715 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
5716 PREFETCH ();
5717 {
5718 int len;
5719 re_wchar_t c;
5720
5721 GET_CHAR_AFTER (c, d, len);
5722
5723 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5724 goto fail;
5725 d += len;
5726 }
5727 break;
5728
5729 #endif /* emacs */
5730
5731 default:
5732 abort ();
5733 }
5734 continue; /* Successfully executed one pattern command; keep going. */
5735
5736
5737 /* We goto here if a matching operation fails. */
5738 fail:
5739 IMMEDIATE_QUIT_CHECK;
5740 if (!FAIL_STACK_EMPTY ())
5741 {
5742 re_char *str, *pat;
5743 /* A restart point is known. Restore to that state. */
5744 DEBUG_PRINT1 ("\nFAIL:\n");
5745 POP_FAILURE_POINT (str, pat);
5746 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5747 {
5748 case on_failure_keep_string_jump:
5749 assert (str == NULL);
5750 goto continue_failure_jump;
5751
5752 case on_failure_jump_nastyloop:
5753 assert ((re_opcode_t)pat[-2] == no_op);
5754 PUSH_FAILURE_POINT (pat - 2, str);
5755 /* Fallthrough */
5756
5757 case on_failure_jump_loop:
5758 case on_failure_jump:
5759 case succeed_n:
5760 d = str;
5761 continue_failure_jump:
5762 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5763 p = pat + mcnt;
5764 break;
5765
5766 case no_op:
5767 /* A special frame used for nastyloops. */
5768 goto fail;
5769
5770 default:
5771 abort();
5772 }
5773
5774 assert (p >= bufp->buffer && p <= pend);
5775
5776 if (d >= string1 && d <= end1)
5777 dend = end_match_1;
5778 }
5779 else
5780 break; /* Matching at this starting point really fails. */
5781 } /* for (;;) */
5782
5783 if (best_regs_set)
5784 goto restore_best_regs;
5785
5786 FREE_VARIABLES ();
5787
5788 return -1; /* Failure to match. */
5789 } /* re_match_2 */
5790 \f
5791 /* Subroutine definitions for re_match_2. */
5792
5793 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5794 bytes; nonzero otherwise. */
5795
5796 static int
5797 bcmp_translate (s1, s2, len, translate, multibyte)
5798 re_char *s1, *s2;
5799 register int len;
5800 RE_TRANSLATE_TYPE translate;
5801 const int multibyte;
5802 {
5803 register re_char *p1 = s1, *p2 = s2;
5804 re_char *p1_end = s1 + len;
5805 re_char *p2_end = s2 + len;
5806
5807 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5808 different lengths, but relying on a single `len' would break this. -sm */
5809 while (p1 < p1_end && p2 < p2_end)
5810 {
5811 int p1_charlen, p2_charlen;
5812 re_wchar_t p1_ch, p2_ch;
5813
5814 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
5815 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
5816
5817 if (RE_TRANSLATE (translate, p1_ch)
5818 != RE_TRANSLATE (translate, p2_ch))
5819 return 1;
5820
5821 p1 += p1_charlen, p2 += p2_charlen;
5822 }
5823
5824 if (p1 != p1_end || p2 != p2_end)
5825 return 1;
5826
5827 return 0;
5828 }
5829 \f
5830 /* Entry points for GNU code. */
5831
5832 /* re_compile_pattern is the GNU regular expression compiler: it
5833 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5834 Returns 0 if the pattern was valid, otherwise an error string.
5835
5836 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5837 are set in BUFP on entry.
5838
5839 We call regex_compile to do the actual compilation. */
5840
5841 const char *
5842 re_compile_pattern (pattern, length, bufp)
5843 const char *pattern;
5844 size_t length;
5845 struct re_pattern_buffer *bufp;
5846 {
5847 reg_errcode_t ret;
5848
5849 /* GNU code is written to assume at least RE_NREGS registers will be set
5850 (and at least one extra will be -1). */
5851 bufp->regs_allocated = REGS_UNALLOCATED;
5852
5853 /* And GNU code determines whether or not to get register information
5854 by passing null for the REGS argument to re_match, etc., not by
5855 setting no_sub. */
5856 bufp->no_sub = 0;
5857
5858 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
5859
5860 if (!ret)
5861 return NULL;
5862 return gettext (re_error_msgid[(int) ret]);
5863 }
5864 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
5865 \f
5866 /* Entry points compatible with 4.2 BSD regex library. We don't define
5867 them unless specifically requested. */
5868
5869 #if defined _REGEX_RE_COMP || defined _LIBC
5870
5871 /* BSD has one and only one pattern buffer. */
5872 static struct re_pattern_buffer re_comp_buf;
5873
5874 char *
5875 # ifdef _LIBC
5876 /* Make these definitions weak in libc, so POSIX programs can redefine
5877 these names if they don't use our functions, and still use
5878 regcomp/regexec below without link errors. */
5879 weak_function
5880 # endif
5881 re_comp (s)
5882 const char *s;
5883 {
5884 reg_errcode_t ret;
5885
5886 if (!s)
5887 {
5888 if (!re_comp_buf.buffer)
5889 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5890 return (char *) gettext ("No previous regular expression");
5891 return 0;
5892 }
5893
5894 if (!re_comp_buf.buffer)
5895 {
5896 re_comp_buf.buffer = (unsigned char *) malloc (200);
5897 if (re_comp_buf.buffer == NULL)
5898 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5899 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5900 re_comp_buf.allocated = 200;
5901
5902 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5903 if (re_comp_buf.fastmap == NULL)
5904 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5905 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5906 }
5907
5908 /* Since `re_exec' always passes NULL for the `regs' argument, we
5909 don't need to initialize the pattern buffer fields which affect it. */
5910
5911 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5912
5913 if (!ret)
5914 return NULL;
5915
5916 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5917 return (char *) gettext (re_error_msgid[(int) ret]);
5918 }
5919
5920
5921 int
5922 # ifdef _LIBC
5923 weak_function
5924 # endif
5925 re_exec (s)
5926 const char *s;
5927 {
5928 const int len = strlen (s);
5929 return
5930 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5931 }
5932 #endif /* _REGEX_RE_COMP */
5933 \f
5934 /* POSIX.2 functions. Don't define these for Emacs. */
5935
5936 #ifndef emacs
5937
5938 /* regcomp takes a regular expression as a string and compiles it.
5939
5940 PREG is a regex_t *. We do not expect any fields to be initialized,
5941 since POSIX says we shouldn't. Thus, we set
5942
5943 `buffer' to the compiled pattern;
5944 `used' to the length of the compiled pattern;
5945 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5946 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5947 RE_SYNTAX_POSIX_BASIC;
5948 `fastmap' to an allocated space for the fastmap;
5949 `fastmap_accurate' to zero;
5950 `re_nsub' to the number of subexpressions in PATTERN.
5951
5952 PATTERN is the address of the pattern string.
5953
5954 CFLAGS is a series of bits which affect compilation.
5955
5956 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5957 use POSIX basic syntax.
5958
5959 If REG_NEWLINE is set, then . and [^...] don't match newline.
5960 Also, regexec will try a match beginning after every newline.
5961
5962 If REG_ICASE is set, then we considers upper- and lowercase
5963 versions of letters to be equivalent when matching.
5964
5965 If REG_NOSUB is set, then when PREG is passed to regexec, that
5966 routine will report only success or failure, and nothing about the
5967 registers.
5968
5969 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5970 the return codes and their meanings.) */
5971
5972 int
5973 regcomp (preg, pattern, cflags)
5974 regex_t *__restrict preg;
5975 const char *__restrict pattern;
5976 int cflags;
5977 {
5978 reg_errcode_t ret;
5979 reg_syntax_t syntax
5980 = (cflags & REG_EXTENDED) ?
5981 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5982
5983 /* regex_compile will allocate the space for the compiled pattern. */
5984 preg->buffer = 0;
5985 preg->allocated = 0;
5986 preg->used = 0;
5987
5988 /* Try to allocate space for the fastmap. */
5989 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5990
5991 if (cflags & REG_ICASE)
5992 {
5993 unsigned i;
5994
5995 preg->translate
5996 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5997 * sizeof (*(RE_TRANSLATE_TYPE)0));
5998 if (preg->translate == NULL)
5999 return (int) REG_ESPACE;
6000
6001 /* Map uppercase characters to corresponding lowercase ones. */
6002 for (i = 0; i < CHAR_SET_SIZE; i++)
6003 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6004 }
6005 else
6006 preg->translate = NULL;
6007
6008 /* If REG_NEWLINE is set, newlines are treated differently. */
6009 if (cflags & REG_NEWLINE)
6010 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6011 syntax &= ~RE_DOT_NEWLINE;
6012 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6013 }
6014 else
6015 syntax |= RE_NO_NEWLINE_ANCHOR;
6016
6017 preg->no_sub = !!(cflags & REG_NOSUB);
6018
6019 /* POSIX says a null character in the pattern terminates it, so we
6020 can use strlen here in compiling the pattern. */
6021 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
6022
6023 /* POSIX doesn't distinguish between an unmatched open-group and an
6024 unmatched close-group: both are REG_EPAREN. */
6025 if (ret == REG_ERPAREN)
6026 ret = REG_EPAREN;
6027
6028 if (ret == REG_NOERROR && preg->fastmap)
6029 { /* Compute the fastmap now, since regexec cannot modify the pattern
6030 buffer. */
6031 re_compile_fastmap (preg);
6032 if (preg->can_be_null)
6033 { /* The fastmap can't be used anyway. */
6034 free (preg->fastmap);
6035 preg->fastmap = NULL;
6036 }
6037 }
6038 return (int) ret;
6039 }
6040 WEAK_ALIAS (__regcomp, regcomp)
6041
6042
6043 /* regexec searches for a given pattern, specified by PREG, in the
6044 string STRING.
6045
6046 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6047 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6048 least NMATCH elements, and we set them to the offsets of the
6049 corresponding matched substrings.
6050
6051 EFLAGS specifies `execution flags' which affect matching: if
6052 REG_NOTBOL is set, then ^ does not match at the beginning of the
6053 string; if REG_NOTEOL is set, then $ does not match at the end.
6054
6055 We return 0 if we find a match and REG_NOMATCH if not. */
6056
6057 int
6058 regexec (preg, string, nmatch, pmatch, eflags)
6059 const regex_t *__restrict preg;
6060 const char *__restrict string;
6061 size_t nmatch;
6062 regmatch_t pmatch[];
6063 int eflags;
6064 {
6065 int ret;
6066 struct re_registers regs;
6067 regex_t private_preg;
6068 int len = strlen (string);
6069 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6070
6071 private_preg = *preg;
6072
6073 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6074 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6075
6076 /* The user has told us exactly how many registers to return
6077 information about, via `nmatch'. We have to pass that on to the
6078 matching routines. */
6079 private_preg.regs_allocated = REGS_FIXED;
6080
6081 if (want_reg_info)
6082 {
6083 regs.num_regs = nmatch;
6084 regs.start = TALLOC (nmatch * 2, regoff_t);
6085 if (regs.start == NULL)
6086 return (int) REG_NOMATCH;
6087 regs.end = regs.start + nmatch;
6088 }
6089
6090 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6091 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6092 was a little bit longer but still only matching the real part.
6093 This works because the `endline' will check for a '\n' and will find a
6094 '\0', correctly deciding that this is not the end of a line.
6095 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6096 a convenient '\0' there. For all we know, the string could be preceded
6097 by '\n' which would throw things off. */
6098
6099 /* Perform the searching operation. */
6100 ret = re_search (&private_preg, string, len,
6101 /* start: */ 0, /* range: */ len,
6102 want_reg_info ? &regs : (struct re_registers *) 0);
6103
6104 /* Copy the register information to the POSIX structure. */
6105 if (want_reg_info)
6106 {
6107 if (ret >= 0)
6108 {
6109 unsigned r;
6110
6111 for (r = 0; r < nmatch; r++)
6112 {
6113 pmatch[r].rm_so = regs.start[r];
6114 pmatch[r].rm_eo = regs.end[r];
6115 }
6116 }
6117
6118 /* If we needed the temporary register info, free the space now. */
6119 free (regs.start);
6120 }
6121
6122 /* We want zero return to mean success, unlike `re_search'. */
6123 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6124 }
6125 WEAK_ALIAS (__regexec, regexec)
6126
6127
6128 /* Returns a message corresponding to an error code, ERRCODE, returned
6129 from either regcomp or regexec. We don't use PREG here. */
6130
6131 size_t
6132 regerror (errcode, preg, errbuf, errbuf_size)
6133 int errcode;
6134 const regex_t *preg;
6135 char *errbuf;
6136 size_t errbuf_size;
6137 {
6138 const char *msg;
6139 size_t msg_size;
6140
6141 if (errcode < 0
6142 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6143 /* Only error codes returned by the rest of the code should be passed
6144 to this routine. If we are given anything else, or if other regex
6145 code generates an invalid error code, then the program has a bug.
6146 Dump core so we can fix it. */
6147 abort ();
6148
6149 msg = gettext (re_error_msgid[errcode]);
6150
6151 msg_size = strlen (msg) + 1; /* Includes the null. */
6152
6153 if (errbuf_size != 0)
6154 {
6155 if (msg_size > errbuf_size)
6156 {
6157 strncpy (errbuf, msg, errbuf_size - 1);
6158 errbuf[errbuf_size - 1] = 0;
6159 }
6160 else
6161 strcpy (errbuf, msg);
6162 }
6163
6164 return msg_size;
6165 }
6166 WEAK_ALIAS (__regerror, regerror)
6167
6168
6169 /* Free dynamically allocated space used by PREG. */
6170
6171 void
6172 regfree (preg)
6173 regex_t *preg;
6174 {
6175 if (preg->buffer != NULL)
6176 free (preg->buffer);
6177 preg->buffer = NULL;
6178
6179 preg->allocated = 0;
6180 preg->used = 0;
6181
6182 if (preg->fastmap != NULL)
6183 free (preg->fastmap);
6184 preg->fastmap = NULL;
6185 preg->fastmap_accurate = 0;
6186
6187 if (preg->translate != NULL)
6188 free (preg->translate);
6189 preg->translate = NULL;
6190 }
6191 WEAK_ALIAS (__regfree, regfree)
6192
6193 #endif /* not emacs */