]> code.delx.au - gnu-emacs/blob - src/search.c
570bbe7258829b60b878e0124aa5f3ec3f97a927
[gnu-emacs] / src / search.c
1 /* String search routines for GNU Emacs.
2 Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2001, 2002,
3 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20
21
22 #include <config.h>
23 #include <setjmp.h>
24 #include "lisp.h"
25 #include "syntax.h"
26 #include "category.h"
27 #include "buffer.h"
28 #include "character.h"
29 #include "charset.h"
30 #include "region-cache.h"
31 #include "commands.h"
32 #include "blockinput.h"
33 #include "intervals.h"
34
35 #include <sys/types.h>
36 #include "regex.h"
37
38 #define REGEXP_CACHE_SIZE 20
39
40 /* If the regexp is non-nil, then the buffer contains the compiled form
41 of that regexp, suitable for searching. */
42 struct regexp_cache
43 {
44 struct regexp_cache *next;
45 Lisp_Object regexp, whitespace_regexp;
46 /* Syntax table for which the regexp applies. We need this because
47 of character classes. If this is t, then the compiled pattern is valid
48 for any syntax-table. */
49 Lisp_Object syntax_table;
50 struct re_pattern_buffer buf;
51 char fastmap[0400];
52 /* Nonzero means regexp was compiled to do full POSIX backtracking. */
53 char posix;
54 };
55
56 /* The instances of that struct. */
57 struct regexp_cache searchbufs[REGEXP_CACHE_SIZE];
58
59 /* The head of the linked list; points to the most recently used buffer. */
60 struct regexp_cache *searchbuf_head;
61
62
63 /* Every call to re_match, etc., must pass &search_regs as the regs
64 argument unless you can show it is unnecessary (i.e., if re_match
65 is certainly going to be called again before region-around-match
66 can be called).
67
68 Since the registers are now dynamically allocated, we need to make
69 sure not to refer to the Nth register before checking that it has
70 been allocated by checking search_regs.num_regs.
71
72 The regex code keeps track of whether it has allocated the search
73 buffer using bits in the re_pattern_buffer. This means that whenever
74 you compile a new pattern, it completely forgets whether it has
75 allocated any registers, and will allocate new registers the next
76 time you call a searching or matching function. Therefore, we need
77 to call re_set_registers after compiling a new pattern or after
78 setting the match registers, so that the regex functions will be
79 able to free or re-allocate it properly. */
80 static struct re_registers search_regs;
81
82 /* The buffer in which the last search was performed, or
83 Qt if the last search was done in a string;
84 Qnil if no searching has been done yet. */
85 static Lisp_Object last_thing_searched;
86
87 /* error condition signaled when regexp compile_pattern fails */
88
89 Lisp_Object Qinvalid_regexp;
90
91 /* Error condition used for failing searches */
92 Lisp_Object Qsearch_failed;
93
94 static void set_search_regs (EMACS_INT, EMACS_INT);
95 static void save_search_regs (void);
96 static EMACS_INT simple_search (EMACS_INT, unsigned char *, EMACS_INT,
97 EMACS_INT, Lisp_Object, EMACS_INT, EMACS_INT,
98 EMACS_INT, EMACS_INT);
99 static EMACS_INT boyer_moore (EMACS_INT, unsigned char *, EMACS_INT, EMACS_INT,
100 Lisp_Object, Lisp_Object,
101 EMACS_INT, EMACS_INT,
102 EMACS_INT, EMACS_INT, int);
103 static EMACS_INT search_buffer (Lisp_Object, EMACS_INT, EMACS_INT,
104 EMACS_INT, EMACS_INT, EMACS_INT, int,
105 Lisp_Object, Lisp_Object, int);
106 static void matcher_overflow (void) NO_RETURN;
107
108 static void
109 matcher_overflow (void)
110 {
111 error ("Stack overflow in regexp matcher");
112 }
113
114 /* Compile a regexp and signal a Lisp error if anything goes wrong.
115 PATTERN is the pattern to compile.
116 CP is the place to put the result.
117 TRANSLATE is a translation table for ignoring case, or nil for none.
118 REGP is the structure that says where to store the "register"
119 values that will result from matching this pattern.
120 If it is 0, we should compile the pattern not to record any
121 subexpression bounds.
122 POSIX is nonzero if we want full backtracking (POSIX style)
123 for this pattern. 0 means backtrack only enough to get a valid match.
124
125 The behavior also depends on Vsearch_spaces_regexp. */
126
127 static void
128 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, Lisp_Object translate, struct re_registers *regp, int posix)
129 {
130 char *val;
131 reg_syntax_t old;
132
133 cp->regexp = Qnil;
134 cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
135 cp->posix = posix;
136 cp->buf.multibyte = STRING_MULTIBYTE (pattern);
137 cp->buf.charset_unibyte = charset_unibyte;
138 if (STRINGP (Vsearch_spaces_regexp))
139 cp->whitespace_regexp = Vsearch_spaces_regexp;
140 else
141 cp->whitespace_regexp = Qnil;
142
143 /* rms: I think BLOCK_INPUT is not needed here any more,
144 because regex.c defines malloc to call xmalloc.
145 Using BLOCK_INPUT here means the debugger won't run if an error occurs.
146 So let's turn it off. */
147 /* BLOCK_INPUT; */
148 old = re_set_syntax (RE_SYNTAX_EMACS
149 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
150
151 if (STRINGP (Vsearch_spaces_regexp))
152 re_set_whitespace_regexp (SDATA (Vsearch_spaces_regexp));
153 else
154 re_set_whitespace_regexp (NULL);
155
156 val = (char *) re_compile_pattern (SSDATA (pattern),
157 SBYTES (pattern), &cp->buf);
158
159 /* If the compiled pattern hard codes some of the contents of the
160 syntax-table, it can only be reused with *this* syntax table. */
161 cp->syntax_table = cp->buf.used_syntax ? current_buffer->syntax_table : Qt;
162
163 re_set_whitespace_regexp (NULL);
164
165 re_set_syntax (old);
166 /* UNBLOCK_INPUT; */
167 if (val)
168 xsignal1 (Qinvalid_regexp, build_string (val));
169
170 cp->regexp = Fcopy_sequence (pattern);
171 }
172
173 /* Shrink each compiled regexp buffer in the cache
174 to the size actually used right now.
175 This is called from garbage collection. */
176
177 void
178 shrink_regexp_cache (void)
179 {
180 struct regexp_cache *cp;
181
182 for (cp = searchbuf_head; cp != 0; cp = cp->next)
183 {
184 cp->buf.allocated = cp->buf.used;
185 cp->buf.buffer
186 = (unsigned char *) xrealloc (cp->buf.buffer, cp->buf.used);
187 }
188 }
189
190 /* Clear the regexp cache w.r.t. a particular syntax table,
191 because it was changed.
192 There is no danger of memory leak here because re_compile_pattern
193 automagically manages the memory in each re_pattern_buffer struct,
194 based on its `allocated' and `buffer' values. */
195 void
196 clear_regexp_cache (void)
197 {
198 int i;
199
200 for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
201 /* It's tempting to compare with the syntax-table we've actually changed,
202 but it's not sufficient because char-table inheritance means that
203 modifying one syntax-table can change others at the same time. */
204 if (!EQ (searchbufs[i].syntax_table, Qt))
205 searchbufs[i].regexp = Qnil;
206 }
207
208 /* Compile a regexp if necessary, but first check to see if there's one in
209 the cache.
210 PATTERN is the pattern to compile.
211 TRANSLATE is a translation table for ignoring case, or nil for none.
212 REGP is the structure that says where to store the "register"
213 values that will result from matching this pattern.
214 If it is 0, we should compile the pattern not to record any
215 subexpression bounds.
216 POSIX is nonzero if we want full backtracking (POSIX style)
217 for this pattern. 0 means backtrack only enough to get a valid match. */
218
219 struct re_pattern_buffer *
220 compile_pattern (Lisp_Object pattern, struct re_registers *regp, Lisp_Object translate, int posix, int multibyte)
221 {
222 struct regexp_cache *cp, **cpp;
223
224 for (cpp = &searchbuf_head; ; cpp = &cp->next)
225 {
226 cp = *cpp;
227 /* Entries are initialized to nil, and may be set to nil by
228 compile_pattern_1 if the pattern isn't valid. Don't apply
229 string accessors in those cases. However, compile_pattern_1
230 is only applied to the cache entry we pick here to reuse. So
231 nil should never appear before a non-nil entry. */
232 if (NILP (cp->regexp))
233 goto compile_it;
234 if (SCHARS (cp->regexp) == SCHARS (pattern)
235 && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern)
236 && !NILP (Fstring_equal (cp->regexp, pattern))
237 && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
238 && cp->posix == posix
239 && (EQ (cp->syntax_table, Qt)
240 || EQ (cp->syntax_table, current_buffer->syntax_table))
241 && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp))
242 && cp->buf.charset_unibyte == charset_unibyte)
243 break;
244
245 /* If we're at the end of the cache, compile into the nil cell
246 we found, or the last (least recently used) cell with a
247 string value. */
248 if (cp->next == 0)
249 {
250 compile_it:
251 compile_pattern_1 (cp, pattern, translate, regp, posix);
252 break;
253 }
254 }
255
256 /* When we get here, cp (aka *cpp) contains the compiled pattern,
257 either because we found it in the cache or because we just compiled it.
258 Move it to the front of the queue to mark it as most recently used. */
259 *cpp = cp->next;
260 cp->next = searchbuf_head;
261 searchbuf_head = cp;
262
263 /* Advise the searching functions about the space we have allocated
264 for register data. */
265 if (regp)
266 re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end);
267
268 /* The compiled pattern can be used both for multibyte and unibyte
269 target. But, we have to tell which the pattern is used for. */
270 cp->buf.target_multibyte = multibyte;
271
272 return &cp->buf;
273 }
274
275 \f
276 static Lisp_Object
277 looking_at_1 (Lisp_Object string, int posix)
278 {
279 Lisp_Object val;
280 unsigned char *p1, *p2;
281 EMACS_INT s1, s2;
282 register EMACS_INT i;
283 struct re_pattern_buffer *bufp;
284
285 if (running_asynch_code)
286 save_search_regs ();
287
288 /* This is so set_image_of_range_1 in regex.c can find the EQV table. */
289 XCHAR_TABLE (current_buffer->case_canon_table)->extras[2]
290 = current_buffer->case_eqv_table;
291
292 CHECK_STRING (string);
293 bufp = compile_pattern (string,
294 (NILP (Vinhibit_changing_match_data)
295 ? &search_regs : NULL),
296 (!NILP (current_buffer->case_fold_search)
297 ? current_buffer->case_canon_table : Qnil),
298 posix,
299 !NILP (current_buffer->enable_multibyte_characters));
300
301 immediate_quit = 1;
302 QUIT; /* Do a pending quit right away, to avoid paradoxical behavior */
303
304 /* Get pointers and sizes of the two strings
305 that make up the visible portion of the buffer. */
306
307 p1 = BEGV_ADDR;
308 s1 = GPT_BYTE - BEGV_BYTE;
309 p2 = GAP_END_ADDR;
310 s2 = ZV_BYTE - GPT_BYTE;
311 if (s1 < 0)
312 {
313 p2 = p1;
314 s2 = ZV_BYTE - BEGV_BYTE;
315 s1 = 0;
316 }
317 if (s2 < 0)
318 {
319 s1 = ZV_BYTE - BEGV_BYTE;
320 s2 = 0;
321 }
322
323 re_match_object = Qnil;
324
325 i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2,
326 PT_BYTE - BEGV_BYTE,
327 (NILP (Vinhibit_changing_match_data)
328 ? &search_regs : NULL),
329 ZV_BYTE - BEGV_BYTE);
330 immediate_quit = 0;
331
332 if (i == -2)
333 matcher_overflow ();
334
335 val = (0 <= i ? Qt : Qnil);
336 if (NILP (Vinhibit_changing_match_data) && i >= 0)
337 for (i = 0; i < search_regs.num_regs; i++)
338 if (search_regs.start[i] >= 0)
339 {
340 search_regs.start[i]
341 = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
342 search_regs.end[i]
343 = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
344 }
345
346 /* Set last_thing_searched only when match data is changed. */
347 if (NILP (Vinhibit_changing_match_data))
348 XSETBUFFER (last_thing_searched, current_buffer);
349
350 return val;
351 }
352
353 DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 1, 0,
354 doc: /* Return t if text after point matches regular expression REGEXP.
355 This function modifies the match data that `match-beginning',
356 `match-end' and `match-data' access; save and restore the match
357 data if you want to preserve them. */)
358 (Lisp_Object regexp)
359 {
360 return looking_at_1 (regexp, 0);
361 }
362
363 DEFUN ("posix-looking-at", Fposix_looking_at, Sposix_looking_at, 1, 1, 0,
364 doc: /* Return t if text after point matches regular expression REGEXP.
365 Find the longest match, in accord with Posix regular expression rules.
366 This function modifies the match data that `match-beginning',
367 `match-end' and `match-data' access; save and restore the match
368 data if you want to preserve them. */)
369 (Lisp_Object regexp)
370 {
371 return looking_at_1 (regexp, 1);
372 }
373 \f
374 static Lisp_Object
375 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, int posix)
376 {
377 int val;
378 struct re_pattern_buffer *bufp;
379 EMACS_INT pos, pos_byte;
380 int i;
381
382 if (running_asynch_code)
383 save_search_regs ();
384
385 CHECK_STRING (regexp);
386 CHECK_STRING (string);
387
388 if (NILP (start))
389 pos = 0, pos_byte = 0;
390 else
391 {
392 EMACS_INT len = SCHARS (string);
393
394 CHECK_NUMBER (start);
395 pos = XINT (start);
396 if (pos < 0 && -pos <= len)
397 pos = len + pos;
398 else if (0 > pos || pos > len)
399 args_out_of_range (string, start);
400 pos_byte = string_char_to_byte (string, pos);
401 }
402
403 /* This is so set_image_of_range_1 in regex.c can find the EQV table. */
404 XCHAR_TABLE (current_buffer->case_canon_table)->extras[2]
405 = current_buffer->case_eqv_table;
406
407 bufp = compile_pattern (regexp,
408 (NILP (Vinhibit_changing_match_data)
409 ? &search_regs : NULL),
410 (!NILP (current_buffer->case_fold_search)
411 ? current_buffer->case_canon_table : Qnil),
412 posix,
413 STRING_MULTIBYTE (string));
414 immediate_quit = 1;
415 re_match_object = string;
416
417 val = re_search (bufp, SSDATA (string),
418 SBYTES (string), pos_byte,
419 SBYTES (string) - pos_byte,
420 (NILP (Vinhibit_changing_match_data)
421 ? &search_regs : NULL));
422 immediate_quit = 0;
423
424 /* Set last_thing_searched only when match data is changed. */
425 if (NILP (Vinhibit_changing_match_data))
426 last_thing_searched = Qt;
427
428 if (val == -2)
429 matcher_overflow ();
430 if (val < 0) return Qnil;
431
432 if (NILP (Vinhibit_changing_match_data))
433 for (i = 0; i < search_regs.num_regs; i++)
434 if (search_regs.start[i] >= 0)
435 {
436 search_regs.start[i]
437 = string_byte_to_char (string, search_regs.start[i]);
438 search_regs.end[i]
439 = string_byte_to_char (string, search_regs.end[i]);
440 }
441
442 return make_number (string_byte_to_char (string, val));
443 }
444
445 DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
446 doc: /* Return index of start of first match for REGEXP in STRING, or nil.
447 Matching ignores case if `case-fold-search' is non-nil.
448 If third arg START is non-nil, start search at that index in STRING.
449 For index of first char beyond the match, do (match-end 0).
450 `match-end' and `match-beginning' also give indices of substrings
451 matched by parenthesis constructs in the pattern.
452
453 You can use the function `match-string' to extract the substrings
454 matched by the parenthesis constructions in REGEXP. */)
455 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
456 {
457 return string_match_1 (regexp, string, start, 0);
458 }
459
460 DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0,
461 doc: /* Return index of start of first match for REGEXP in STRING, or nil.
462 Find the longest match, in accord with Posix regular expression rules.
463 Case is ignored if `case-fold-search' is non-nil in the current buffer.
464 If third arg START is non-nil, start search at that index in STRING.
465 For index of first char beyond the match, do (match-end 0).
466 `match-end' and `match-beginning' also give indices of substrings
467 matched by parenthesis constructs in the pattern. */)
468 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
469 {
470 return string_match_1 (regexp, string, start, 1);
471 }
472
473 /* Match REGEXP against STRING, searching all of STRING,
474 and return the index of the match, or negative on failure.
475 This does not clobber the match data. */
476
477 int
478 fast_string_match (Lisp_Object regexp, Lisp_Object string)
479 {
480 int val;
481 struct re_pattern_buffer *bufp;
482
483 bufp = compile_pattern (regexp, 0, Qnil,
484 0, STRING_MULTIBYTE (string));
485 immediate_quit = 1;
486 re_match_object = string;
487
488 val = re_search (bufp, SSDATA (string),
489 SBYTES (string), 0,
490 SBYTES (string), 0);
491 immediate_quit = 0;
492 return val;
493 }
494
495 /* Match REGEXP against STRING, searching all of STRING ignoring case,
496 and return the index of the match, or negative on failure.
497 This does not clobber the match data.
498 We assume that STRING contains single-byte characters. */
499
500 int
501 fast_c_string_match_ignore_case (Lisp_Object regexp, const char *string)
502 {
503 int val;
504 struct re_pattern_buffer *bufp;
505 size_t len = strlen (string);
506
507 regexp = string_make_unibyte (regexp);
508 re_match_object = Qt;
509 bufp = compile_pattern (regexp, 0,
510 Vascii_canon_table, 0,
511 0);
512 immediate_quit = 1;
513 val = re_search (bufp, string, len, 0, len, 0);
514 immediate_quit = 0;
515 return val;
516 }
517
518 /* Like fast_string_match but ignore case. */
519
520 int
521 fast_string_match_ignore_case (Lisp_Object regexp, Lisp_Object string)
522 {
523 int val;
524 struct re_pattern_buffer *bufp;
525
526 bufp = compile_pattern (regexp, 0, Vascii_canon_table,
527 0, STRING_MULTIBYTE (string));
528 immediate_quit = 1;
529 re_match_object = string;
530
531 val = re_search (bufp, SSDATA (string),
532 SBYTES (string), 0,
533 SBYTES (string), 0);
534 immediate_quit = 0;
535 return val;
536 }
537 \f
538 /* Match REGEXP against the characters after POS to LIMIT, and return
539 the number of matched characters. If STRING is non-nil, match
540 against the characters in it. In that case, POS and LIMIT are
541 indices into the string. This function doesn't modify the match
542 data. */
543
544 EMACS_INT
545 fast_looking_at (Lisp_Object regexp, EMACS_INT pos, EMACS_INT pos_byte, EMACS_INT limit, EMACS_INT limit_byte, Lisp_Object string)
546 {
547 int multibyte;
548 struct re_pattern_buffer *buf;
549 unsigned char *p1, *p2;
550 EMACS_INT s1, s2;
551 EMACS_INT len;
552
553 if (STRINGP (string))
554 {
555 if (pos_byte < 0)
556 pos_byte = string_char_to_byte (string, pos);
557 if (limit_byte < 0)
558 limit_byte = string_char_to_byte (string, limit);
559 p1 = NULL;
560 s1 = 0;
561 p2 = SDATA (string);
562 s2 = SBYTES (string);
563 re_match_object = string;
564 multibyte = STRING_MULTIBYTE (string);
565 }
566 else
567 {
568 if (pos_byte < 0)
569 pos_byte = CHAR_TO_BYTE (pos);
570 if (limit_byte < 0)
571 limit_byte = CHAR_TO_BYTE (limit);
572 pos_byte -= BEGV_BYTE;
573 limit_byte -= BEGV_BYTE;
574 p1 = BEGV_ADDR;
575 s1 = GPT_BYTE - BEGV_BYTE;
576 p2 = GAP_END_ADDR;
577 s2 = ZV_BYTE - GPT_BYTE;
578 if (s1 < 0)
579 {
580 p2 = p1;
581 s2 = ZV_BYTE - BEGV_BYTE;
582 s1 = 0;
583 }
584 if (s2 < 0)
585 {
586 s1 = ZV_BYTE - BEGV_BYTE;
587 s2 = 0;
588 }
589 re_match_object = Qnil;
590 multibyte = ! NILP (current_buffer->enable_multibyte_characters);
591 }
592
593 buf = compile_pattern (regexp, 0, Qnil, 0, multibyte);
594 immediate_quit = 1;
595 len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2,
596 pos_byte, NULL, limit_byte);
597 immediate_quit = 0;
598
599 return len;
600 }
601
602 \f
603 /* The newline cache: remembering which sections of text have no newlines. */
604
605 /* If the user has requested newline caching, make sure it's on.
606 Otherwise, make sure it's off.
607 This is our cheezy way of associating an action with the change of
608 state of a buffer-local variable. */
609 static void
610 newline_cache_on_off (struct buffer *buf)
611 {
612 if (NILP (buf->cache_long_line_scans))
613 {
614 /* It should be off. */
615 if (buf->newline_cache)
616 {
617 free_region_cache (buf->newline_cache);
618 buf->newline_cache = 0;
619 }
620 }
621 else
622 {
623 /* It should be on. */
624 if (buf->newline_cache == 0)
625 buf->newline_cache = new_region_cache ();
626 }
627 }
628
629 \f
630 /* Search for COUNT instances of the character TARGET between START and END.
631
632 If COUNT is positive, search forwards; END must be >= START.
633 If COUNT is negative, search backwards for the -COUNTth instance;
634 END must be <= START.
635 If COUNT is zero, do anything you please; run rogue, for all I care.
636
637 If END is zero, use BEGV or ZV instead, as appropriate for the
638 direction indicated by COUNT.
639
640 If we find COUNT instances, set *SHORTAGE to zero, and return the
641 position past the COUNTth match. Note that for reverse motion
642 this is not the same as the usual convention for Emacs motion commands.
643
644 If we don't find COUNT instances before reaching END, set *SHORTAGE
645 to the number of TARGETs left unfound, and return END.
646
647 If ALLOW_QUIT is non-zero, set immediate_quit. That's good to do
648 except when inside redisplay. */
649
650 EMACS_INT
651 scan_buffer (register int target, EMACS_INT start, EMACS_INT end,
652 EMACS_INT count, int *shortage, int allow_quit)
653 {
654 struct region_cache *newline_cache;
655 int direction;
656
657 if (count > 0)
658 {
659 direction = 1;
660 if (! end) end = ZV;
661 }
662 else
663 {
664 direction = -1;
665 if (! end) end = BEGV;
666 }
667
668 newline_cache_on_off (current_buffer);
669 newline_cache = current_buffer->newline_cache;
670
671 if (shortage != 0)
672 *shortage = 0;
673
674 immediate_quit = allow_quit;
675
676 if (count > 0)
677 while (start != end)
678 {
679 /* Our innermost scanning loop is very simple; it doesn't know
680 about gaps, buffer ends, or the newline cache. ceiling is
681 the position of the last character before the next such
682 obstacle --- the last character the dumb search loop should
683 examine. */
684 EMACS_INT ceiling_byte = CHAR_TO_BYTE (end) - 1;
685 EMACS_INT start_byte = CHAR_TO_BYTE (start);
686 EMACS_INT tem;
687
688 /* If we're looking for a newline, consult the newline cache
689 to see where we can avoid some scanning. */
690 if (target == '\n' && newline_cache)
691 {
692 EMACS_INT next_change;
693 immediate_quit = 0;
694 while (region_cache_forward
695 (current_buffer, newline_cache, start_byte, &next_change))
696 start_byte = next_change;
697 immediate_quit = allow_quit;
698
699 /* START should never be after END. */
700 if (start_byte > ceiling_byte)
701 start_byte = ceiling_byte;
702
703 /* Now the text after start is an unknown region, and
704 next_change is the position of the next known region. */
705 ceiling_byte = min (next_change - 1, ceiling_byte);
706 }
707
708 /* The dumb loop can only scan text stored in contiguous
709 bytes. BUFFER_CEILING_OF returns the last character
710 position that is contiguous, so the ceiling is the
711 position after that. */
712 tem = BUFFER_CEILING_OF (start_byte);
713 ceiling_byte = min (tem, ceiling_byte);
714
715 {
716 /* The termination address of the dumb loop. */
717 register unsigned char *ceiling_addr
718 = BYTE_POS_ADDR (ceiling_byte) + 1;
719 register unsigned char *cursor
720 = BYTE_POS_ADDR (start_byte);
721 unsigned char *base = cursor;
722
723 while (cursor < ceiling_addr)
724 {
725 unsigned char *scan_start = cursor;
726
727 /* The dumb loop. */
728 while (*cursor != target && ++cursor < ceiling_addr)
729 ;
730
731 /* If we're looking for newlines, cache the fact that
732 the region from start to cursor is free of them. */
733 if (target == '\n' && newline_cache)
734 know_region_cache (current_buffer, newline_cache,
735 start_byte + scan_start - base,
736 start_byte + cursor - base);
737
738 /* Did we find the target character? */
739 if (cursor < ceiling_addr)
740 {
741 if (--count == 0)
742 {
743 immediate_quit = 0;
744 return BYTE_TO_CHAR (start_byte + cursor - base + 1);
745 }
746 cursor++;
747 }
748 }
749
750 start = BYTE_TO_CHAR (start_byte + cursor - base);
751 }
752 }
753 else
754 while (start > end)
755 {
756 /* The last character to check before the next obstacle. */
757 EMACS_INT ceiling_byte = CHAR_TO_BYTE (end);
758 EMACS_INT start_byte = CHAR_TO_BYTE (start);
759 EMACS_INT tem;
760
761 /* Consult the newline cache, if appropriate. */
762 if (target == '\n' && newline_cache)
763 {
764 EMACS_INT next_change;
765 immediate_quit = 0;
766 while (region_cache_backward
767 (current_buffer, newline_cache, start_byte, &next_change))
768 start_byte = next_change;
769 immediate_quit = allow_quit;
770
771 /* Start should never be at or before end. */
772 if (start_byte <= ceiling_byte)
773 start_byte = ceiling_byte + 1;
774
775 /* Now the text before start is an unknown region, and
776 next_change is the position of the next known region. */
777 ceiling_byte = max (next_change, ceiling_byte);
778 }
779
780 /* Stop scanning before the gap. */
781 tem = BUFFER_FLOOR_OF (start_byte - 1);
782 ceiling_byte = max (tem, ceiling_byte);
783
784 {
785 /* The termination address of the dumb loop. */
786 register unsigned char *ceiling_addr = BYTE_POS_ADDR (ceiling_byte);
787 register unsigned char *cursor = BYTE_POS_ADDR (start_byte - 1);
788 unsigned char *base = cursor;
789
790 while (cursor >= ceiling_addr)
791 {
792 unsigned char *scan_start = cursor;
793
794 while (*cursor != target && --cursor >= ceiling_addr)
795 ;
796
797 /* If we're looking for newlines, cache the fact that
798 the region from after the cursor to start is free of them. */
799 if (target == '\n' && newline_cache)
800 know_region_cache (current_buffer, newline_cache,
801 start_byte + cursor - base,
802 start_byte + scan_start - base);
803
804 /* Did we find the target character? */
805 if (cursor >= ceiling_addr)
806 {
807 if (++count >= 0)
808 {
809 immediate_quit = 0;
810 return BYTE_TO_CHAR (start_byte + cursor - base);
811 }
812 cursor--;
813 }
814 }
815
816 start = BYTE_TO_CHAR (start_byte + cursor - base);
817 }
818 }
819
820 immediate_quit = 0;
821 if (shortage != 0)
822 *shortage = count * direction;
823 return start;
824 }
825 \f
826 /* Search for COUNT instances of a line boundary, which means either a
827 newline or (if selective display enabled) a carriage return.
828 Start at START. If COUNT is negative, search backwards.
829
830 We report the resulting position by calling TEMP_SET_PT_BOTH.
831
832 If we find COUNT instances. we position after (always after,
833 even if scanning backwards) the COUNTth match, and return 0.
834
835 If we don't find COUNT instances before reaching the end of the
836 buffer (or the beginning, if scanning backwards), we return
837 the number of line boundaries left unfound, and position at
838 the limit we bumped up against.
839
840 If ALLOW_QUIT is non-zero, set immediate_quit. That's good to do
841 except in special cases. */
842
843 EMACS_INT
844 scan_newline (EMACS_INT start, EMACS_INT start_byte,
845 EMACS_INT limit, EMACS_INT limit_byte,
846 register EMACS_INT count, int allow_quit)
847 {
848 int direction = ((count > 0) ? 1 : -1);
849
850 register unsigned char *cursor;
851 unsigned char *base;
852
853 EMACS_INT ceiling;
854 register unsigned char *ceiling_addr;
855
856 int old_immediate_quit = immediate_quit;
857
858 /* The code that follows is like scan_buffer
859 but checks for either newline or carriage return. */
860
861 if (allow_quit)
862 immediate_quit++;
863
864 start_byte = CHAR_TO_BYTE (start);
865
866 if (count > 0)
867 {
868 while (start_byte < limit_byte)
869 {
870 ceiling = BUFFER_CEILING_OF (start_byte);
871 ceiling = min (limit_byte - 1, ceiling);
872 ceiling_addr = BYTE_POS_ADDR (ceiling) + 1;
873 base = (cursor = BYTE_POS_ADDR (start_byte));
874 while (1)
875 {
876 while (*cursor != '\n' && ++cursor != ceiling_addr)
877 ;
878
879 if (cursor != ceiling_addr)
880 {
881 if (--count == 0)
882 {
883 immediate_quit = old_immediate_quit;
884 start_byte = start_byte + cursor - base + 1;
885 start = BYTE_TO_CHAR (start_byte);
886 TEMP_SET_PT_BOTH (start, start_byte);
887 return 0;
888 }
889 else
890 if (++cursor == ceiling_addr)
891 break;
892 }
893 else
894 break;
895 }
896 start_byte += cursor - base;
897 }
898 }
899 else
900 {
901 while (start_byte > limit_byte)
902 {
903 ceiling = BUFFER_FLOOR_OF (start_byte - 1);
904 ceiling = max (limit_byte, ceiling);
905 ceiling_addr = BYTE_POS_ADDR (ceiling) - 1;
906 base = (cursor = BYTE_POS_ADDR (start_byte - 1) + 1);
907 while (1)
908 {
909 while (--cursor != ceiling_addr && *cursor != '\n')
910 ;
911
912 if (cursor != ceiling_addr)
913 {
914 if (++count == 0)
915 {
916 immediate_quit = old_immediate_quit;
917 /* Return the position AFTER the match we found. */
918 start_byte = start_byte + cursor - base + 1;
919 start = BYTE_TO_CHAR (start_byte);
920 TEMP_SET_PT_BOTH (start, start_byte);
921 return 0;
922 }
923 }
924 else
925 break;
926 }
927 /* Here we add 1 to compensate for the last decrement
928 of CURSOR, which took it past the valid range. */
929 start_byte += cursor - base + 1;
930 }
931 }
932
933 TEMP_SET_PT_BOTH (limit, limit_byte);
934 immediate_quit = old_immediate_quit;
935
936 return count * direction;
937 }
938
939 EMACS_INT
940 find_next_newline_no_quit (EMACS_INT from, EMACS_INT cnt)
941 {
942 return scan_buffer ('\n', from, 0, cnt, (int *) 0, 0);
943 }
944
945 /* Like find_next_newline, but returns position before the newline,
946 not after, and only search up to TO. This isn't just
947 find_next_newline (...)-1, because you might hit TO. */
948
949 EMACS_INT
950 find_before_next_newline (EMACS_INT from, EMACS_INT to, EMACS_INT cnt)
951 {
952 int shortage;
953 EMACS_INT pos = scan_buffer ('\n', from, to, cnt, &shortage, 1);
954
955 if (shortage == 0)
956 pos--;
957
958 return pos;
959 }
960 \f
961 /* Subroutines of Lisp buffer search functions. */
962
963 static Lisp_Object
964 search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror,
965 Lisp_Object count, int direction, int RE, int posix)
966 {
967 register int np;
968 EMACS_INT lim, lim_byte;
969 int n = direction;
970
971 if (!NILP (count))
972 {
973 CHECK_NUMBER (count);
974 n *= XINT (count);
975 }
976
977 CHECK_STRING (string);
978 if (NILP (bound))
979 {
980 if (n > 0)
981 lim = ZV, lim_byte = ZV_BYTE;
982 else
983 lim = BEGV, lim_byte = BEGV_BYTE;
984 }
985 else
986 {
987 CHECK_NUMBER_COERCE_MARKER (bound);
988 lim = XINT (bound);
989 if (n > 0 ? lim < PT : lim > PT)
990 error ("Invalid search bound (wrong side of point)");
991 if (lim > ZV)
992 lim = ZV, lim_byte = ZV_BYTE;
993 else if (lim < BEGV)
994 lim = BEGV, lim_byte = BEGV_BYTE;
995 else
996 lim_byte = CHAR_TO_BYTE (lim);
997 }
998
999 /* This is so set_image_of_range_1 in regex.c can find the EQV table. */
1000 XCHAR_TABLE (current_buffer->case_canon_table)->extras[2]
1001 = current_buffer->case_eqv_table;
1002
1003 np = search_buffer (string, PT, PT_BYTE, lim, lim_byte, n, RE,
1004 (!NILP (current_buffer->case_fold_search)
1005 ? current_buffer->case_canon_table
1006 : Qnil),
1007 (!NILP (current_buffer->case_fold_search)
1008 ? current_buffer->case_eqv_table
1009 : Qnil),
1010 posix);
1011 if (np <= 0)
1012 {
1013 if (NILP (noerror))
1014 xsignal1 (Qsearch_failed, string);
1015
1016 if (!EQ (noerror, Qt))
1017 {
1018 if (lim < BEGV || lim > ZV)
1019 abort ();
1020 SET_PT_BOTH (lim, lim_byte);
1021 return Qnil;
1022 #if 0 /* This would be clean, but maybe programs depend on
1023 a value of nil here. */
1024 np = lim;
1025 #endif
1026 }
1027 else
1028 return Qnil;
1029 }
1030
1031 if (np < BEGV || np > ZV)
1032 abort ();
1033
1034 SET_PT (np);
1035
1036 return make_number (np);
1037 }
1038 \f
1039 /* Return 1 if REGEXP it matches just one constant string. */
1040
1041 static int
1042 trivial_regexp_p (Lisp_Object regexp)
1043 {
1044 EMACS_INT len = SBYTES (regexp);
1045 unsigned char *s = SDATA (regexp);
1046 while (--len >= 0)
1047 {
1048 switch (*s++)
1049 {
1050 case '.': case '*': case '+': case '?': case '[': case '^': case '$':
1051 return 0;
1052 case '\\':
1053 if (--len < 0)
1054 return 0;
1055 switch (*s++)
1056 {
1057 case '|': case '(': case ')': case '`': case '\'': case 'b':
1058 case 'B': case '<': case '>': case 'w': case 'W': case 's':
1059 case 'S': case '=': case '{': case '}': case '_':
1060 case 'c': case 'C': /* for categoryspec and notcategoryspec */
1061 case '1': case '2': case '3': case '4': case '5':
1062 case '6': case '7': case '8': case '9':
1063 return 0;
1064 }
1065 }
1066 }
1067 return 1;
1068 }
1069
1070 /* Search for the n'th occurrence of STRING in the current buffer,
1071 starting at position POS and stopping at position LIM,
1072 treating STRING as a literal string if RE is false or as
1073 a regular expression if RE is true.
1074
1075 If N is positive, searching is forward and LIM must be greater than POS.
1076 If N is negative, searching is backward and LIM must be less than POS.
1077
1078 Returns -x if x occurrences remain to be found (x > 0),
1079 or else the position at the beginning of the Nth occurrence
1080 (if searching backward) or the end (if searching forward).
1081
1082 POSIX is nonzero if we want full backtracking (POSIX style)
1083 for this pattern. 0 means backtrack only enough to get a valid match. */
1084
1085 #define TRANSLATE(out, trt, d) \
1086 do \
1087 { \
1088 if (! NILP (trt)) \
1089 { \
1090 Lisp_Object temp; \
1091 temp = Faref (trt, make_number (d)); \
1092 if (INTEGERP (temp)) \
1093 out = XINT (temp); \
1094 else \
1095 out = d; \
1096 } \
1097 else \
1098 out = d; \
1099 } \
1100 while (0)
1101
1102 /* Only used in search_buffer, to record the end position of the match
1103 when searching regexps and SEARCH_REGS should not be changed
1104 (i.e. Vinhibit_changing_match_data is non-nil). */
1105 static struct re_registers search_regs_1;
1106
1107 static EMACS_INT
1108 search_buffer (Lisp_Object string, EMACS_INT pos, EMACS_INT pos_byte,
1109 EMACS_INT lim, EMACS_INT lim_byte, EMACS_INT n,
1110 int RE, Lisp_Object trt, Lisp_Object inverse_trt, int posix)
1111 {
1112 EMACS_INT len = SCHARS (string);
1113 EMACS_INT len_byte = SBYTES (string);
1114 register int i;
1115
1116 if (running_asynch_code)
1117 save_search_regs ();
1118
1119 /* Searching 0 times means don't move. */
1120 /* Null string is found at starting position. */
1121 if (len == 0 || n == 0)
1122 {
1123 set_search_regs (pos_byte, 0);
1124 return pos;
1125 }
1126
1127 if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp)))
1128 {
1129 unsigned char *p1, *p2;
1130 EMACS_INT s1, s2;
1131 struct re_pattern_buffer *bufp;
1132
1133 bufp = compile_pattern (string,
1134 (NILP (Vinhibit_changing_match_data)
1135 ? &search_regs : &search_regs_1),
1136 trt, posix,
1137 !NILP (current_buffer->enable_multibyte_characters));
1138
1139 immediate_quit = 1; /* Quit immediately if user types ^G,
1140 because letting this function finish
1141 can take too long. */
1142 QUIT; /* Do a pending quit right away,
1143 to avoid paradoxical behavior */
1144 /* Get pointers and sizes of the two strings
1145 that make up the visible portion of the buffer. */
1146
1147 p1 = BEGV_ADDR;
1148 s1 = GPT_BYTE - BEGV_BYTE;
1149 p2 = GAP_END_ADDR;
1150 s2 = ZV_BYTE - GPT_BYTE;
1151 if (s1 < 0)
1152 {
1153 p2 = p1;
1154 s2 = ZV_BYTE - BEGV_BYTE;
1155 s1 = 0;
1156 }
1157 if (s2 < 0)
1158 {
1159 s1 = ZV_BYTE - BEGV_BYTE;
1160 s2 = 0;
1161 }
1162 re_match_object = Qnil;
1163
1164 while (n < 0)
1165 {
1166 EMACS_INT val;
1167 val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1168 pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1169 (NILP (Vinhibit_changing_match_data)
1170 ? &search_regs : &search_regs_1),
1171 /* Don't allow match past current point */
1172 pos_byte - BEGV_BYTE);
1173 if (val == -2)
1174 {
1175 matcher_overflow ();
1176 }
1177 if (val >= 0)
1178 {
1179 if (NILP (Vinhibit_changing_match_data))
1180 {
1181 pos_byte = search_regs.start[0] + BEGV_BYTE;
1182 for (i = 0; i < search_regs.num_regs; i++)
1183 if (search_regs.start[i] >= 0)
1184 {
1185 search_regs.start[i]
1186 = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1187 search_regs.end[i]
1188 = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1189 }
1190 XSETBUFFER (last_thing_searched, current_buffer);
1191 /* Set pos to the new position. */
1192 pos = search_regs.start[0];
1193 }
1194 else
1195 {
1196 pos_byte = search_regs_1.start[0] + BEGV_BYTE;
1197 /* Set pos to the new position. */
1198 pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE);
1199 }
1200 }
1201 else
1202 {
1203 immediate_quit = 0;
1204 return (n);
1205 }
1206 n++;
1207 }
1208 while (n > 0)
1209 {
1210 EMACS_INT val;
1211 val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1212 pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1213 (NILP (Vinhibit_changing_match_data)
1214 ? &search_regs : &search_regs_1),
1215 lim_byte - BEGV_BYTE);
1216 if (val == -2)
1217 {
1218 matcher_overflow ();
1219 }
1220 if (val >= 0)
1221 {
1222 if (NILP (Vinhibit_changing_match_data))
1223 {
1224 pos_byte = search_regs.end[0] + BEGV_BYTE;
1225 for (i = 0; i < search_regs.num_regs; i++)
1226 if (search_regs.start[i] >= 0)
1227 {
1228 search_regs.start[i]
1229 = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1230 search_regs.end[i]
1231 = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1232 }
1233 XSETBUFFER (last_thing_searched, current_buffer);
1234 pos = search_regs.end[0];
1235 }
1236 else
1237 {
1238 pos_byte = search_regs_1.end[0] + BEGV_BYTE;
1239 pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE);
1240 }
1241 }
1242 else
1243 {
1244 immediate_quit = 0;
1245 return (0 - n);
1246 }
1247 n--;
1248 }
1249 immediate_quit = 0;
1250 return (pos);
1251 }
1252 else /* non-RE case */
1253 {
1254 unsigned char *raw_pattern, *pat;
1255 EMACS_INT raw_pattern_size;
1256 EMACS_INT raw_pattern_size_byte;
1257 unsigned char *patbuf;
1258 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
1259 unsigned char *base_pat;
1260 /* Set to positive if we find a non-ASCII char that need
1261 translation. Otherwise set to zero later. */
1262 int char_base = -1;
1263 int boyer_moore_ok = 1;
1264
1265 /* MULTIBYTE says whether the text to be searched is multibyte.
1266 We must convert PATTERN to match that, or we will not really
1267 find things right. */
1268
1269 if (multibyte == STRING_MULTIBYTE (string))
1270 {
1271 raw_pattern = SDATA (string);
1272 raw_pattern_size = SCHARS (string);
1273 raw_pattern_size_byte = SBYTES (string);
1274 }
1275 else if (multibyte)
1276 {
1277 raw_pattern_size = SCHARS (string);
1278 raw_pattern_size_byte
1279 = count_size_as_multibyte (SDATA (string),
1280 raw_pattern_size);
1281 raw_pattern = (unsigned char *) alloca (raw_pattern_size_byte + 1);
1282 copy_text (SDATA (string), raw_pattern,
1283 SCHARS (string), 0, 1);
1284 }
1285 else
1286 {
1287 /* Converting multibyte to single-byte.
1288
1289 ??? Perhaps this conversion should be done in a special way
1290 by subtracting nonascii-insert-offset from each non-ASCII char,
1291 so that only the multibyte chars which really correspond to
1292 the chosen single-byte character set can possibly match. */
1293 raw_pattern_size = SCHARS (string);
1294 raw_pattern_size_byte = SCHARS (string);
1295 raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
1296 copy_text (SDATA (string), raw_pattern,
1297 SBYTES (string), 1, 0);
1298 }
1299
1300 /* Copy and optionally translate the pattern. */
1301 len = raw_pattern_size;
1302 len_byte = raw_pattern_size_byte;
1303 patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
1304 pat = patbuf;
1305 base_pat = raw_pattern;
1306 if (multibyte)
1307 {
1308 /* Fill patbuf by translated characters in STRING while
1309 checking if we can use boyer-moore search. If TRT is
1310 non-nil, we can use boyer-moore search only if TRT can be
1311 represented by the byte array of 256 elements. For that,
1312 all non-ASCII case-equivalents of all case-senstive
1313 characters in STRING must belong to the same charset and
1314 row. */
1315
1316 while (--len >= 0)
1317 {
1318 unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1319 int c, translated, inverse;
1320 int in_charlen, charlen;
1321
1322 /* If we got here and the RE flag is set, it's because we're
1323 dealing with a regexp known to be trivial, so the backslash
1324 just quotes the next character. */
1325 if (RE && *base_pat == '\\')
1326 {
1327 len--;
1328 raw_pattern_size--;
1329 len_byte--;
1330 base_pat++;
1331 }
1332
1333 c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen);
1334
1335 if (NILP (trt))
1336 {
1337 str = base_pat;
1338 charlen = in_charlen;
1339 }
1340 else
1341 {
1342 /* Translate the character. */
1343 TRANSLATE (translated, trt, c);
1344 charlen = CHAR_STRING (translated, str_base);
1345 str = str_base;
1346
1347 /* Check if C has any other case-equivalents. */
1348 TRANSLATE (inverse, inverse_trt, c);
1349 /* If so, check if we can use boyer-moore. */
1350 if (c != inverse && boyer_moore_ok)
1351 {
1352 /* Check if all equivalents belong to the same
1353 group of characters. Note that the check of C
1354 itself is done by the last iteration. */
1355 int this_char_base = -1;
1356
1357 while (boyer_moore_ok)
1358 {
1359 if (ASCII_BYTE_P (inverse))
1360 {
1361 if (this_char_base > 0)
1362 boyer_moore_ok = 0;
1363 else
1364 this_char_base = 0;
1365 }
1366 else if (CHAR_BYTE8_P (inverse))
1367 /* Boyer-moore search can't handle a
1368 translation of an eight-bit
1369 character. */
1370 boyer_moore_ok = 0;
1371 else if (this_char_base < 0)
1372 {
1373 this_char_base = inverse & ~0x3F;
1374 if (char_base < 0)
1375 char_base = this_char_base;
1376 else if (this_char_base != char_base)
1377 boyer_moore_ok = 0;
1378 }
1379 else if ((inverse & ~0x3F) != this_char_base)
1380 boyer_moore_ok = 0;
1381 if (c == inverse)
1382 break;
1383 TRANSLATE (inverse, inverse_trt, inverse);
1384 }
1385 }
1386 }
1387
1388 /* Store this character into the translated pattern. */
1389 memcpy (pat, str, charlen);
1390 pat += charlen;
1391 base_pat += in_charlen;
1392 len_byte -= in_charlen;
1393 }
1394
1395 /* If char_base is still negative we didn't find any translated
1396 non-ASCII characters. */
1397 if (char_base < 0)
1398 char_base = 0;
1399 }
1400 else
1401 {
1402 /* Unibyte buffer. */
1403 char_base = 0;
1404 while (--len >= 0)
1405 {
1406 int c, translated;
1407
1408 /* If we got here and the RE flag is set, it's because we're
1409 dealing with a regexp known to be trivial, so the backslash
1410 just quotes the next character. */
1411 if (RE && *base_pat == '\\')
1412 {
1413 len--;
1414 raw_pattern_size--;
1415 base_pat++;
1416 }
1417 c = *base_pat++;
1418 TRANSLATE (translated, trt, c);
1419 *pat++ = translated;
1420 }
1421 }
1422
1423 len_byte = pat - patbuf;
1424 len = raw_pattern_size;
1425 pat = base_pat = patbuf;
1426
1427 if (boyer_moore_ok)
1428 return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1429 pos, pos_byte, lim, lim_byte,
1430 char_base);
1431 else
1432 return simple_search (n, pat, len, len_byte, trt,
1433 pos, pos_byte, lim, lim_byte);
1434 }
1435 }
1436 \f
1437 /* Do a simple string search N times for the string PAT,
1438 whose length is LEN/LEN_BYTE,
1439 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1440 TRT is the translation table.
1441
1442 Return the character position where the match is found.
1443 Otherwise, if M matches remained to be found, return -M.
1444
1445 This kind of search works regardless of what is in PAT and
1446 regardless of what is in TRT. It is used in cases where
1447 boyer_moore cannot work. */
1448
1449 static EMACS_INT
1450 simple_search (EMACS_INT n, unsigned char *pat,
1451 EMACS_INT len, EMACS_INT len_byte, Lisp_Object trt,
1452 EMACS_INT pos, EMACS_INT pos_byte,
1453 EMACS_INT lim, EMACS_INT lim_byte)
1454 {
1455 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
1456 int forward = n > 0;
1457 /* Number of buffer bytes matched. Note that this may be different
1458 from len_byte in a multibyte buffer. */
1459 EMACS_INT match_byte;
1460
1461 if (lim > pos && multibyte)
1462 while (n > 0)
1463 {
1464 while (1)
1465 {
1466 /* Try matching at position POS. */
1467 EMACS_INT this_pos = pos;
1468 EMACS_INT this_pos_byte = pos_byte;
1469 EMACS_INT this_len = len;
1470 unsigned char *p = pat;
1471 if (pos + len > lim || pos_byte + len_byte > lim_byte)
1472 goto stop;
1473
1474 while (this_len > 0)
1475 {
1476 int charlen, buf_charlen;
1477 int pat_ch, buf_ch;
1478
1479 pat_ch = STRING_CHAR_AND_LENGTH (p, charlen);
1480 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte),
1481 buf_charlen);
1482 TRANSLATE (buf_ch, trt, buf_ch);
1483
1484 if (buf_ch != pat_ch)
1485 break;
1486
1487 this_len--;
1488 p += charlen;
1489
1490 this_pos_byte += buf_charlen;
1491 this_pos++;
1492 }
1493
1494 if (this_len == 0)
1495 {
1496 match_byte = this_pos_byte - pos_byte;
1497 pos += len;
1498 pos_byte += match_byte;
1499 break;
1500 }
1501
1502 INC_BOTH (pos, pos_byte);
1503 }
1504
1505 n--;
1506 }
1507 else if (lim > pos)
1508 while (n > 0)
1509 {
1510 while (1)
1511 {
1512 /* Try matching at position POS. */
1513 EMACS_INT this_pos = pos;
1514 EMACS_INT this_len = len;
1515 unsigned char *p = pat;
1516
1517 if (pos + len > lim)
1518 goto stop;
1519
1520 while (this_len > 0)
1521 {
1522 int pat_ch = *p++;
1523 int buf_ch = FETCH_BYTE (this_pos);
1524 TRANSLATE (buf_ch, trt, buf_ch);
1525
1526 if (buf_ch != pat_ch)
1527 break;
1528
1529 this_len--;
1530 this_pos++;
1531 }
1532
1533 if (this_len == 0)
1534 {
1535 match_byte = len;
1536 pos += len;
1537 break;
1538 }
1539
1540 pos++;
1541 }
1542
1543 n--;
1544 }
1545 /* Backwards search. */
1546 else if (lim < pos && multibyte)
1547 while (n < 0)
1548 {
1549 while (1)
1550 {
1551 /* Try matching at position POS. */
1552 EMACS_INT this_pos = pos;
1553 EMACS_INT this_pos_byte = pos_byte;
1554 EMACS_INT this_len = len;
1555 const unsigned char *p = pat + len_byte;
1556
1557 if (this_pos - len < lim || (pos_byte - len_byte) < lim_byte)
1558 goto stop;
1559
1560 while (this_len > 0)
1561 {
1562 int charlen;
1563 int pat_ch, buf_ch;
1564
1565 DEC_BOTH (this_pos, this_pos_byte);
1566 PREV_CHAR_BOUNDARY (p, pat);
1567 pat_ch = STRING_CHAR (p);
1568 buf_ch = STRING_CHAR (BYTE_POS_ADDR (this_pos_byte));
1569 TRANSLATE (buf_ch, trt, buf_ch);
1570
1571 if (buf_ch != pat_ch)
1572 break;
1573
1574 this_len--;
1575 }
1576
1577 if (this_len == 0)
1578 {
1579 match_byte = pos_byte - this_pos_byte;
1580 pos = this_pos;
1581 pos_byte = this_pos_byte;
1582 break;
1583 }
1584
1585 DEC_BOTH (pos, pos_byte);
1586 }
1587
1588 n++;
1589 }
1590 else if (lim < pos)
1591 while (n < 0)
1592 {
1593 while (1)
1594 {
1595 /* Try matching at position POS. */
1596 EMACS_INT this_pos = pos - len;
1597 EMACS_INT this_len = len;
1598 unsigned char *p = pat;
1599
1600 if (this_pos < lim)
1601 goto stop;
1602
1603 while (this_len > 0)
1604 {
1605 int pat_ch = *p++;
1606 int buf_ch = FETCH_BYTE (this_pos);
1607 TRANSLATE (buf_ch, trt, buf_ch);
1608
1609 if (buf_ch != pat_ch)
1610 break;
1611 this_len--;
1612 this_pos++;
1613 }
1614
1615 if (this_len == 0)
1616 {
1617 match_byte = len;
1618 pos -= len;
1619 break;
1620 }
1621
1622 pos--;
1623 }
1624
1625 n++;
1626 }
1627
1628 stop:
1629 if (n == 0)
1630 {
1631 if (forward)
1632 set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
1633 else
1634 set_search_regs (multibyte ? pos_byte : pos, match_byte);
1635
1636 return pos;
1637 }
1638 else if (n > 0)
1639 return -n;
1640 else
1641 return n;
1642 }
1643 \f
1644 /* Do Boyer-Moore search N times for the string BASE_PAT,
1645 whose length is LEN/LEN_BYTE,
1646 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1647 DIRECTION says which direction we search in.
1648 TRT and INVERSE_TRT are translation tables.
1649 Characters in PAT are already translated by TRT.
1650
1651 This kind of search works if all the characters in BASE_PAT that
1652 have nontrivial translation are the same aside from the last byte.
1653 This makes it possible to translate just the last byte of a
1654 character, and do so after just a simple test of the context.
1655 CHAR_BASE is nonzero if there is such a non-ASCII character.
1656
1657 If that criterion is not satisfied, do not call this function. */
1658
1659 static EMACS_INT
1660 boyer_moore (EMACS_INT n, unsigned char *base_pat,
1661 EMACS_INT len, EMACS_INT len_byte,
1662 Lisp_Object trt, Lisp_Object inverse_trt,
1663 EMACS_INT pos, EMACS_INT pos_byte,
1664 EMACS_INT lim, EMACS_INT lim_byte, int char_base)
1665 {
1666 int direction = ((n > 0) ? 1 : -1);
1667 register EMACS_INT dirlen;
1668 EMACS_INT limit;
1669 int stride_for_teases = 0;
1670 int BM_tab[0400];
1671 register unsigned char *cursor, *p_limit;
1672 register EMACS_INT i;
1673 register int j;
1674 unsigned char *pat, *pat_end;
1675 int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
1676
1677 unsigned char simple_translate[0400];
1678 /* These are set to the preceding bytes of a byte to be translated
1679 if char_base is nonzero. As the maximum byte length of a
1680 multibyte character is 5, we have to check at most four previous
1681 bytes. */
1682 int translate_prev_byte1 = 0;
1683 int translate_prev_byte2 = 0;
1684 int translate_prev_byte3 = 0;
1685 int translate_prev_byte4 = 0;
1686
1687 /* The general approach is that we are going to maintain that we know
1688 the first (closest to the present position, in whatever direction
1689 we're searching) character that could possibly be the last
1690 (furthest from present position) character of a valid match. We
1691 advance the state of our knowledge by looking at that character
1692 and seeing whether it indeed matches the last character of the
1693 pattern. If it does, we take a closer look. If it does not, we
1694 move our pointer (to putative last characters) as far as is
1695 logically possible. This amount of movement, which I call a
1696 stride, will be the length of the pattern if the actual character
1697 appears nowhere in the pattern, otherwise it will be the distance
1698 from the last occurrence of that character to the end of the
1699 pattern. If the amount is zero we have a possible match. */
1700
1701 /* Here we make a "mickey mouse" BM table. The stride of the search
1702 is determined only by the last character of the putative match.
1703 If that character does not match, we will stride the proper
1704 distance to propose a match that superimposes it on the last
1705 instance of a character that matches it (per trt), or misses
1706 it entirely if there is none. */
1707
1708 dirlen = len_byte * direction;
1709
1710 /* Record position after the end of the pattern. */
1711 pat_end = base_pat + len_byte;
1712 /* BASE_PAT points to a character that we start scanning from.
1713 It is the first character in a forward search,
1714 the last character in a backward search. */
1715 if (direction < 0)
1716 base_pat = pat_end - 1;
1717
1718 /* A character that does not appear in the pattern induces a
1719 stride equal to the pattern length. */
1720 for (i = 0; i < 0400; i++)
1721 BM_tab[i] = dirlen;
1722
1723 /* We use this for translation, instead of TRT itself.
1724 We fill this in to handle the characters that actually
1725 occur in the pattern. Others don't matter anyway! */
1726 for (i = 0; i < 0400; i++)
1727 simple_translate[i] = i;
1728
1729 if (char_base)
1730 {
1731 /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE. Only a
1732 byte following them are the target of translation. */
1733 unsigned char str[MAX_MULTIBYTE_LENGTH];
1734 int len = CHAR_STRING (char_base, str);
1735
1736 translate_prev_byte1 = str[len - 2];
1737 if (len > 2)
1738 {
1739 translate_prev_byte2 = str[len - 3];
1740 if (len > 3)
1741 {
1742 translate_prev_byte3 = str[len - 4];
1743 if (len > 4)
1744 translate_prev_byte4 = str[len - 5];
1745 }
1746 }
1747 }
1748
1749 i = 0;
1750 while (i != dirlen)
1751 {
1752 unsigned char *ptr = base_pat + i;
1753 i += direction;
1754 if (! NILP (trt))
1755 {
1756 /* If the byte currently looking at is the last of a
1757 character to check case-equivalents, set CH to that
1758 character. An ASCII character and a non-ASCII character
1759 matching with CHAR_BASE are to be checked. */
1760 int ch = -1;
1761
1762 if (ASCII_BYTE_P (*ptr) || ! multibyte)
1763 ch = *ptr;
1764 else if (char_base
1765 && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
1766 {
1767 unsigned char *charstart = ptr - 1;
1768
1769 while (! (CHAR_HEAD_P (*charstart)))
1770 charstart--;
1771 ch = STRING_CHAR (charstart);
1772 if (char_base != (ch & ~0x3F))
1773 ch = -1;
1774 }
1775
1776 if (ch >= 0200)
1777 j = (ch & 0x3F) | 0200;
1778 else
1779 j = *ptr;
1780
1781 if (i == dirlen)
1782 stride_for_teases = BM_tab[j];
1783
1784 BM_tab[j] = dirlen - i;
1785 /* A translation table is accompanied by its inverse -- see */
1786 /* comment following downcase_table for details */
1787 if (ch >= 0)
1788 {
1789 int starting_ch = ch;
1790 int starting_j = j;
1791
1792 while (1)
1793 {
1794 TRANSLATE (ch, inverse_trt, ch);
1795 if (ch >= 0200)
1796 j = (ch & 0x3F) | 0200;
1797 else
1798 j = ch;
1799
1800 /* For all the characters that map into CH,
1801 set up simple_translate to map the last byte
1802 into STARTING_J. */
1803 simple_translate[j] = starting_j;
1804 if (ch == starting_ch)
1805 break;
1806 BM_tab[j] = dirlen - i;
1807 }
1808 }
1809 }
1810 else
1811 {
1812 j = *ptr;
1813
1814 if (i == dirlen)
1815 stride_for_teases = BM_tab[j];
1816 BM_tab[j] = dirlen - i;
1817 }
1818 /* stride_for_teases tells how much to stride if we get a
1819 match on the far character but are subsequently
1820 disappointed, by recording what the stride would have been
1821 for that character if the last character had been
1822 different. */
1823 }
1824 pos_byte += dirlen - ((direction > 0) ? direction : 0);
1825 /* loop invariant - POS_BYTE points at where last char (first
1826 char if reverse) of pattern would align in a possible match. */
1827 while (n != 0)
1828 {
1829 EMACS_INT tail_end;
1830 unsigned char *tail_end_ptr;
1831
1832 /* It's been reported that some (broken) compiler thinks that
1833 Boolean expressions in an arithmetic context are unsigned.
1834 Using an explicit ?1:0 prevents this. */
1835 if ((lim_byte - pos_byte - ((direction > 0) ? 1 : 0)) * direction
1836 < 0)
1837 return (n * (0 - direction));
1838 /* First we do the part we can by pointers (maybe nothing) */
1839 QUIT;
1840 pat = base_pat;
1841 limit = pos_byte - dirlen + direction;
1842 if (direction > 0)
1843 {
1844 limit = BUFFER_CEILING_OF (limit);
1845 /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1846 can take on without hitting edge of buffer or the gap. */
1847 limit = min (limit, pos_byte + 20000);
1848 limit = min (limit, lim_byte - 1);
1849 }
1850 else
1851 {
1852 limit = BUFFER_FLOOR_OF (limit);
1853 /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1854 can take on without hitting edge of buffer or the gap. */
1855 limit = max (limit, pos_byte - 20000);
1856 limit = max (limit, lim_byte);
1857 }
1858 tail_end = BUFFER_CEILING_OF (pos_byte) + 1;
1859 tail_end_ptr = BYTE_POS_ADDR (tail_end);
1860
1861 if ((limit - pos_byte) * direction > 20)
1862 {
1863 unsigned char *p2;
1864
1865 p_limit = BYTE_POS_ADDR (limit);
1866 p2 = (cursor = BYTE_POS_ADDR (pos_byte));
1867 /* In this loop, pos + cursor - p2 is the surrogate for pos. */
1868 while (1) /* use one cursor setting as long as i can */
1869 {
1870 if (direction > 0) /* worth duplicating */
1871 {
1872 while (cursor <= p_limit)
1873 {
1874 if (BM_tab[*cursor] == 0)
1875 goto hit;
1876 cursor += BM_tab[*cursor];
1877 }
1878 }
1879 else
1880 {
1881 while (cursor >= p_limit)
1882 {
1883 if (BM_tab[*cursor] == 0)
1884 goto hit;
1885 cursor += BM_tab[*cursor];
1886 }
1887 }
1888 /* If you are here, cursor is beyond the end of the
1889 searched region. You fail to match within the
1890 permitted region and would otherwise try a character
1891 beyond that region. */
1892 break;
1893
1894 hit:
1895 i = dirlen - direction;
1896 if (! NILP (trt))
1897 {
1898 while ((i -= direction) + direction != 0)
1899 {
1900 int ch;
1901 cursor -= direction;
1902 /* Translate only the last byte of a character. */
1903 if (! multibyte
1904 || ((cursor == tail_end_ptr
1905 || CHAR_HEAD_P (cursor[1]))
1906 && (CHAR_HEAD_P (cursor[0])
1907 /* Check if this is the last byte of
1908 a translable character. */
1909 || (translate_prev_byte1 == cursor[-1]
1910 && (CHAR_HEAD_P (translate_prev_byte1)
1911 || (translate_prev_byte2 == cursor[-2]
1912 && (CHAR_HEAD_P (translate_prev_byte2)
1913 || (translate_prev_byte3 == cursor[-3]))))))))
1914 ch = simple_translate[*cursor];
1915 else
1916 ch = *cursor;
1917 if (pat[i] != ch)
1918 break;
1919 }
1920 }
1921 else
1922 {
1923 while ((i -= direction) + direction != 0)
1924 {
1925 cursor -= direction;
1926 if (pat[i] != *cursor)
1927 break;
1928 }
1929 }
1930 cursor += dirlen - i - direction; /* fix cursor */
1931 if (i + direction == 0)
1932 {
1933 EMACS_INT position, start, end;
1934
1935 cursor -= direction;
1936
1937 position = pos_byte + cursor - p2 + ((direction > 0)
1938 ? 1 - len_byte : 0);
1939 set_search_regs (position, len_byte);
1940
1941 if (NILP (Vinhibit_changing_match_data))
1942 {
1943 start = search_regs.start[0];
1944 end = search_regs.end[0];
1945 }
1946 else
1947 /* If Vinhibit_changing_match_data is non-nil,
1948 search_regs will not be changed. So let's
1949 compute start and end here. */
1950 {
1951 start = BYTE_TO_CHAR (position);
1952 end = BYTE_TO_CHAR (position + len_byte);
1953 }
1954
1955 if ((n -= direction) != 0)
1956 cursor += dirlen; /* to resume search */
1957 else
1958 return direction > 0 ? end : start;
1959 }
1960 else
1961 cursor += stride_for_teases; /* <sigh> we lose - */
1962 }
1963 pos_byte += cursor - p2;
1964 }
1965 else
1966 /* Now we'll pick up a clump that has to be done the hard
1967 way because it covers a discontinuity. */
1968 {
1969 limit = ((direction > 0)
1970 ? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
1971 : BUFFER_FLOOR_OF (pos_byte - dirlen - 1));
1972 limit = ((direction > 0)
1973 ? min (limit + len_byte, lim_byte - 1)
1974 : max (limit - len_byte, lim_byte));
1975 /* LIMIT is now the last value POS_BYTE can have
1976 and still be valid for a possible match. */
1977 while (1)
1978 {
1979 /* This loop can be coded for space rather than
1980 speed because it will usually run only once.
1981 (the reach is at most len + 21, and typically
1982 does not exceed len). */
1983 while ((limit - pos_byte) * direction >= 0)
1984 {
1985 int ch = FETCH_BYTE (pos_byte);
1986 if (BM_tab[ch] == 0)
1987 goto hit2;
1988 pos_byte += BM_tab[ch];
1989 }
1990 break; /* ran off the end */
1991
1992 hit2:
1993 /* Found what might be a match. */
1994 i = dirlen - direction;
1995 while ((i -= direction) + direction != 0)
1996 {
1997 int ch;
1998 unsigned char *ptr;
1999 pos_byte -= direction;
2000 ptr = BYTE_POS_ADDR (pos_byte);
2001 /* Translate only the last byte of a character. */
2002 if (! multibyte
2003 || ((ptr == tail_end_ptr
2004 || CHAR_HEAD_P (ptr[1]))
2005 && (CHAR_HEAD_P (ptr[0])
2006 /* Check if this is the last byte of a
2007 translable character. */
2008 || (translate_prev_byte1 == ptr[-1]
2009 && (CHAR_HEAD_P (translate_prev_byte1)
2010 || (translate_prev_byte2 == ptr[-2]
2011 && (CHAR_HEAD_P (translate_prev_byte2)
2012 || translate_prev_byte3 == ptr[-3])))))))
2013 ch = simple_translate[*ptr];
2014 else
2015 ch = *ptr;
2016 if (pat[i] != ch)
2017 break;
2018 }
2019 /* Above loop has moved POS_BYTE part or all the way
2020 back to the first pos (last pos if reverse).
2021 Set it once again at the last (first if reverse) char. */
2022 pos_byte += dirlen - i - direction;
2023 if (i + direction == 0)
2024 {
2025 EMACS_INT position, start, end;
2026 pos_byte -= direction;
2027
2028 position = pos_byte + ((direction > 0) ? 1 - len_byte : 0);
2029 set_search_regs (position, len_byte);
2030
2031 if (NILP (Vinhibit_changing_match_data))
2032 {
2033 start = search_regs.start[0];
2034 end = search_regs.end[0];
2035 }
2036 else
2037 /* If Vinhibit_changing_match_data is non-nil,
2038 search_regs will not be changed. So let's
2039 compute start and end here. */
2040 {
2041 start = BYTE_TO_CHAR (position);
2042 end = BYTE_TO_CHAR (position + len_byte);
2043 }
2044
2045 if ((n -= direction) != 0)
2046 pos_byte += dirlen; /* to resume search */
2047 else
2048 return direction > 0 ? end : start;
2049 }
2050 else
2051 pos_byte += stride_for_teases;
2052 }
2053 }
2054 /* We have done one clump. Can we continue? */
2055 if ((lim_byte - pos_byte) * direction < 0)
2056 return ((0 - n) * direction);
2057 }
2058 return BYTE_TO_CHAR (pos_byte);
2059 }
2060
2061 /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES
2062 for the overall match just found in the current buffer.
2063 Also clear out the match data for registers 1 and up. */
2064
2065 static void
2066 set_search_regs (EMACS_INT beg_byte, EMACS_INT nbytes)
2067 {
2068 int i;
2069
2070 if (!NILP (Vinhibit_changing_match_data))
2071 return;
2072
2073 /* Make sure we have registers in which to store
2074 the match position. */
2075 if (search_regs.num_regs == 0)
2076 {
2077 search_regs.start = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2078 search_regs.end = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2079 search_regs.num_regs = 2;
2080 }
2081
2082 /* Clear out the other registers. */
2083 for (i = 1; i < search_regs.num_regs; i++)
2084 {
2085 search_regs.start[i] = -1;
2086 search_regs.end[i] = -1;
2087 }
2088
2089 search_regs.start[0] = BYTE_TO_CHAR (beg_byte);
2090 search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes);
2091 XSETBUFFER (last_thing_searched, current_buffer);
2092 }
2093 \f
2094 /* Given STRING, a string of words separated by word delimiters,
2095 compute a regexp that matches those exact words separated by
2096 arbitrary punctuation. If LAX is nonzero, the end of the string
2097 need not match a word boundary unless it ends in whitespace. */
2098
2099 static Lisp_Object
2100 wordify (Lisp_Object string, int lax)
2101 {
2102 register unsigned char *p, *o;
2103 register EMACS_INT i, i_byte, len, punct_count = 0, word_count = 0;
2104 Lisp_Object val;
2105 int prev_c = 0;
2106 EMACS_INT adjust;
2107 int whitespace_at_end;
2108
2109 CHECK_STRING (string);
2110 p = SDATA (string);
2111 len = SCHARS (string);
2112
2113 for (i = 0, i_byte = 0; i < len; )
2114 {
2115 int c;
2116
2117 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2118
2119 if (SYNTAX (c) != Sword)
2120 {
2121 punct_count++;
2122 if (i > 0 && SYNTAX (prev_c) == Sword)
2123 word_count++;
2124 }
2125
2126 prev_c = c;
2127 }
2128
2129 if (SYNTAX (prev_c) == Sword)
2130 {
2131 word_count++;
2132 whitespace_at_end = 0;
2133 }
2134 else
2135 whitespace_at_end = 1;
2136
2137 if (!word_count)
2138 return empty_unibyte_string;
2139
2140 adjust = - punct_count + 5 * (word_count - 1)
2141 + ((lax && !whitespace_at_end) ? 2 : 4);
2142 if (STRING_MULTIBYTE (string))
2143 val = make_uninit_multibyte_string (len + adjust,
2144 SBYTES (string)
2145 + adjust);
2146 else
2147 val = make_uninit_string (len + adjust);
2148
2149 o = SDATA (val);
2150 *o++ = '\\';
2151 *o++ = 'b';
2152 prev_c = 0;
2153
2154 for (i = 0, i_byte = 0; i < len; )
2155 {
2156 int c;
2157 EMACS_INT i_byte_orig = i_byte;
2158
2159 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2160
2161 if (SYNTAX (c) == Sword)
2162 {
2163 memcpy (o, SDATA (string) + i_byte_orig, i_byte - i_byte_orig);
2164 o += i_byte - i_byte_orig;
2165 }
2166 else if (i > 0 && SYNTAX (prev_c) == Sword && --word_count)
2167 {
2168 *o++ = '\\';
2169 *o++ = 'W';
2170 *o++ = '\\';
2171 *o++ = 'W';
2172 *o++ = '*';
2173 }
2174
2175 prev_c = c;
2176 }
2177
2178 if (!lax || whitespace_at_end)
2179 {
2180 *o++ = '\\';
2181 *o++ = 'b';
2182 }
2183
2184 return val;
2185 }
2186 \f
2187 DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 4,
2188 "MSearch backward: ",
2189 doc: /* Search backward from point for STRING.
2190 Set point to the beginning of the occurrence found, and return point.
2191 An optional second argument bounds the search; it is a buffer position.
2192 The match found must not extend before that position.
2193 Optional third argument, if t, means if fail just return nil (no error).
2194 If not nil and not t, position at limit of search and return nil.
2195 Optional fourth argument is repeat count--search for successive occurrences.
2196
2197 Search case-sensitivity is determined by the value of the variable
2198 `case-fold-search', which see.
2199
2200 See also the functions `match-beginning', `match-end' and `replace-match'. */)
2201 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2202 {
2203 return search_command (string, bound, noerror, count, -1, 0, 0);
2204 }
2205
2206 DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 4, "MSearch: ",
2207 doc: /* Search forward from point for STRING.
2208 Set point to the end of the occurrence found, and return point.
2209 An optional second argument bounds the search; it is a buffer position.
2210 The match found must not extend after that position. A value of nil is
2211 equivalent to (point-max).
2212 Optional third argument, if t, means if fail just return nil (no error).
2213 If not nil and not t, move to limit of search and return nil.
2214 Optional fourth argument is repeat count--search for successive occurrences.
2215
2216 Search case-sensitivity is determined by the value of the variable
2217 `case-fold-search', which see.
2218
2219 See also the functions `match-beginning', `match-end' and `replace-match'. */)
2220 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2221 {
2222 return search_command (string, bound, noerror, count, 1, 0, 0);
2223 }
2224
2225 DEFUN ("word-search-backward", Fword_search_backward, Sword_search_backward, 1, 4,
2226 "sWord search backward: ",
2227 doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2228 Set point to the beginning of the occurrence found, and return point.
2229 An optional second argument bounds the search; it is a buffer position.
2230 The match found must not extend before that position.
2231 Optional third argument, if t, means if fail just return nil (no error).
2232 If not nil and not t, move to limit of search and return nil.
2233 Optional fourth argument is repeat count--search for successive occurrences. */)
2234 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2235 {
2236 return search_command (wordify (string, 0), bound, noerror, count, -1, 1, 0);
2237 }
2238
2239 DEFUN ("word-search-forward", Fword_search_forward, Sword_search_forward, 1, 4,
2240 "sWord search: ",
2241 doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2242 Set point to the end of the occurrence found, and return point.
2243 An optional second argument bounds the search; it is a buffer position.
2244 The match found must not extend after that position.
2245 Optional third argument, if t, means if fail just return nil (no error).
2246 If not nil and not t, move to limit of search and return nil.
2247 Optional fourth argument is repeat count--search for successive occurrences. */)
2248 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2249 {
2250 return search_command (wordify (string, 0), bound, noerror, count, 1, 1, 0);
2251 }
2252
2253 DEFUN ("word-search-backward-lax", Fword_search_backward_lax, Sword_search_backward_lax, 1, 4,
2254 "sWord search backward: ",
2255 doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2256 Set point to the beginning of the occurrence found, and return point.
2257
2258 Unlike `word-search-backward', the end of STRING need not match a word
2259 boundary unless it ends in whitespace.
2260
2261 An optional second argument bounds the search; it is a buffer position.
2262 The match found must not extend before that position.
2263 Optional third argument, if t, means if fail just return nil (no error).
2264 If not nil and not t, move to limit of search and return nil.
2265 Optional fourth argument is repeat count--search for successive occurrences. */)
2266 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2267 {
2268 return search_command (wordify (string, 1), bound, noerror, count, -1, 1, 0);
2269 }
2270
2271 DEFUN ("word-search-forward-lax", Fword_search_forward_lax, Sword_search_forward_lax, 1, 4,
2272 "sWord search: ",
2273 doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2274 Set point to the end of the occurrence found, and return point.
2275
2276 Unlike `word-search-forward', the end of STRING need not match a word
2277 boundary unless it ends in whitespace.
2278
2279 An optional second argument bounds the search; it is a buffer position.
2280 The match found must not extend after that position.
2281 Optional third argument, if t, means if fail just return nil (no error).
2282 If not nil and not t, move to limit of search and return nil.
2283 Optional fourth argument is repeat count--search for successive occurrences. */)
2284 (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2285 {
2286 return search_command (wordify (string, 1), bound, noerror, count, 1, 1, 0);
2287 }
2288
2289 DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4,
2290 "sRE search backward: ",
2291 doc: /* Search backward from point for match for regular expression REGEXP.
2292 Set point to the beginning of the match, and return point.
2293 The match found is the one starting last in the buffer
2294 and yet ending before the origin of the search.
2295 An optional second argument bounds the search; it is a buffer position.
2296 The match found must start at or after that position.
2297 Optional third argument, if t, means if fail just return nil (no error).
2298 If not nil and not t, move to limit of search and return nil.
2299 Optional fourth argument is repeat count--search for successive occurrences.
2300 See also the functions `match-beginning', `match-end', `match-string',
2301 and `replace-match'. */)
2302 (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2303 {
2304 return search_command (regexp, bound, noerror, count, -1, 1, 0);
2305 }
2306
2307 DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 4,
2308 "sRE search: ",
2309 doc: /* Search forward from point for regular expression REGEXP.
2310 Set point to the end of the occurrence found, and return point.
2311 An optional second argument bounds the search; it is a buffer position.
2312 The match found must not extend after that position.
2313 Optional third argument, if t, means if fail just return nil (no error).
2314 If not nil and not t, move to limit of search and return nil.
2315 Optional fourth argument is repeat count--search for successive occurrences.
2316 See also the functions `match-beginning', `match-end', `match-string',
2317 and `replace-match'. */)
2318 (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2319 {
2320 return search_command (regexp, bound, noerror, count, 1, 1, 0);
2321 }
2322
2323 DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, 1, 4,
2324 "sPosix search backward: ",
2325 doc: /* Search backward from point for match for regular expression REGEXP.
2326 Find the longest match in accord with Posix regular expression rules.
2327 Set point to the beginning of the match, and return point.
2328 The match found is the one starting last in the buffer
2329 and yet ending before the origin of the search.
2330 An optional second argument bounds the search; it is a buffer position.
2331 The match found must start at or after that position.
2332 Optional third argument, if t, means if fail just return nil (no error).
2333 If not nil and not t, move to limit of search and return nil.
2334 Optional fourth argument is repeat count--search for successive occurrences.
2335 See also the functions `match-beginning', `match-end', `match-string',
2336 and `replace-match'. */)
2337 (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2338 {
2339 return search_command (regexp, bound, noerror, count, -1, 1, 1);
2340 }
2341
2342 DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, 4,
2343 "sPosix search: ",
2344 doc: /* Search forward from point for regular expression REGEXP.
2345 Find the longest match in accord with Posix regular expression rules.
2346 Set point to the end of the occurrence found, and return point.
2347 An optional second argument bounds the search; it is a buffer position.
2348 The match found must not extend after that position.
2349 Optional third argument, if t, means if fail just return nil (no error).
2350 If not nil and not t, move to limit of search and return nil.
2351 Optional fourth argument is repeat count--search for successive occurrences.
2352 See also the functions `match-beginning', `match-end', `match-string',
2353 and `replace-match'. */)
2354 (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2355 {
2356 return search_command (regexp, bound, noerror, count, 1, 1, 1);
2357 }
2358 \f
2359 DEFUN ("replace-match", Freplace_match, Sreplace_match, 1, 5, 0,
2360 doc: /* Replace text matched by last search with NEWTEXT.
2361 Leave point at the end of the replacement text.
2362
2363 If second arg FIXEDCASE is non-nil, do not alter case of replacement text.
2364 Otherwise maybe capitalize the whole text, or maybe just word initials,
2365 based on the replaced text.
2366 If the replaced text has only capital letters
2367 and has at least one multiletter word, convert NEWTEXT to all caps.
2368 Otherwise if all words are capitalized in the replaced text,
2369 capitalize each word in NEWTEXT.
2370
2371 If third arg LITERAL is non-nil, insert NEWTEXT literally.
2372 Otherwise treat `\\' as special:
2373 `\\&' in NEWTEXT means substitute original matched text.
2374 `\\N' means substitute what matched the Nth `\\(...\\)'.
2375 If Nth parens didn't match, substitute nothing.
2376 `\\\\' means insert one `\\'.
2377 Case conversion does not apply to these substitutions.
2378
2379 FIXEDCASE and LITERAL are optional arguments.
2380
2381 The optional fourth argument STRING can be a string to modify.
2382 This is meaningful when the previous match was done against STRING,
2383 using `string-match'. When used this way, `replace-match'
2384 creates and returns a new string made by copying STRING and replacing
2385 the part of STRING that was matched.
2386
2387 The optional fifth argument SUBEXP specifies a subexpression;
2388 it says to replace just that subexpression with NEWTEXT,
2389 rather than replacing the entire matched text.
2390 This is, in a vague sense, the inverse of using `\\N' in NEWTEXT;
2391 `\\N' copies subexp N into NEWTEXT, but using N as SUBEXP puts
2392 NEWTEXT in place of subexp N.
2393 This is useful only after a regular expression search or match,
2394 since only regular expressions have distinguished subexpressions. */)
2395 (Lisp_Object newtext, Lisp_Object fixedcase, Lisp_Object literal, Lisp_Object string, Lisp_Object subexp)
2396 {
2397 enum { nochange, all_caps, cap_initial } case_action;
2398 register EMACS_INT pos, pos_byte;
2399 int some_multiletter_word;
2400 int some_lowercase;
2401 int some_uppercase;
2402 int some_nonuppercase_initial;
2403 register int c, prevc;
2404 int sub;
2405 EMACS_INT opoint, newpoint;
2406
2407 CHECK_STRING (newtext);
2408
2409 if (! NILP (string))
2410 CHECK_STRING (string);
2411
2412 case_action = nochange; /* We tried an initialization */
2413 /* but some C compilers blew it */
2414
2415 if (search_regs.num_regs <= 0)
2416 error ("`replace-match' called before any match found");
2417
2418 if (NILP (subexp))
2419 sub = 0;
2420 else
2421 {
2422 CHECK_NUMBER (subexp);
2423 sub = XINT (subexp);
2424 if (sub < 0 || sub >= search_regs.num_regs)
2425 args_out_of_range (subexp, make_number (search_regs.num_regs));
2426 }
2427
2428 if (NILP (string))
2429 {
2430 if (search_regs.start[sub] < BEGV
2431 || search_regs.start[sub] > search_regs.end[sub]
2432 || search_regs.end[sub] > ZV)
2433 args_out_of_range (make_number (search_regs.start[sub]),
2434 make_number (search_regs.end[sub]));
2435 }
2436 else
2437 {
2438 if (search_regs.start[sub] < 0
2439 || search_regs.start[sub] > search_regs.end[sub]
2440 || search_regs.end[sub] > SCHARS (string))
2441 args_out_of_range (make_number (search_regs.start[sub]),
2442 make_number (search_regs.end[sub]));
2443 }
2444
2445 if (NILP (fixedcase))
2446 {
2447 /* Decide how to casify by examining the matched text. */
2448 EMACS_INT last;
2449
2450 pos = search_regs.start[sub];
2451 last = search_regs.end[sub];
2452
2453 if (NILP (string))
2454 pos_byte = CHAR_TO_BYTE (pos);
2455 else
2456 pos_byte = string_char_to_byte (string, pos);
2457
2458 prevc = '\n';
2459 case_action = all_caps;
2460
2461 /* some_multiletter_word is set nonzero if any original word
2462 is more than one letter long. */
2463 some_multiletter_word = 0;
2464 some_lowercase = 0;
2465 some_nonuppercase_initial = 0;
2466 some_uppercase = 0;
2467
2468 while (pos < last)
2469 {
2470 if (NILP (string))
2471 {
2472 c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
2473 INC_BOTH (pos, pos_byte);
2474 }
2475 else
2476 FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
2477
2478 if (LOWERCASEP (c))
2479 {
2480 /* Cannot be all caps if any original char is lower case */
2481
2482 some_lowercase = 1;
2483 if (SYNTAX (prevc) != Sword)
2484 some_nonuppercase_initial = 1;
2485 else
2486 some_multiletter_word = 1;
2487 }
2488 else if (UPPERCASEP (c))
2489 {
2490 some_uppercase = 1;
2491 if (SYNTAX (prevc) != Sword)
2492 ;
2493 else
2494 some_multiletter_word = 1;
2495 }
2496 else
2497 {
2498 /* If the initial is a caseless word constituent,
2499 treat that like a lowercase initial. */
2500 if (SYNTAX (prevc) != Sword)
2501 some_nonuppercase_initial = 1;
2502 }
2503
2504 prevc = c;
2505 }
2506
2507 /* Convert to all caps if the old text is all caps
2508 and has at least one multiletter word. */
2509 if (! some_lowercase && some_multiletter_word)
2510 case_action = all_caps;
2511 /* Capitalize each word, if the old text has all capitalized words. */
2512 else if (!some_nonuppercase_initial && some_multiletter_word)
2513 case_action = cap_initial;
2514 else if (!some_nonuppercase_initial && some_uppercase)
2515 /* Should x -> yz, operating on X, give Yz or YZ?
2516 We'll assume the latter. */
2517 case_action = all_caps;
2518 else
2519 case_action = nochange;
2520 }
2521
2522 /* Do replacement in a string. */
2523 if (!NILP (string))
2524 {
2525 Lisp_Object before, after;
2526
2527 before = Fsubstring (string, make_number (0),
2528 make_number (search_regs.start[sub]));
2529 after = Fsubstring (string, make_number (search_regs.end[sub]), Qnil);
2530
2531 /* Substitute parts of the match into NEWTEXT
2532 if desired. */
2533 if (NILP (literal))
2534 {
2535 EMACS_INT lastpos = 0;
2536 EMACS_INT lastpos_byte = 0;
2537 /* We build up the substituted string in ACCUM. */
2538 Lisp_Object accum;
2539 Lisp_Object middle;
2540 int length = SBYTES (newtext);
2541
2542 accum = Qnil;
2543
2544 for (pos_byte = 0, pos = 0; pos_byte < length;)
2545 {
2546 EMACS_INT substart = -1;
2547 EMACS_INT subend = 0;
2548 int delbackslash = 0;
2549
2550 FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2551
2552 if (c == '\\')
2553 {
2554 FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2555
2556 if (c == '&')
2557 {
2558 substart = search_regs.start[sub];
2559 subend = search_regs.end[sub];
2560 }
2561 else if (c >= '1' && c <= '9')
2562 {
2563 if (search_regs.start[c - '0'] >= 0
2564 && c <= search_regs.num_regs + '0')
2565 {
2566 substart = search_regs.start[c - '0'];
2567 subend = search_regs.end[c - '0'];
2568 }
2569 else
2570 {
2571 /* If that subexp did not match,
2572 replace \\N with nothing. */
2573 substart = 0;
2574 subend = 0;
2575 }
2576 }
2577 else if (c == '\\')
2578 delbackslash = 1;
2579 else
2580 error ("Invalid use of `\\' in replacement text");
2581 }
2582 if (substart >= 0)
2583 {
2584 if (pos - 2 != lastpos)
2585 middle = substring_both (newtext, lastpos,
2586 lastpos_byte,
2587 pos - 2, pos_byte - 2);
2588 else
2589 middle = Qnil;
2590 accum = concat3 (accum, middle,
2591 Fsubstring (string,
2592 make_number (substart),
2593 make_number (subend)));
2594 lastpos = pos;
2595 lastpos_byte = pos_byte;
2596 }
2597 else if (delbackslash)
2598 {
2599 middle = substring_both (newtext, lastpos,
2600 lastpos_byte,
2601 pos - 1, pos_byte - 1);
2602
2603 accum = concat2 (accum, middle);
2604 lastpos = pos;
2605 lastpos_byte = pos_byte;
2606 }
2607 }
2608
2609 if (pos != lastpos)
2610 middle = substring_both (newtext, lastpos,
2611 lastpos_byte,
2612 pos, pos_byte);
2613 else
2614 middle = Qnil;
2615
2616 newtext = concat2 (accum, middle);
2617 }
2618
2619 /* Do case substitution in NEWTEXT if desired. */
2620 if (case_action == all_caps)
2621 newtext = Fupcase (newtext);
2622 else if (case_action == cap_initial)
2623 newtext = Fupcase_initials (newtext);
2624
2625 return concat3 (before, newtext, after);
2626 }
2627
2628 /* Record point, then move (quietly) to the start of the match. */
2629 if (PT >= search_regs.end[sub])
2630 opoint = PT - ZV;
2631 else if (PT > search_regs.start[sub])
2632 opoint = search_regs.end[sub] - ZV;
2633 else
2634 opoint = PT;
2635
2636 /* If we want non-literal replacement,
2637 perform substitution on the replacement string. */
2638 if (NILP (literal))
2639 {
2640 EMACS_INT length = SBYTES (newtext);
2641 unsigned char *substed;
2642 EMACS_INT substed_alloc_size, substed_len;
2643 int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters);
2644 int str_multibyte = STRING_MULTIBYTE (newtext);
2645 Lisp_Object rev_tbl;
2646 int really_changed = 0;
2647
2648 rev_tbl = Qnil;
2649
2650 substed_alloc_size = length * 2 + 100;
2651 substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
2652 substed_len = 0;
2653
2654 /* Go thru NEWTEXT, producing the actual text to insert in
2655 SUBSTED while adjusting multibyteness to that of the current
2656 buffer. */
2657
2658 for (pos_byte = 0, pos = 0; pos_byte < length;)
2659 {
2660 unsigned char str[MAX_MULTIBYTE_LENGTH];
2661 const unsigned char *add_stuff = NULL;
2662 EMACS_INT add_len = 0;
2663 int idx = -1;
2664
2665 if (str_multibyte)
2666 {
2667 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, pos, pos_byte);
2668 if (!buf_multibyte)
2669 c = multibyte_char_to_unibyte (c, rev_tbl);
2670 }
2671 else
2672 {
2673 /* Note that we don't have to increment POS. */
2674 c = SREF (newtext, pos_byte++);
2675 if (buf_multibyte)
2676 MAKE_CHAR_MULTIBYTE (c);
2677 }
2678
2679 /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED,
2680 or set IDX to a match index, which means put that part
2681 of the buffer text into SUBSTED. */
2682
2683 if (c == '\\')
2684 {
2685 really_changed = 1;
2686
2687 if (str_multibyte)
2688 {
2689 FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
2690 pos, pos_byte);
2691 if (!buf_multibyte && !ASCII_CHAR_P (c))
2692 c = multibyte_char_to_unibyte (c, rev_tbl);
2693 }
2694 else
2695 {
2696 c = SREF (newtext, pos_byte++);
2697 if (buf_multibyte)
2698 MAKE_CHAR_MULTIBYTE (c);
2699 }
2700
2701 if (c == '&')
2702 idx = sub;
2703 else if (c >= '1' && c <= '9' && c <= search_regs.num_regs + '0')
2704 {
2705 if (search_regs.start[c - '0'] >= 1)
2706 idx = c - '0';
2707 }
2708 else if (c == '\\')
2709 add_len = 1, add_stuff = "\\";
2710 else
2711 {
2712 xfree (substed);
2713 error ("Invalid use of `\\' in replacement text");
2714 }
2715 }
2716 else
2717 {
2718 add_len = CHAR_STRING (c, str);
2719 add_stuff = str;
2720 }
2721
2722 /* If we want to copy part of a previous match,
2723 set up ADD_STUFF and ADD_LEN to point to it. */
2724 if (idx >= 0)
2725 {
2726 EMACS_INT begbyte = CHAR_TO_BYTE (search_regs.start[idx]);
2727 add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte;
2728 if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx])
2729 move_gap (search_regs.start[idx]);
2730 add_stuff = BYTE_POS_ADDR (begbyte);
2731 }
2732
2733 /* Now the stuff we want to add to SUBSTED
2734 is invariably ADD_LEN bytes starting at ADD_STUFF. */
2735
2736 /* Make sure SUBSTED is big enough. */
2737 if (substed_len + add_len >= substed_alloc_size)
2738 {
2739 substed_alloc_size = substed_len + add_len + 500;
2740 substed = (unsigned char *) xrealloc (substed,
2741 substed_alloc_size + 1);
2742 }
2743
2744 /* Now add to the end of SUBSTED. */
2745 if (add_stuff)
2746 {
2747 memcpy (substed + substed_len, add_stuff, add_len);
2748 substed_len += add_len;
2749 }
2750 }
2751
2752 if (really_changed)
2753 {
2754 if (buf_multibyte)
2755 {
2756 EMACS_INT nchars =
2757 multibyte_chars_in_text (substed, substed_len);
2758
2759 newtext = make_multibyte_string (substed, nchars, substed_len);
2760 }
2761 else
2762 newtext = make_unibyte_string (substed, substed_len);
2763 }
2764 xfree (substed);
2765 }
2766
2767 /* Replace the old text with the new in the cleanest possible way. */
2768 replace_range (search_regs.start[sub], search_regs.end[sub],
2769 newtext, 1, 0, 1);
2770 newpoint = search_regs.start[sub] + SCHARS (newtext);
2771
2772 if (case_action == all_caps)
2773 Fupcase_region (make_number (search_regs.start[sub]),
2774 make_number (newpoint));
2775 else if (case_action == cap_initial)
2776 Fupcase_initials_region (make_number (search_regs.start[sub]),
2777 make_number (newpoint));
2778
2779 /* Adjust search data for this change. */
2780 {
2781 EMACS_INT oldend = search_regs.end[sub];
2782 EMACS_INT oldstart = search_regs.start[sub];
2783 EMACS_INT change = newpoint - search_regs.end[sub];
2784 int i;
2785
2786 for (i = 0; i < search_regs.num_regs; i++)
2787 {
2788 if (search_regs.start[i] >= oldend)
2789 search_regs.start[i] += change;
2790 else if (search_regs.start[i] > oldstart)
2791 search_regs.start[i] = oldstart;
2792 if (search_regs.end[i] >= oldend)
2793 search_regs.end[i] += change;
2794 else if (search_regs.end[i] > oldstart)
2795 search_regs.end[i] = oldstart;
2796 }
2797 }
2798
2799 /* Put point back where it was in the text. */
2800 if (opoint <= 0)
2801 TEMP_SET_PT (opoint + ZV);
2802 else
2803 TEMP_SET_PT (opoint);
2804
2805 /* Now move point "officially" to the start of the inserted replacement. */
2806 move_if_not_intangible (newpoint);
2807
2808 return Qnil;
2809 }
2810 \f
2811 static Lisp_Object
2812 match_limit (Lisp_Object num, int beginningp)
2813 {
2814 register int n;
2815
2816 CHECK_NUMBER (num);
2817 n = XINT (num);
2818 if (n < 0)
2819 args_out_of_range (num, make_number (0));
2820 if (search_regs.num_regs <= 0)
2821 error ("No match data, because no search succeeded");
2822 if (n >= search_regs.num_regs
2823 || search_regs.start[n] < 0)
2824 return Qnil;
2825 return (make_number ((beginningp) ? search_regs.start[n]
2826 : search_regs.end[n]));
2827 }
2828
2829 DEFUN ("match-beginning", Fmatch_beginning, Smatch_beginning, 1, 1, 0,
2830 doc: /* Return position of start of text matched by last search.
2831 SUBEXP, a number, specifies which parenthesized expression in the last
2832 regexp.
2833 Value is nil if SUBEXPth pair didn't match, or there were less than
2834 SUBEXP pairs.
2835 Zero means the entire text matched by the whole regexp or whole string. */)
2836 (Lisp_Object subexp)
2837 {
2838 return match_limit (subexp, 1);
2839 }
2840
2841 DEFUN ("match-end", Fmatch_end, Smatch_end, 1, 1, 0,
2842 doc: /* Return position of end of text matched by last search.
2843 SUBEXP, a number, specifies which parenthesized expression in the last
2844 regexp.
2845 Value is nil if SUBEXPth pair didn't match, or there were less than
2846 SUBEXP pairs.
2847 Zero means the entire text matched by the whole regexp or whole string. */)
2848 (Lisp_Object subexp)
2849 {
2850 return match_limit (subexp, 0);
2851 }
2852
2853 DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0,
2854 doc: /* Return a list containing all info on what the last search matched.
2855 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'.
2856 All the elements are markers or nil (nil if the Nth pair didn't match)
2857 if the last match was on a buffer; integers or nil if a string was matched.
2858 Use `set-match-data' to reinstate the data in this list.
2859
2860 If INTEGERS (the optional first argument) is non-nil, always use
2861 integers \(rather than markers) to represent buffer positions. In
2862 this case, and if the last match was in a buffer, the buffer will get
2863 stored as one additional element at the end of the list.
2864
2865 If REUSE is a list, reuse it as part of the value. If REUSE is long
2866 enough to hold all the values, and if INTEGERS is non-nil, no consing
2867 is done.
2868
2869 If optional third arg RESEAT is non-nil, any previous markers on the
2870 REUSE list will be modified to point to nowhere.
2871
2872 Return value is undefined if the last search failed. */)
2873 (Lisp_Object integers, Lisp_Object reuse, Lisp_Object reseat)
2874 {
2875 Lisp_Object tail, prev;
2876 Lisp_Object *data;
2877 int i, len;
2878
2879 if (!NILP (reseat))
2880 for (tail = reuse; CONSP (tail); tail = XCDR (tail))
2881 if (MARKERP (XCAR (tail)))
2882 {
2883 unchain_marker (XMARKER (XCAR (tail)));
2884 XSETCAR (tail, Qnil);
2885 }
2886
2887 if (NILP (last_thing_searched))
2888 return Qnil;
2889
2890 prev = Qnil;
2891
2892 data = (Lisp_Object *) alloca ((2 * search_regs.num_regs + 1)
2893 * sizeof (Lisp_Object));
2894
2895 len = 0;
2896 for (i = 0; i < search_regs.num_regs; i++)
2897 {
2898 int start = search_regs.start[i];
2899 if (start >= 0)
2900 {
2901 if (EQ (last_thing_searched, Qt)
2902 || ! NILP (integers))
2903 {
2904 XSETFASTINT (data[2 * i], start);
2905 XSETFASTINT (data[2 * i + 1], search_regs.end[i]);
2906 }
2907 else if (BUFFERP (last_thing_searched))
2908 {
2909 data[2 * i] = Fmake_marker ();
2910 Fset_marker (data[2 * i],
2911 make_number (start),
2912 last_thing_searched);
2913 data[2 * i + 1] = Fmake_marker ();
2914 Fset_marker (data[2 * i + 1],
2915 make_number (search_regs.end[i]),
2916 last_thing_searched);
2917 }
2918 else
2919 /* last_thing_searched must always be Qt, a buffer, or Qnil. */
2920 abort ();
2921
2922 len = 2 * i + 2;
2923 }
2924 else
2925 data[2 * i] = data[2 * i + 1] = Qnil;
2926 }
2927
2928 if (BUFFERP (last_thing_searched) && !NILP (integers))
2929 {
2930 data[len] = last_thing_searched;
2931 len++;
2932 }
2933
2934 /* If REUSE is not usable, cons up the values and return them. */
2935 if (! CONSP (reuse))
2936 return Flist (len, data);
2937
2938 /* If REUSE is a list, store as many value elements as will fit
2939 into the elements of REUSE. */
2940 for (i = 0, tail = reuse; CONSP (tail);
2941 i++, tail = XCDR (tail))
2942 {
2943 if (i < len)
2944 XSETCAR (tail, data[i]);
2945 else
2946 XSETCAR (tail, Qnil);
2947 prev = tail;
2948 }
2949
2950 /* If we couldn't fit all value elements into REUSE,
2951 cons up the rest of them and add them to the end of REUSE. */
2952 if (i < len)
2953 XSETCDR (prev, Flist (len - i, data + i));
2954
2955 return reuse;
2956 }
2957
2958 /* We used to have an internal use variant of `reseat' described as:
2959
2960 If RESEAT is `evaporate', put the markers back on the free list
2961 immediately. No other references to the markers must exist in this
2962 case, so it is used only internally on the unwind stack and
2963 save-match-data from Lisp.
2964
2965 But it was ill-conceived: those supposedly-internal markers get exposed via
2966 the undo-list, so freeing them here is unsafe. */
2967
2968 DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 2, 0,
2969 doc: /* Set internal data on last search match from elements of LIST.
2970 LIST should have been created by calling `match-data' previously.
2971
2972 If optional arg RESEAT is non-nil, make markers on LIST point nowhere. */)
2973 (register Lisp_Object list, Lisp_Object reseat)
2974 {
2975 register int i;
2976 register Lisp_Object marker;
2977
2978 if (running_asynch_code)
2979 save_search_regs ();
2980
2981 CHECK_LIST (list);
2982
2983 /* Unless we find a marker with a buffer or an explicit buffer
2984 in LIST, assume that this match data came from a string. */
2985 last_thing_searched = Qt;
2986
2987 /* Allocate registers if they don't already exist. */
2988 {
2989 int length = XFASTINT (Flength (list)) / 2;
2990
2991 if (length > search_regs.num_regs)
2992 {
2993 if (search_regs.num_regs == 0)
2994 {
2995 search_regs.start
2996 = (regoff_t *) xmalloc (length * sizeof (regoff_t));
2997 search_regs.end
2998 = (regoff_t *) xmalloc (length * sizeof (regoff_t));
2999 }
3000 else
3001 {
3002 search_regs.start
3003 = (regoff_t *) xrealloc (search_regs.start,
3004 length * sizeof (regoff_t));
3005 search_regs.end
3006 = (regoff_t *) xrealloc (search_regs.end,
3007 length * sizeof (regoff_t));
3008 }
3009
3010 for (i = search_regs.num_regs; i < length; i++)
3011 search_regs.start[i] = -1;
3012
3013 search_regs.num_regs = length;
3014 }
3015
3016 for (i = 0; CONSP (list); i++)
3017 {
3018 marker = XCAR (list);
3019 if (BUFFERP (marker))
3020 {
3021 last_thing_searched = marker;
3022 break;
3023 }
3024 if (i >= length)
3025 break;
3026 if (NILP (marker))
3027 {
3028 search_regs.start[i] = -1;
3029 list = XCDR (list);
3030 }
3031 else
3032 {
3033 EMACS_INT from;
3034 Lisp_Object m;
3035
3036 m = marker;
3037 if (MARKERP (marker))
3038 {
3039 if (XMARKER (marker)->buffer == 0)
3040 XSETFASTINT (marker, 0);
3041 else
3042 XSETBUFFER (last_thing_searched, XMARKER (marker)->buffer);
3043 }
3044
3045 CHECK_NUMBER_COERCE_MARKER (marker);
3046 from = XINT (marker);
3047
3048 if (!NILP (reseat) && MARKERP (m))
3049 {
3050 unchain_marker (XMARKER (m));
3051 XSETCAR (list, Qnil);
3052 }
3053
3054 if ((list = XCDR (list), !CONSP (list)))
3055 break;
3056
3057 m = marker = XCAR (list);
3058
3059 if (MARKERP (marker) && XMARKER (marker)->buffer == 0)
3060 XSETFASTINT (marker, 0);
3061
3062 CHECK_NUMBER_COERCE_MARKER (marker);
3063 search_regs.start[i] = from;
3064 search_regs.end[i] = XINT (marker);
3065
3066 if (!NILP (reseat) && MARKERP (m))
3067 {
3068 unchain_marker (XMARKER (m));
3069 XSETCAR (list, Qnil);
3070 }
3071 }
3072 list = XCDR (list);
3073 }
3074
3075 for (; i < search_regs.num_regs; i++)
3076 search_regs.start[i] = -1;
3077 }
3078
3079 return Qnil;
3080 }
3081
3082 /* If non-zero the match data have been saved in saved_search_regs
3083 during the execution of a sentinel or filter. */
3084 static int search_regs_saved;
3085 static struct re_registers saved_search_regs;
3086 static Lisp_Object saved_last_thing_searched;
3087
3088 /* Called from Flooking_at, Fstring_match, search_buffer, Fstore_match_data
3089 if asynchronous code (filter or sentinel) is running. */
3090 static void
3091 save_search_regs (void)
3092 {
3093 if (!search_regs_saved)
3094 {
3095 saved_search_regs.num_regs = search_regs.num_regs;
3096 saved_search_regs.start = search_regs.start;
3097 saved_search_regs.end = search_regs.end;
3098 saved_last_thing_searched = last_thing_searched;
3099 last_thing_searched = Qnil;
3100 search_regs.num_regs = 0;
3101 search_regs.start = 0;
3102 search_regs.end = 0;
3103
3104 search_regs_saved = 1;
3105 }
3106 }
3107
3108 /* Called upon exit from filters and sentinels. */
3109 void
3110 restore_search_regs (void)
3111 {
3112 if (search_regs_saved)
3113 {
3114 if (search_regs.num_regs > 0)
3115 {
3116 xfree (search_regs.start);
3117 xfree (search_regs.end);
3118 }
3119 search_regs.num_regs = saved_search_regs.num_regs;
3120 search_regs.start = saved_search_regs.start;
3121 search_regs.end = saved_search_regs.end;
3122 last_thing_searched = saved_last_thing_searched;
3123 saved_last_thing_searched = Qnil;
3124 search_regs_saved = 0;
3125 }
3126 }
3127
3128 static Lisp_Object
3129 unwind_set_match_data (Lisp_Object list)
3130 {
3131 /* It is NOT ALWAYS safe to free (evaporate) the markers immediately. */
3132 return Fset_match_data (list, Qt);
3133 }
3134
3135 /* Called to unwind protect the match data. */
3136 void
3137 record_unwind_save_match_data (void)
3138 {
3139 record_unwind_protect (unwind_set_match_data,
3140 Fmatch_data (Qnil, Qnil, Qnil));
3141 }
3142
3143 /* Quote a string to inactivate reg-expr chars */
3144
3145 DEFUN ("regexp-quote", Fregexp_quote, Sregexp_quote, 1, 1, 0,
3146 doc: /* Return a regexp string which matches exactly STRING and nothing else. */)
3147 (Lisp_Object string)
3148 {
3149 register unsigned char *in, *out, *end;
3150 register unsigned char *temp;
3151 int backslashes_added = 0;
3152
3153 CHECK_STRING (string);
3154
3155 temp = (unsigned char *) alloca (SBYTES (string) * 2);
3156
3157 /* Now copy the data into the new string, inserting escapes. */
3158
3159 in = SDATA (string);
3160 end = in + SBYTES (string);
3161 out = temp;
3162
3163 for (; in != end; in++)
3164 {
3165 if (*in == '['
3166 || *in == '*' || *in == '.' || *in == '\\'
3167 || *in == '?' || *in == '+'
3168 || *in == '^' || *in == '$')
3169 *out++ = '\\', backslashes_added++;
3170 *out++ = *in;
3171 }
3172
3173 return make_specified_string (temp,
3174 SCHARS (string) + backslashes_added,
3175 out - temp,
3176 STRING_MULTIBYTE (string));
3177 }
3178 \f
3179 void
3180 syms_of_search (void)
3181 {
3182 register int i;
3183
3184 for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
3185 {
3186 searchbufs[i].buf.allocated = 100;
3187 searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100);
3188 searchbufs[i].buf.fastmap = searchbufs[i].fastmap;
3189 searchbufs[i].regexp = Qnil;
3190 searchbufs[i].whitespace_regexp = Qnil;
3191 searchbufs[i].syntax_table = Qnil;
3192 staticpro (&searchbufs[i].regexp);
3193 staticpro (&searchbufs[i].whitespace_regexp);
3194 staticpro (&searchbufs[i].syntax_table);
3195 searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]);
3196 }
3197 searchbuf_head = &searchbufs[0];
3198
3199 Qsearch_failed = intern_c_string ("search-failed");
3200 staticpro (&Qsearch_failed);
3201 Qinvalid_regexp = intern_c_string ("invalid-regexp");
3202 staticpro (&Qinvalid_regexp);
3203
3204 Fput (Qsearch_failed, Qerror_conditions,
3205 pure_cons (Qsearch_failed, pure_cons (Qerror, Qnil)));
3206 Fput (Qsearch_failed, Qerror_message,
3207 make_pure_c_string ("Search failed"));
3208
3209 Fput (Qinvalid_regexp, Qerror_conditions,
3210 pure_cons (Qinvalid_regexp, pure_cons (Qerror, Qnil)));
3211 Fput (Qinvalid_regexp, Qerror_message,
3212 make_pure_c_string ("Invalid regexp"));
3213
3214 last_thing_searched = Qnil;
3215 staticpro (&last_thing_searched);
3216
3217 saved_last_thing_searched = Qnil;
3218 staticpro (&saved_last_thing_searched);
3219
3220 DEFVAR_LISP ("search-spaces-regexp", Vsearch_spaces_regexp,
3221 doc: /* Regexp to substitute for bunches of spaces in regexp search.
3222 Some commands use this for user-specified regexps.
3223 Spaces that occur inside character classes or repetition operators
3224 or other such regexp constructs are not replaced with this.
3225 A value of nil (which is the normal value) means treat spaces literally. */);
3226 Vsearch_spaces_regexp = Qnil;
3227
3228 DEFVAR_LISP ("inhibit-changing-match-data", Vinhibit_changing_match_data,
3229 doc: /* Internal use only.
3230 If non-nil, the primitive searching and matching functions
3231 such as `looking-at', `string-match', `re-search-forward', etc.,
3232 do not set the match data. The proper way to use this variable
3233 is to bind it with `let' around a small expression. */);
3234 Vinhibit_changing_match_data = Qnil;
3235
3236 defsubr (&Slooking_at);
3237 defsubr (&Sposix_looking_at);
3238 defsubr (&Sstring_match);
3239 defsubr (&Sposix_string_match);
3240 defsubr (&Ssearch_forward);
3241 defsubr (&Ssearch_backward);
3242 defsubr (&Sword_search_forward);
3243 defsubr (&Sword_search_backward);
3244 defsubr (&Sword_search_forward_lax);
3245 defsubr (&Sword_search_backward_lax);
3246 defsubr (&Sre_search_forward);
3247 defsubr (&Sre_search_backward);
3248 defsubr (&Sposix_search_forward);
3249 defsubr (&Sposix_search_backward);
3250 defsubr (&Sreplace_match);
3251 defsubr (&Smatch_beginning);
3252 defsubr (&Smatch_end);
3253 defsubr (&Smatch_data);
3254 defsubr (&Sset_match_data);
3255 defsubr (&Sregexp_quote);
3256 }