*** empty log message ***

[gnu-emacs] / src / search.c
diff --git a/src/search.c b/src/search.c

index 89a4a5ad68e2735a91e85f2a6cf924f62fae8a2c..96ea41e8f8ec4903f9d6017f77133ea03ed6d851 100644 (file)
--- a/src/search.c
+++ b/src/search.c
@@ -24,7 +24,7 @@ Boston, MA 02111-1307, USA.  */
  #include "syntax.h"
  #include "category.h"
  #include "buffer.h"
-#include "charset.h"
+#include "character.h"
  #include "region-cache.h"
  #include "commands.h"
  #include "blockinput.h"
@@ -104,9 +104,8 @@ matcher_overflow ()
     subexpression bounds.
     POSIX is nonzero if we want full backtracking (POSIX style)
     for this pattern.  0 means backtrack only enough to get a valid match.
-   MULTIBYTE is nonzero if we want to handle multibyte characters in
-   PATTERN.  0 means all multibyte characters are recognized just as
-   sequences of binary data.  */
+   MULTIBYTE is nonzero iff a target of match is a multibyte buffer or
+   string.  */
  
  static void
  compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
@@ -117,51 +116,19 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
       int posix;
       int multibyte;
  {
-  unsigned char *raw_pattern;
-  int raw_pattern_size;
    char *val;
    reg_syntax_t old;
  
-  /* MULTIBYTE says whether the text to be searched is multibyte.
-     We must convert PATTERN to match that, or we will not really
-     find things right.  */
-
-  if (multibyte == STRING_MULTIBYTE (pattern))
-    {
-      raw_pattern = (unsigned char *) SDATA (pattern);
-      raw_pattern_size = SBYTES (pattern);
-    }
-  else if (multibyte)
-    {
-      raw_pattern_size = count_size_as_multibyte (SDATA (pattern),
-                                                 SCHARS (pattern));
-      raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
-      copy_text (SDATA (pattern), raw_pattern,
-                SCHARS (pattern), 0, 1);
-    }
-  else
-    {
-      /* Converting multibyte to single-byte.
-
-        ??? Perhaps this conversion should be done in a special way
-        by subtracting nonascii-insert-offset from each non-ASCII char,
-        so that only the multibyte chars which really correspond to
-        the chosen single-byte character set can possibly match.  */
-      raw_pattern_size = SCHARS (pattern);
-      raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
-      copy_text (SDATA (pattern), raw_pattern,
-                SBYTES (pattern), 1, 0);
-    }
-
    cp->regexp = Qnil;
    cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
    cp->posix = posix;
-  cp->buf.multibyte = multibyte;
+  cp->buf.multibyte = STRING_MULTIBYTE (pattern);
+  cp->buf.target_multibyte = multibyte;
    BLOCK_INPUT;
    old = re_set_syntax (RE_SYNTAX_EMACS
                        | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
-  val = (char *) re_compile_pattern ((char *)raw_pattern,
-                                    raw_pattern_size, &cp->buf);
+  val = (char *) re_compile_pattern ((char *) SDATA (pattern),
+                                    SBYTES (pattern), &cp->buf);
    re_set_syntax (old);
    UNBLOCK_INPUT;
    if (val)
@@ -222,7 +189,7 @@ compile_pattern (pattern, regp, translate, posix, multibyte)
           && !NILP (Fstring_equal (cp->regexp, pattern))
           && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
           && cp->posix == posix
-         && cp->buf.multibyte == multibyte)
+         && cp->buf.target_multibyte == multibyte)
         break;
  
        /* If we're at the end of the cache, compile into the nil cell
@@ -1140,7 +1107,12 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        unsigned char *patbuf;
        int multibyte = !NILP (current_buffer->enable_multibyte_characters);
        unsigned char *base_pat = SDATA (string);
-      int charset_base = -1;
+      /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F)
+        otherwise.  Characters of the same high bits have the same
+        sequence of bytes but last.  To do the BM search, all
+        characters in STRING must have the same high bits (including
+        their case translations).  */
+      int char_high_bits = -1;
        int boyer_moore_ok = 1;
  
        /* MULTIBYTE says whether the text to be searched is multibyte.
@@ -1181,16 +1153,15 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        /* Copy and optionally translate the pattern.  */
        len = raw_pattern_size;
        len_byte = raw_pattern_size_byte;
-      patbuf = (unsigned char *) alloca (len_byte);
+      patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
        pat = patbuf;
        base_pat = raw_pattern;
        if (multibyte)
         {
           while (--len >= 0)
             {
-             unsigned char str[MAX_MULTIBYTE_LENGTH];
               int c, translated, inverse;
-             int in_charlen, charlen;
+             int in_charlen;
  
               /* If we got here and the RE flag is set, it's because we're
                  dealing with a regexp known to be trivial, so the backslash
@@ -1206,23 +1177,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
  
               /* Translate the character, if requested.  */
               TRANSLATE (translated, trt, c);
-             /* If translation changed the byte-length, go back
-                to the original character.  */
-             charlen = CHAR_STRING (translated, str);
-             if (in_charlen != charlen)
-               {
-                 translated = c;
-                 charlen = CHAR_STRING (c, str);
-               }
-
-             /* If we are searching for something strange,
-                an invalid multibyte code, don't use boyer-moore.  */
-             if (! ASCII_BYTE_P (translated)
-                 && (charlen == 1 /* 8bit code */
-                     || charlen != in_charlen /* invalid multibyte code */
-                     ))
-               boyer_moore_ok = 0;
-
               TRANSLATE (inverse, inverse_trt, c);
  
               /* Did this char actually get translated?
@@ -1231,22 +1185,22 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
                 {
                   /* Keep track of which character set row
                      contains the characters that need translation.  */
-                 int charset_base_code = c & ~CHAR_FIELD3_MASK;
-                 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
+                 int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F);
+                 int c1 = inverse != c ? inverse : translated;
+                 int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F);
  
-                 if (charset_base_code != inverse_charset_base)
+                 if (this_high_bit != trt_high_bit)
                     boyer_moore_ok = 0;
-                 else if (charset_base == -1)
-                   charset_base = charset_base_code;
-                 else if (charset_base != charset_base_code)
+                 else if (char_high_bits == -1)
+                   char_high_bits = this_high_bit;
+                 else if (char_high_bits != this_high_bit)
                     /* If two different rows appear, needing translation,
                        then we cannot use boyer_moore search.  */
                     boyer_moore_ok = 0;
                 }
  
               /* Store this character into the translated pattern.  */
-             bcopy (str, pat, charlen);
-             pat += charlen;
+             CHAR_STRING_ADVANCE (translated, pat);
               base_pat += in_charlen;
               len_byte -= in_charlen;
             }
@@ -1254,7 +1208,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        else
         {
           /* Unibyte buffer.  */
-         charset_base = 0;
+         char_high_bits = 0;
           while (--len >= 0)
             {
               int c, translated;
@@ -1280,7 +1234,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        if (boyer_moore_ok)
         return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
                             pos, pos_byte, lim, lim_byte,
-                           charset_base);
+                           char_high_bits);
        else
         return simple_search (n, pat, len, len_byte, trt,
                               pos, pos_byte, lim, lim_byte);
@@ -1513,7 +1467,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte)
  
  static int
  boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
-            pos, pos_byte, lim, lim_byte, charset_base)
+            pos, pos_byte, lim, lim_byte, char_high_bits)
       int n;
       unsigned char *base_pat;
       int len, len_byte;
@@ -1521,7 +1475,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
       Lisp_Object inverse_trt;
       int pos, pos_byte;
       int lim, lim_byte;
-     int charset_base;
+     int char_high_bits;
  {
    int direction = ((n > 0) ? 1 : -1);
    register int dirlen;
@@ -1622,7 +1576,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
               while (! CHAR_HEAD_P (*charstart))
                 charstart--;
               untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
-             if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
+             if (char_high_bits
+                 == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F))
                 {
                   TRANSLATE (ch, trt, untranslated);
                   if (! CHAR_HEAD_P (*ptr))
@@ -1646,8 +1601,9 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
               this_translated = 0;
             }
  
-         if (ch > 0400)
-           j = ((unsigned char) ch) | 0200;
+         if (this_translated
+             && ch >= 0200)
+           j = (ch & 0x3F) | 0200;
           else
             j = (unsigned char) ch;
  
@@ -1664,8 +1620,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
               while (1)
                 {
                   TRANSLATE (ch, inverse_trt, ch);
-                 if (ch > 0400)
-                   j = ((unsigned char) ch) | 0200;
+                 if (ch > 0200)
+                   j = (ch & 0x3F) | 0200;
                   else
                     j = (unsigned char) ch;
  
@@ -1958,7 +1914,7 @@ wordify (string)
      {
        int c;
  
-      FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
+      FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
  
        if (SYNTAX (c) != Sword)
         {
@@ -1993,7 +1949,7 @@ wordify (string)
        int c;
        int i_byte_orig = i_byte;
  
-      FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
+      FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
  
        if (SYNTAX (c) == Sword)
         {
@@ -2277,11 +2233,11 @@ since only regular expressions have distinguished subexpressions.  */)
         {
           if (NILP (string))
             {
-             c = FETCH_CHAR (pos_byte);
+             c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
               INC_BOTH (pos, pos_byte);
             }
           else
-           FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte);
+           FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
  
           if (LOWERCASEP (c))
             {
@@ -2445,10 +2401,7 @@ since only regular expressions have distinguished subexpressions.  */)
        Lisp_Object rev_tbl;
        int really_changed = 0;
  
-      rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table)
-               ? Fchar_table_extra_slot (Vnonascii_translation_table,
-                                         make_number (0))
-               : Qnil);
+      rev_tbl = Qnil;
  
        substed_alloc_size = length * 2 + 100;
        substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
@@ -2491,7 +2444,7 @@ since only regular expressions have distinguished subexpressions.  */)
                 {
                   FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
                                                       pos, pos_byte);
-                 if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c))
+                 if (!buf_multibyte && !ASCII_CHAR_P (c))
                     c = multibyte_char_to_unibyte (c, rev_tbl);
                 }
               else