code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* At first, see the document in `character.h' to understand the code
  27    in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 Lisp_Object Vchar_unify_table;
  63
  64 /* A char-table.  An element is non-nil iff the corresponding
  65    character has a printable glyph.  */
  66 Lisp_Object Vprintable_chars;
  67
  68 /* A char-table.  An elemnent is a column-width of the corresponding
  69    character.  */
  70 Lisp_Object Vchar_width_table;
  71
  72 /* A char-table.  An element is a symbol indicating the direction
  73    property of corresponding character.  */
  74 Lisp_Object Vchar_direction_table;
  75
  76 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  77 unsigned char *_fetch_multibyte_char_p;
  78
  79 /* Char table of scripts.  */
  80 Lisp_Object Vchar_script_table;
  81
  82 static Lisp_Object Qchar_script_table;
  83
  84 /* Mapping table from unibyte chars to multibyte chars.  */
  85 int unibyte_to_multibyte_table[256];
  86
  87 \f
  88
  89 int
  90 char_string (c, p)
  91      int c;
  92      unsigned char *p;
  93 {
  94   int bytes;
  95
  96   if (c & CHAR_MODIFIER_MASK)
  97     {
  98       /* As a character not less than 256 can't have modifier bits, we
  99          just ignore the bits.  */
 100       if (SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 101         {
 102           /* For Meta, Shift, and Control modifiers, we need special care.  */
 103           if (c & CHAR_META)
 104             {
 105               /* Move the meta bit to the right place for a string.  */
 106               c = (c & ~CHAR_META) | 0x80;
 107             }
 108           if (c & CHAR_SHIFT)
 109             {
 110               /* Shift modifier is valid only with [A-Za-z].  */
 111               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 112                 c &= ~CHAR_SHIFT;
 113               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 114                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 115             }
 116           if (c & CHAR_CTL)
 117             {
 118               /* Simulate the code in lread.c.  */
 119               /* Allow `\C- ' and `\C-?'.  */
 120               if (c == (CHAR_CTL | ' '))
 121                 c = 0;
 122               else if (c == (CHAR_CTL | '?'))
 123                 c = 127;
 124               /* ASCII control chars are made from letters (both cases),
 125                  as well as the non-letters within 0100...0137.  */
 126               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 127                 c &= (037 | (~0177 & ~CHAR_CTL));
 128               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 129                 c &= (037 | (~0177 & ~CHAR_CTL));
 130             }
 131         }
 132
 133       /* If C still has any modifier bits, just ignore it.  */
 134       c &= ~CHAR_MODIFIER_MASK;
 135     }
 136
 137   MAYBE_UNIFY_CHAR (c);
 138
 139   if (c <= MAX_3_BYTE_CHAR)
 140     {
 141       bytes = CHAR_STRING (c, p);
 142     }
 143   else if (c <= MAX_4_BYTE_CHAR)
 144     {
 145       p[0] = (0xF0 | (c >> 18));
 146       p[1] = (0x80 | ((c >> 12) & 0x3F));
 147       p[2] = (0x80 | ((c >> 6) & 0x3F));
 148       p[3] = (0x80 | (c & 0x3F));
 149       bytes = 4;
 150     }
 151   else if (c <= MAX_5_BYTE_CHAR)
 152     {
 153       p[0] = 0xF8;
 154       p[1] = (0x80 | ((c >> 18) & 0x0F));
 155       p[2] = (0x80 | ((c >> 12) & 0x3F));
 156       p[3] = (0x80 | ((c >> 6) & 0x3F));
 157       p[4] = (0x80 | (c & 0x3F));
 158       bytes = 5;
 159     }
 160   else
 161     {
 162       c = CHAR_TO_BYTE8 (c);
 163       bytes = BYTE8_STRING (c, p);
 164     }
 165
 166   return bytes;
 167 }
 168
 169
 170 int
 171 string_char (p, advanced, len)
 172      const unsigned char *p;
 173      const unsigned char **advanced;
 174      int *len;
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If C is
 211    negative, translate a character specified by CHARSET and CODE.  If
 212    no translation is found in TABLE, return the untranslated
 213    character.  */
 214
 215 int
 216 translate_char (table, c)
 217      Lisp_Object table;
 218      int c;
 219 {
 220   Lisp_Object ch;
 221
 222   if (! CHAR_TABLE_P (table))
 223     return c;
 224   ch = CHAR_TABLE_REF (table, c);
 225   if (! CHARACTERP (ch))
 226     return c;
 227   return XINT (ch);
 228 }
 229
 230 /* Convert the multibyte character C to unibyte 8-bit character based
 231    on the current value of charset_unibyte.  If dimension of
 232    charset_unibyte is more than one, return (C & 0xFF).
 233
 234    The argument REV_TBL is now ignored.  It will be removed in the
 235    future.  */
 236
 237 int
 238 multibyte_char_to_unibyte (c, rev_tbl)
 239      int c;
 240      Lisp_Object rev_tbl;
 241 {
 242   struct charset *charset;
 243   unsigned c1;
 244
 245   if (CHAR_BYTE8_P (c))
 246     return CHAR_TO_BYTE8 (c);
 247   charset = CHARSET_FROM_ID (charset_unibyte);
 248   c1 = ENCODE_CHAR (charset, c);
 249   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 250 }
 251
 252
 253 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 254        doc: /* Return non-nil if OBJECT is a character.  */)
 255      (object, ignore)
 256      Lisp_Object object, ignore;
 257 {
 258   return (CHARACTERP (object) ? Qt : Qnil);
 259 }
 260
 261 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 262        doc: /* Return the character of the maximum code.  */)
 263      ()
 264 {
 265   return make_number (MAX_CHAR);
 266 }
 267
 268 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 269        Sunibyte_char_to_multibyte, 1, 1, 0,
 270        doc: /* Convert the unibyte character CH to multibyte character.
 271 The multibyte character is a result of decoding CH by
 272 the current unibyte charset (see `unibyte-charset').  */)
 273      (ch)
 274      Lisp_Object ch;
 275 {
 276   int c;
 277   struct charset *charset;
 278
 279   CHECK_CHARACTER (ch);
 280   c = XFASTINT (ch);
 281   if (c >= 0400)
 282     error ("Invalid unibyte character: %d", c);
 283   charset = CHARSET_FROM_ID (charset_unibyte);
 284   c = DECODE_CHAR (charset, c);
 285   if (c < 0)
 286     c = BYTE8_TO_CHAR (XFASTINT (ch));
 287   return make_number (c);
 288 }
 289
 290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 291        Smultibyte_char_to_unibyte, 1, 1, 0,
 292        doc: /* Convert the multibyte character CH to unibyte character.\n\
 293 The unibyte character is a result of encoding CH by
 294 the current primary charset (value of `charset-primary').  */)
 295      (ch)
 296      Lisp_Object ch;
 297 {
 298   int c;
 299
 300   CHECK_CHARACTER (ch);
 301   c = XFASTINT (ch);
 302   c = CHAR_TO_BYTE8 (c);
 303   return make_number (c);
 304 }
 305
 306 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 307        doc: /* Return 1 regardless of the argument CHAR.
 308 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 309      (ch)
 310      Lisp_Object ch;
 311 {
 312   CHECK_CHARACTER (ch);
 313   return make_number (1);
 314 }
 315
 316 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 317        doc: /* Return width of CHAR when displayed in the current buffer.
 318 The width is measured by how many columns it occupies on the screen.
 319 Tab is taken to occupy `tab-width' columns.  */)
 320      (ch)
 321        Lisp_Object ch;
 322 {
 323   Lisp_Object disp;
 324   int c, width;
 325   struct Lisp_Char_Table *dp = buffer_display_table ();
 326
 327   CHECK_CHARACTER (ch);
 328   c = XINT (ch);
 329
 330   /* Get the way the display table would display it.  */
 331   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 332
 333   if (VECTORP (disp))
 334     width = ASIZE (disp);
 335   else
 336     width = CHAR_WIDTH (c);
 337
 338   return make_number (width);
 339 }
 340
 341 /* Return width of string STR of length LEN when displayed in the
 342    current buffer.  The width is measured by how many columns it
 343    occupies on the screen.  If PRECISION > 0, return the width of
 344    longest substring that doesn't exceed PRECISION, and set number of
 345    characters and bytes of the substring in *NCHARS and *NBYTES
 346    respectively.  */
 347
 348 int
 349 c_string_width (str, len, precision, nchars, nbytes)
 350      unsigned char *str;
 351      int precision, *nchars, *nbytes;
 352 {
 353   int i = 0, i_byte = 0;
 354   int width = 0;
 355   struct Lisp_Char_Table *dp = buffer_display_table ();
 356
 357   while (i_byte < len)
 358     {
 359       int bytes, thiswidth;
 360       Lisp_Object val;
 361       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 362
 363       if (dp)
 364         {
 365           val = DISP_CHAR_VECTOR (dp, c);
 366           if (VECTORP (val))
 367             thiswidth = XVECTOR (val)->size;
 368           else
 369             thiswidth = CHAR_WIDTH (c);
 370         }
 371       else
 372         {
 373           thiswidth = CHAR_WIDTH (c);
 374         }
 375
 376       if (precision > 0
 377           && (width + thiswidth > precision))
 378         {
 379           *nchars = i;
 380           *nbytes = i_byte;
 381           return width;
 382         }
 383       i++;
 384       i_byte += bytes;
 385       width += thiswidth;
 386   }
 387
 388   if (precision > 0)
 389     {
 390       *nchars = i;
 391       *nbytes = i_byte;
 392     }
 393
 394   return width;
 395 }
 396
 397 /* Return width of string STR of length LEN when displayed in the
 398    current buffer.  The width is measured by how many columns it
 399    occupies on the screen.  */
 400
 401 int
 402 strwidth (str, len)
 403      unsigned char *str;
 404      int len;
 405 {
 406   return c_string_width (str, len, -1, NULL, NULL);
 407 }
 408
 409 /* Return width of Lisp string STRING when displayed in the current
 410    buffer.  The width is measured by how many columns it occupies on
 411    the screen while paying attention to compositions.  If PRECISION >
 412    0, return the width of longest substring that doesn't exceed
 413    PRECISION, and set number of characters and bytes of the substring
 414    in *NCHARS and *NBYTES respectively.  */
 415
 416 int
 417 lisp_string_width (string, precision, nchars, nbytes)
 418      Lisp_Object string;
 419      int precision, *nchars, *nbytes;
 420 {
 421   int len = XSTRING (string)->size;
 422   unsigned char *str = XSTRING (string)->data;
 423   int i = 0, i_byte = 0;
 424   int width = 0;
 425   struct Lisp_Char_Table *dp = buffer_display_table ();
 426
 427   while (i < len)
 428     {
 429       int chars, bytes, thiswidth;
 430       Lisp_Object val;
 431       int cmp_id;
 432       int ignore, end;
 433
 434       if (find_composition (i, -1, &ignore, &end, &val, string)
 435           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 436               >= 0))
 437         {
 438           thiswidth = composition_table[cmp_id]->width;
 439           chars = end - i;
 440           bytes = string_char_to_byte (string, end) - i_byte;
 441         }
 442       else if (dp)
 443         {
 444           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 445
 446           chars = 1;
 447           val = DISP_CHAR_VECTOR (dp, c);
 448           if (VECTORP (val))
 449             thiswidth = XVECTOR (val)->size;
 450           else
 451             thiswidth = CHAR_WIDTH (c);
 452         }
 453       else
 454         {
 455           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 456
 457           chars = 1;
 458           thiswidth = CHAR_WIDTH (c);
 459         }
 460
 461       if (precision > 0
 462           && (width + thiswidth > precision))
 463         {
 464           *nchars = i;
 465           *nbytes = i_byte;
 466           return width;
 467         }
 468       i += chars;
 469       i_byte += bytes;
 470       width += thiswidth;
 471   }
 472
 473   if (precision > 0)
 474     {
 475       *nchars = i;
 476       *nbytes = i_byte;
 477     }
 478
 479   return width;
 480 }
 481
 482 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 483        doc: /* Return width of STRING when displayed in the current buffer.
 484 Width is measured by how many columns it occupies on the screen.
 485 When calculating width of a multibyte character in STRING,
 486 only the base leading-code is considered; the validity of
 487 the following bytes is not checked.  Tabs in STRING are always
 488 taken to occupy `tab-width' columns.  */)
 489      (str)
 490      Lisp_Object str;
 491 {
 492   Lisp_Object val;
 493
 494   CHECK_STRING (str);
 495   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 496   return val;
 497 }
 498
 499 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 500        doc: /* Return the direction of CHAR.
 501 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 502      (ch)
 503      Lisp_Object ch;
 504 {
 505   int c;
 506
 507   CHECK_CHARACTER (ch);
 508   c = XINT (ch);
 509   return CHAR_TABLE_REF (Vchar_direction_table, c);
 510 }
 511
 512 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
 513        doc: /* Return number of characters between BEG and END.
 514 This is now an obsolete function.  We keep it just for backward compatibility.  */)
 515      (beg, end)
 516      Lisp_Object beg, end;
 517 {
 518   int from, to;
 519
 520   CHECK_NUMBER_COERCE_MARKER (beg);
 521   CHECK_NUMBER_COERCE_MARKER (end);
 522
 523   from = min (XFASTINT (beg), XFASTINT (end));
 524   to = max (XFASTINT (beg), XFASTINT (end));
 525
 526   return make_number (to - from);
 527 }
 528
 529 /* Return the number of characters in the NBYTES bytes at PTR.
 530    This works by looking at the contents and checking for multibyte
 531    sequences while assuming that there's no invalid sequence.
 532    However, if the current buffer has enable-multibyte-characters =
 533    nil, we treat each byte as a character.  */
 534
 535 int
 536 chars_in_text (ptr, nbytes)
 537      unsigned char *ptr;
 538      int nbytes;
 539 {
 540   /* current_buffer is null at early stages of Emacs initialization.  */
 541   if (current_buffer == 0
 542       || NILP (current_buffer->enable_multibyte_characters))
 543     return nbytes;
 544
 545   return multibyte_chars_in_text (ptr, nbytes);
 546 }
 547
 548 /* Return the number of characters in the NBYTES bytes at PTR.
 549    This works by looking at the contents and checking for multibyte
 550    sequences while assuming that there's no invalid sequence.  It
 551    ignores enable-multibyte-characters.  */
 552
 553 int
 554 multibyte_chars_in_text (ptr, nbytes)
 555      unsigned char *ptr;
 556      int nbytes;
 557 {
 558   unsigned char *endp = ptr + nbytes;
 559   int chars = 0;
 560
 561   while (ptr < endp)
 562     {
 563       int len = MULTIBYTE_LENGTH (ptr, endp);
 564
 565       if (len == 0)
 566         abort ();
 567       ptr += len;
 568       chars++;
 569     }
 570
 571   return chars;
 572 }
 573
 574 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 575    characters and bytes in it, and store them in *NCHARS and *NBYTES
 576    respectively.  On counting bytes, pay attention to that 8-bit
 577    characters not constructing a valid multibyte sequence are
 578    represented by 2-byte in a multibyte text.  */
 579
 580 void
 581 parse_str_as_multibyte (str, len, nchars, nbytes)
 582      unsigned char *str;
 583      int len, *nchars, *nbytes;
 584 {
 585   unsigned char *endp = str + len;
 586   int n, chars = 0, bytes = 0;
 587
 588   if (len >= MAX_MULTIBYTE_LENGTH)
 589     {
 590       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 591       while (str < adjusted_endp)
 592         {
 593           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 594             str += n, bytes += n;
 595           else
 596             str++, bytes += 2;
 597           chars++;
 598         }
 599     }
 600   while (str < endp)
 601     {
 602       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 603         str += n, bytes += n;
 604       else
 605         str++, bytes += 2;
 606       chars++;
 607     }
 608
 609   *nchars = chars;
 610   *nbytes = bytes;
 611   return;
 612 }
 613
 614 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 615    It actually converts only such 8-bit characters that don't contruct
 616    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 617    NCHARS is nonzero, set *NCHARS to the number of characters in the
 618    text.  It is assured that we can use LEN bytes at STR as a work
 619    area and that is enough.  Return the number of bytes of the
 620    resulting text.  */
 621
 622 int
 623 str_as_multibyte (str, len, nbytes, nchars)
 624      unsigned char *str;
 625      int len, nbytes, *nchars;
 626 {
 627   unsigned char *p = str, *endp = str + nbytes;
 628   unsigned char *to;
 629   int chars = 0;
 630   int n;
 631
 632   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 633     {
 634       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 635       while (p < adjusted_endp
 636              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 637         p += n, chars++;
 638     }
 639   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 640     p += n, chars++;
 641   if (nchars)
 642     *nchars = chars;
 643   if (p == endp)
 644     return nbytes;
 645
 646   to = p;
 647   nbytes = endp - p;
 648   endp = str + len;
 649   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 650   p = endp - nbytes;
 651
 652   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 653     {
 654       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 655       while (p < adjusted_endp)
 656         {
 657           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 658             {
 659               while (n--)
 660                 *to++ = *p++;
 661             }
 662           else
 663             {
 664               int c = *p++;
 665               c = BYTE8_TO_CHAR (c);
 666               to += CHAR_STRING (c, to);
 667             }
 668         }
 669       chars++;
 670     }
 671   while (p < endp)
 672     {
 673       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 674         {
 675           while (n--)
 676             *to++ = *p++;
 677         }
 678       else
 679         {
 680           int c = *p++;
 681           c = BYTE8_TO_CHAR (c);
 682           to += CHAR_STRING (c, to);
 683         }
 684       chars++;
 685     }
 686   if (nchars)
 687     *nchars = chars;
 688   return (to - str);
 689 }
 690
 691 /* Parse unibyte string at STR of LEN bytes, and return the number of
 692    bytes it may ocupy when converted to multibyte string by
 693    `str_to_multibyte'.  */
 694
 695 int
 696 parse_str_to_multibyte (str, len)
 697      unsigned char *str;
 698      int len;
 699 {
 700   unsigned char *endp = str + len;
 701   int bytes;
 702
 703   for (bytes = 0; str < endp; str++)
 704     bytes += (*str < 0x80) ? 1 : 2;
 705   return bytes;
 706 }
 707
 708
 709 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 710    that contains the same single-byte characters.  It actually
 711    converts all 8-bit characters to multibyte forms.  It is assured
 712    that we can use LEN bytes at STR as a work area and that is
 713    enough.  */
 714
 715 int
 716 str_to_multibyte (str, len, bytes)
 717      unsigned char *str;
 718      int len, bytes;
 719 {
 720   unsigned char *p = str, *endp = str + bytes;
 721   unsigned char *to;
 722
 723   while (p < endp && *p < 0x80) p++;
 724   if (p == endp)
 725     return bytes;
 726   to = p;
 727   bytes = endp - p;
 728   endp = str + len;
 729   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 730   p = endp - bytes;
 731   while (p < endp)
 732     {
 733       int c = *p++;
 734
 735       if (c >= 0x80)
 736         c = BYTE8_TO_CHAR (c);
 737       to += CHAR_STRING (c, to);
 738     }
 739   return (to - str);
 740 }
 741
 742 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 743    actually converts characters in the range 0x80..0xFF to
 744    unibyte.  */
 745
 746 int
 747 str_as_unibyte (str, bytes)
 748      unsigned char *str;
 749      int bytes;
 750 {
 751   const unsigned char *p = str, *endp = str + bytes;
 752   unsigned char *to;
 753   int c, len;
 754
 755   while (p < endp)
 756     {
 757       c = *p;
 758       len = BYTES_BY_CHAR_HEAD (c);
 759       if (CHAR_BYTE8_HEAD_P (c))
 760         break;
 761       p += len;
 762     }
 763   to = str + (p - str);
 764   while (p < endp)
 765     {
 766       c = *p;
 767       len = BYTES_BY_CHAR_HEAD (c);
 768       if (CHAR_BYTE8_HEAD_P (c))
 769         {
 770           c = STRING_CHAR_ADVANCE (p);
 771           *to++ = CHAR_TO_BYTE8 (c);
 772         }
 773       else
 774         {
 775           while (len--) *to++ = *p++;
 776         }
 777     }
 778   return (to - str);
 779 }
 780
 781 int
 782 string_count_byte8 (string)
 783      Lisp_Object string;
 784 {
 785   int multibyte = STRING_MULTIBYTE (string);
 786   int nbytes = STRING_BYTES (XSTRING (string));
 787   unsigned char *p = XSTRING (string)->data;
 788   unsigned char *pend = p + nbytes;
 789   int count = 0;
 790   int c, len;
 791
 792   if (multibyte)
 793     while (p < pend)
 794       {
 795         c = *p;
 796         len = BYTES_BY_CHAR_HEAD (c);
 797
 798         if (CHAR_BYTE8_HEAD_P (c))
 799           count++;
 800         p += len;
 801       }
 802   else
 803     while (p < pend)
 804       {
 805         if (*p++ >= 0x80)
 806           count++;
 807       }
 808   return count;
 809 }
 810
 811
 812 Lisp_Object
 813 string_escape_byte8 (string)
 814      Lisp_Object string;
 815 {
 816   int nchars = XSTRING (string)->size;
 817   int nbytes = STRING_BYTES (XSTRING (string));
 818   int multibyte = STRING_MULTIBYTE (string);
 819   int byte8_count;
 820   const unsigned char *src, *src_end;
 821   unsigned char *dst;
 822   Lisp_Object val;
 823   int c, len;
 824
 825   if (multibyte && nchars == nbytes)
 826     return string;
 827
 828   byte8_count = string_count_byte8 (string);
 829
 830   if (byte8_count == 0)
 831     return string;
 832
 833   if (multibyte)
 834     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 835     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 836                                         nbytes + byte8_count * 2);
 837   else
 838     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 839     val = make_uninit_string (nbytes + byte8_count * 3);
 840
 841   src = XSTRING (string)->data;
 842   src_end = src + nbytes;
 843   dst = XSTRING (val)->data;
 844   if (multibyte)
 845     while (src < src_end)
 846       {
 847         c = *src;
 848         len = BYTES_BY_CHAR_HEAD (c);
 849
 850         if (CHAR_BYTE8_HEAD_P (c))
 851           {
 852             c = STRING_CHAR_ADVANCE (src);
 853             c = CHAR_TO_BYTE8 (c);
 854             sprintf ((char *) dst, "\\%03o", c);
 855             dst += 4;
 856           }
 857         else
 858           while (len--) *dst++ = *src++;
 859       }
 860   else
 861     while (src < src_end)
 862       {
 863         c = *src++;
 864         if (c >= 0x80)
 865           {
 866             sprintf ((char *) dst, "\\%03o", c);
 867             dst += 4;
 868           }
 869         else
 870           *dst++ = c;
 871       }
 872   return val;
 873 }
 874
 875 \f
 876 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
 877        doc: /*
 878 Concatenate all the argument characters and make the result a string.
 879 usage: (string &rest CHARACTERS)  */)
 880      (n, args)
 881      int n;
 882      Lisp_Object *args;
 883 {
 884   int i;
 885   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 886   unsigned char *p = buf;
 887   int c;
 888
 889   for (i = 0; i < n; i++)
 890     {
 891       CHECK_CHARACTER (args[i]);
 892       c = XINT (args[i]);
 893       p += CHAR_STRING (c, p);
 894     }
 895
 896   return make_string_from_bytes ((char *) buf, n, p - buf);
 897 }
 898
 899 void
 900 init_character_once ()
 901 {
 902 }
 903
 904 #ifdef emacs
 905
 906 void
 907 syms_of_character ()
 908 {
 909   DEFSYM (Qcharacterp, "characterp");
 910   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 911
 912   staticpro (&Vchar_unify_table);
 913   Vchar_unify_table = Qnil;
 914
 915   defsubr (&Smax_char);
 916   defsubr (&Scharacterp);
 917   defsubr (&Sunibyte_char_to_multibyte);
 918   defsubr (&Smultibyte_char_to_unibyte);
 919   defsubr (&Schar_bytes);
 920   defsubr (&Schar_width);
 921   defsubr (&Sstring_width);
 922   defsubr (&Schar_direction);
 923   defsubr (&Schars_in_region);
 924   defsubr (&Sstring);
 925
 926   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 927                doc: /*
 928 Vector recording all translation tables ever defined.
 929 Each element is a pair (SYMBOL . TABLE) relating the table to the
 930 symbol naming it.  The ID of a translation table is an index into this vector.  */);
 931   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 932
 933   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 934                doc: /*
 935 A char-table for characters which invoke auto-filling.
 936 Such characters have value t in this table.  */);
 937   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 938   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 939   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 940
 941   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 942                doc: /*
 943 A char-table for width (columns) of each character.  */);
 944   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 945   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 946   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 947                         make_number (4));
 948
 949   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 950                doc: /* A char-table for direction of each character.  */);
 951   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 952
 953   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 954                doc: /* A char-table for each printable character.  */);
 955   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 956   Fset_char_table_range (Vprintable_chars,
 957                          Fcons (make_number (32), make_number (126)), Qt);
 958   Fset_char_table_range (Vprintable_chars,
 959                          Fcons (make_number (160),
 960                                 make_number (MAX_5_BYTE_CHAR)), Qt);
 961
 962   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 963                doc: /* Char table of script symbols.
 964 It has one extra slot whose value is a list of script symbols.  */);
 965
 966   /* Intern this now in case it isn't already done.
 967      Setting this variable twice is harmless.
 968      But don't staticpro it here--that is done in alloc.c.  */
 969   Qchar_table_extra_slots = intern ("char-table-extra-slots");
 970   DEFSYM (Qchar_script_table, "char-script-table");
 971   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 972   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 973 }
 974
 975 #endif /* emacs */