1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An elemnent is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
79 /* Char table of scripts. */
80 Lisp_Object Vchar_script_table
;
82 static Lisp_Object Qchar_script_table
;
84 /* Mapping table from unibyte chars to multibyte chars. */
85 int unibyte_to_multibyte_table
[256];
96 if (c
& CHAR_MODIFIER_MASK
)
98 /* As a character not less than 256 can't have modifier bits, we
99 just ignore the bits. */
100 if (SINGLE_BYTE_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
102 /* For Meta, Shift, and Control modifiers, we need special care. */
105 /* Move the meta bit to the right place for a string. */
106 c
= (c
& ~CHAR_META
) | 0x80;
110 /* Shift modifier is valid only with [A-Za-z]. */
111 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
113 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
114 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
118 /* Simulate the code in lread.c. */
119 /* Allow `\C- ' and `\C-?'. */
120 if (c
== (CHAR_CTL
| ' '))
122 else if (c
== (CHAR_CTL
| '?'))
124 /* ASCII control chars are made from letters (both cases),
125 as well as the non-letters within 0100...0137. */
126 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
127 c
&= (037 | (~0177 & ~CHAR_CTL
));
128 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
129 c
&= (037 | (~0177 & ~CHAR_CTL
));
133 /* If C still has any modifier bits, just ignore it. */
134 c
&= ~CHAR_MODIFIER_MASK
;
137 MAYBE_UNIFY_CHAR (c
);
139 if (c
<= MAX_3_BYTE_CHAR
)
141 bytes
= CHAR_STRING (c
, p
);
143 else if (c
<= MAX_4_BYTE_CHAR
)
145 p
[0] = (0xF0 | (c
>> 18));
146 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
147 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
148 p
[3] = (0x80 | (c
& 0x3F));
151 else if (c
<= MAX_5_BYTE_CHAR
)
154 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
155 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
156 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
157 p
[4] = (0x80 | (c
& 0x3F));
162 c
= CHAR_TO_BYTE8 (c
);
163 bytes
= BYTE8_STRING (c
, p
);
171 string_char (p
, advanced
, len
)
172 const unsigned char *p
;
173 const unsigned char **advanced
;
177 const unsigned char *saved_p
= p
;
179 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
181 c
= STRING_CHAR_ADVANCE (p
);
183 else if (! (*p
& 0x08))
185 c
= ((((p
)[0] & 0xF) << 18)
186 | (((p
)[1] & 0x3F) << 12)
187 | (((p
)[2] & 0x3F) << 6)
193 c
= ((((p
)[1] & 0x3F) << 18)
194 | (((p
)[2] & 0x3F) << 12)
195 | (((p
)[3] & 0x3F) << 6)
200 MAYBE_UNIFY_CHAR (c
);
210 /* Translate character C by translation table TABLE. If C is
211 negative, translate a character specified by CHARSET and CODE. If
212 no translation is found in TABLE, return the untranslated
216 translate_char (table
, c
)
222 if (! CHAR_TABLE_P (table
))
224 ch
= CHAR_TABLE_REF (table
, c
);
225 if (! CHARACTERP (ch
))
230 /* Convert the multibyte character C to unibyte 8-bit character based
231 on the current value of charset_unibyte. If dimension of
232 charset_unibyte is more than one, return (C & 0xFF).
234 The argument REV_TBL is now ignored. It will be removed in the
238 multibyte_char_to_unibyte (c
, rev_tbl
)
242 struct charset
*charset
;
245 if (CHAR_BYTE8_P (c
))
246 return CHAR_TO_BYTE8 (c
);
247 charset
= CHARSET_FROM_ID (charset_unibyte
);
248 c1
= ENCODE_CHAR (charset
, c
);
249 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
253 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
254 doc
: /* Return non-nil if OBJECT is a character. */)
256 Lisp_Object object
, ignore
;
258 return (CHARACTERP (object
) ? Qt
: Qnil
);
261 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
262 doc
: /* Return the character of the maximum code. */)
265 return make_number (MAX_CHAR
);
268 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
269 Sunibyte_char_to_multibyte
, 1, 1, 0,
270 doc
: /* Convert the unibyte character CH to multibyte character.
271 The multibyte character is a result of decoding CH by
272 the current unibyte charset (see `unibyte-charset'). */)
277 struct charset
*charset
;
279 CHECK_CHARACTER (ch
);
282 error ("Invalid unibyte character: %d", c
);
283 charset
= CHARSET_FROM_ID (charset_unibyte
);
284 c
= DECODE_CHAR (charset
, c
);
286 c
= BYTE8_TO_CHAR (XFASTINT (ch
));
287 return make_number (c
);
290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
291 Smultibyte_char_to_unibyte
, 1, 1, 0,
292 doc
: /* Convert the multibyte character CH to unibyte character.\n\
293 The unibyte character is a result of encoding CH by
294 the current primary charset (value of `charset-primary'). */)
300 CHECK_CHARACTER (ch
);
302 c
= CHAR_TO_BYTE8 (c
);
303 return make_number (c
);
306 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
307 doc
: /* Return 1 regardless of the argument CHAR.
308 This is now an obsolete function. We keep it just for backward compatibility. */)
312 CHECK_CHARACTER (ch
);
313 return make_number (1);
316 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
317 doc
: /* Return width of CHAR when displayed in the current buffer.
318 The width is measured by how many columns it occupies on the screen.
319 Tab is taken to occupy `tab-width' columns. */)
325 struct Lisp_Char_Table
*dp
= buffer_display_table ();
327 CHECK_CHARACTER (ch
);
330 /* Get the way the display table would display it. */
331 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
334 width
= ASIZE (disp
);
336 width
= CHAR_WIDTH (c
);
338 return make_number (width
);
341 /* Return width of string STR of length LEN when displayed in the
342 current buffer. The width is measured by how many columns it
343 occupies on the screen. If PRECISION > 0, return the width of
344 longest substring that doesn't exceed PRECISION, and set number of
345 characters and bytes of the substring in *NCHARS and *NBYTES
349 c_string_width (str
, len
, precision
, nchars
, nbytes
)
351 int precision
, *nchars
, *nbytes
;
353 int i
= 0, i_byte
= 0;
355 struct Lisp_Char_Table
*dp
= buffer_display_table ();
359 int bytes
, thiswidth
;
361 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
365 val
= DISP_CHAR_VECTOR (dp
, c
);
367 thiswidth
= XVECTOR (val
)->size
;
369 thiswidth
= CHAR_WIDTH (c
);
373 thiswidth
= CHAR_WIDTH (c
);
377 && (width
+ thiswidth
> precision
))
397 /* Return width of string STR of length LEN when displayed in the
398 current buffer. The width is measured by how many columns it
399 occupies on the screen. */
406 return c_string_width (str
, len
, -1, NULL
, NULL
);
409 /* Return width of Lisp string STRING when displayed in the current
410 buffer. The width is measured by how many columns it occupies on
411 the screen while paying attention to compositions. If PRECISION >
412 0, return the width of longest substring that doesn't exceed
413 PRECISION, and set number of characters and bytes of the substring
414 in *NCHARS and *NBYTES respectively. */
417 lisp_string_width (string
, precision
, nchars
, nbytes
)
419 int precision
, *nchars
, *nbytes
;
421 int len
= XSTRING (string
)->size
;
422 unsigned char *str
= XSTRING (string
)->data
;
423 int i
= 0, i_byte
= 0;
425 struct Lisp_Char_Table
*dp
= buffer_display_table ();
429 int chars
, bytes
, thiswidth
;
434 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
435 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
438 thiswidth
= composition_table
[cmp_id
]->width
;
440 bytes
= string_char_to_byte (string
, end
) - i_byte
;
444 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
447 val
= DISP_CHAR_VECTOR (dp
, c
);
449 thiswidth
= XVECTOR (val
)->size
;
451 thiswidth
= CHAR_WIDTH (c
);
455 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
458 thiswidth
= CHAR_WIDTH (c
);
462 && (width
+ thiswidth
> precision
))
482 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
483 doc
: /* Return width of STRING when displayed in the current buffer.
484 Width is measured by how many columns it occupies on the screen.
485 When calculating width of a multibyte character in STRING,
486 only the base leading-code is considered; the validity of
487 the following bytes is not checked. Tabs in STRING are always
488 taken to occupy `tab-width' columns. */)
495 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
499 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
500 doc
: /* Return the direction of CHAR.
501 The returned value is 0 for left-to-right and 1 for right-to-left. */)
507 CHECK_CHARACTER (ch
);
509 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
512 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
513 doc
: /* Return number of characters between BEG and END.
514 This is now an obsolete function. We keep it just for backward compatibility. */)
516 Lisp_Object beg
, end
;
520 CHECK_NUMBER_COERCE_MARKER (beg
);
521 CHECK_NUMBER_COERCE_MARKER (end
);
523 from
= min (XFASTINT (beg
), XFASTINT (end
));
524 to
= max (XFASTINT (beg
), XFASTINT (end
));
526 return make_number (to
- from
);
529 /* Return the number of characters in the NBYTES bytes at PTR.
530 This works by looking at the contents and checking for multibyte
531 sequences while assuming that there's no invalid sequence.
532 However, if the current buffer has enable-multibyte-characters =
533 nil, we treat each byte as a character. */
536 chars_in_text (ptr
, nbytes
)
540 /* current_buffer is null at early stages of Emacs initialization. */
541 if (current_buffer
== 0
542 || NILP (current_buffer
->enable_multibyte_characters
))
545 return multibyte_chars_in_text (ptr
, nbytes
);
548 /* Return the number of characters in the NBYTES bytes at PTR.
549 This works by looking at the contents and checking for multibyte
550 sequences while assuming that there's no invalid sequence. It
551 ignores enable-multibyte-characters. */
554 multibyte_chars_in_text (ptr
, nbytes
)
558 unsigned char *endp
= ptr
+ nbytes
;
563 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
574 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
575 characters and bytes in it, and store them in *NCHARS and *NBYTES
576 respectively. On counting bytes, pay attention to that 8-bit
577 characters not constructing a valid multibyte sequence are
578 represented by 2-byte in a multibyte text. */
581 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
583 int len
, *nchars
, *nbytes
;
585 unsigned char *endp
= str
+ len
;
586 int n
, chars
= 0, bytes
= 0;
588 if (len
>= MAX_MULTIBYTE_LENGTH
)
590 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
591 while (str
< adjusted_endp
)
593 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
594 str
+= n
, bytes
+= n
;
602 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
603 str
+= n
, bytes
+= n
;
614 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
615 It actually converts only such 8-bit characters that don't contruct
616 a multibyte sequence to multibyte forms of Latin-1 characters. If
617 NCHARS is nonzero, set *NCHARS to the number of characters in the
618 text. It is assured that we can use LEN bytes at STR as a work
619 area and that is enough. Return the number of bytes of the
623 str_as_multibyte (str
, len
, nbytes
, nchars
)
625 int len
, nbytes
, *nchars
;
627 unsigned char *p
= str
, *endp
= str
+ nbytes
;
632 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
634 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
635 while (p
< adjusted_endp
636 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
639 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
649 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
652 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
654 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
655 while (p
< adjusted_endp
)
657 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
665 c
= BYTE8_TO_CHAR (c
);
666 to
+= CHAR_STRING (c
, to
);
673 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
681 c
= BYTE8_TO_CHAR (c
);
682 to
+= CHAR_STRING (c
, to
);
691 /* Parse unibyte string at STR of LEN bytes, and return the number of
692 bytes it may ocupy when converted to multibyte string by
693 `str_to_multibyte'. */
696 parse_str_to_multibyte (str
, len
)
700 unsigned char *endp
= str
+ len
;
703 for (bytes
= 0; str
< endp
; str
++)
704 bytes
+= (*str
< 0x80) ? 1 : 2;
709 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
710 that contains the same single-byte characters. It actually
711 converts all 8-bit characters to multibyte forms. It is assured
712 that we can use LEN bytes at STR as a work area and that is
716 str_to_multibyte (str
, len
, bytes
)
720 unsigned char *p
= str
, *endp
= str
+ bytes
;
723 while (p
< endp
&& *p
< 0x80) p
++;
729 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
736 c
= BYTE8_TO_CHAR (c
);
737 to
+= CHAR_STRING (c
, to
);
742 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
743 actually converts characters in the range 0x80..0xFF to
747 str_as_unibyte (str
, bytes
)
751 const unsigned char *p
= str
, *endp
= str
+ bytes
;
758 len
= BYTES_BY_CHAR_HEAD (c
);
759 if (CHAR_BYTE8_HEAD_P (c
))
763 to
= str
+ (p
- str
);
767 len
= BYTES_BY_CHAR_HEAD (c
);
768 if (CHAR_BYTE8_HEAD_P (c
))
770 c
= STRING_CHAR_ADVANCE (p
);
771 *to
++ = CHAR_TO_BYTE8 (c
);
775 while (len
--) *to
++ = *p
++;
782 string_count_byte8 (string
)
785 int multibyte
= STRING_MULTIBYTE (string
);
786 int nbytes
= STRING_BYTES (XSTRING (string
));
787 unsigned char *p
= XSTRING (string
)->data
;
788 unsigned char *pend
= p
+ nbytes
;
796 len
= BYTES_BY_CHAR_HEAD (c
);
798 if (CHAR_BYTE8_HEAD_P (c
))
813 string_escape_byte8 (string
)
816 int nchars
= XSTRING (string
)->size
;
817 int nbytes
= STRING_BYTES (XSTRING (string
));
818 int multibyte
= STRING_MULTIBYTE (string
);
820 const unsigned char *src
, *src_end
;
825 if (multibyte
&& nchars
== nbytes
)
828 byte8_count
= string_count_byte8 (string
);
830 if (byte8_count
== 0)
834 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
835 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
836 nbytes
+ byte8_count
* 2);
838 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
839 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
841 src
= XSTRING (string
)->data
;
842 src_end
= src
+ nbytes
;
843 dst
= XSTRING (val
)->data
;
845 while (src
< src_end
)
848 len
= BYTES_BY_CHAR_HEAD (c
);
850 if (CHAR_BYTE8_HEAD_P (c
))
852 c
= STRING_CHAR_ADVANCE (src
);
853 c
= CHAR_TO_BYTE8 (c
);
854 sprintf ((char *) dst
, "\\%03o", c
);
858 while (len
--) *dst
++ = *src
++;
861 while (src
< src_end
)
866 sprintf ((char *) dst
, "\\%03o", c
);
876 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
878 Concatenate all the argument characters and make the result a string.
879 usage: (string &rest CHARACTERS) */)
885 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
886 unsigned char *p
= buf
;
889 for (i
= 0; i
< n
; i
++)
891 CHECK_CHARACTER (args
[i
]);
893 p
+= CHAR_STRING (c
, p
);
896 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
900 init_character_once ()
909 DEFSYM (Qcharacterp
, "characterp");
910 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
912 staticpro (&Vchar_unify_table
);
913 Vchar_unify_table
= Qnil
;
915 defsubr (&Smax_char
);
916 defsubr (&Scharacterp
);
917 defsubr (&Sunibyte_char_to_multibyte
);
918 defsubr (&Smultibyte_char_to_unibyte
);
919 defsubr (&Schar_bytes
);
920 defsubr (&Schar_width
);
921 defsubr (&Sstring_width
);
922 defsubr (&Schar_direction
);
923 defsubr (&Schars_in_region
);
926 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
928 Vector recording all translation tables ever defined.
929 Each element is a pair (SYMBOL . TABLE) relating the table to the
930 symbol naming it. The ID of a translation table is an index into this vector. */);
931 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
933 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
935 A char-table for characters which invoke auto-filling.
936 Such characters have value t in this table. */);
937 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
938 CHAR_TABLE_SET (Vauto_fill_chars
, ' ', Qt
);
939 CHAR_TABLE_SET (Vauto_fill_chars
, '\n', Qt
);
941 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
943 A char-table for width (columns) of each character. */);
944 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
945 char_table_set_range (Vchar_width_table
, 0x80, 0x9F, make_number (4));
946 char_table_set_range (Vchar_width_table
, MAX_5_BYTE_CHAR
+ 1, MAX_CHAR
,
949 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
950 doc
: /* A char-table for direction of each character. */);
951 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
953 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
954 doc
: /* A char-table for each printable character. */);
955 Vprintable_chars
= Fmake_char_table (Qnil
, Qnil
);
956 Fset_char_table_range (Vprintable_chars
,
957 Fcons (make_number (32), make_number (126)), Qt
);
958 Fset_char_table_range (Vprintable_chars
,
959 Fcons (make_number (160),
960 make_number (MAX_5_BYTE_CHAR
)), Qt
);
962 DEFVAR_LISP ("char-script-table", &Vchar_script_table
,
963 doc
: /* Char table of script symbols.
964 It has one extra slot whose value is a list of script symbols. */);
966 /* Intern this now in case it isn't already done.
967 Setting this variable twice is harmless.
968 But don't staticpro it here--that is done in alloc.c. */
969 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
970 DEFSYM (Qchar_script_table
, "char-script-table");
971 Fput (Qchar_script_table
, Qchar_table_extra_slots
, make_number (1));
972 Vchar_script_table
= Fmake_char_table (Qchar_script_table
, Qnil
);