]> code.delx.au - gnu-emacs/blob - src/character.c
*** empty log message ***
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 Lisp_Object Vchar_unify_table;
63
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars;
67
68 /* A char-table. An elemnent is a column-width of the corresponding
69 character. */
70 Lisp_Object Vchar_width_table;
71
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table;
75
76 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p;
78
79 /* Char table of scripts. */
80 Lisp_Object Vchar_script_table;
81
82 static Lisp_Object Qchar_script_table;
83
84 /* Mapping table from unibyte chars to multibyte chars. */
85 int unibyte_to_multibyte_table[256];
86
87 \f
88
89 int
90 char_string (c, p)
91 int c;
92 unsigned char *p;
93 {
94 int bytes;
95
96 if (c & CHAR_MODIFIER_MASK)
97 {
98 /* As a character not less than 256 can't have modifier bits, we
99 just ignore the bits. */
100 if (SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
101 {
102 /* For Meta, Shift, and Control modifiers, we need special care. */
103 if (c & CHAR_META)
104 {
105 /* Move the meta bit to the right place for a string. */
106 c = (c & ~CHAR_META) | 0x80;
107 }
108 if (c & CHAR_SHIFT)
109 {
110 /* Shift modifier is valid only with [A-Za-z]. */
111 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
112 c &= ~CHAR_SHIFT;
113 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
114 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
115 }
116 if (c & CHAR_CTL)
117 {
118 /* Simulate the code in lread.c. */
119 /* Allow `\C- ' and `\C-?'. */
120 if (c == (CHAR_CTL | ' '))
121 c = 0;
122 else if (c == (CHAR_CTL | '?'))
123 c = 127;
124 /* ASCII control chars are made from letters (both cases),
125 as well as the non-letters within 0100...0137. */
126 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
127 c &= (037 | (~0177 & ~CHAR_CTL));
128 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
129 c &= (037 | (~0177 & ~CHAR_CTL));
130 }
131 }
132
133 /* If C still has any modifier bits, just ignore it. */
134 c &= ~CHAR_MODIFIER_MASK;
135 }
136
137 MAYBE_UNIFY_CHAR (c);
138
139 if (c <= MAX_3_BYTE_CHAR)
140 {
141 bytes = CHAR_STRING (c, p);
142 }
143 else if (c <= MAX_4_BYTE_CHAR)
144 {
145 p[0] = (0xF0 | (c >> 18));
146 p[1] = (0x80 | ((c >> 12) & 0x3F));
147 p[2] = (0x80 | ((c >> 6) & 0x3F));
148 p[3] = (0x80 | (c & 0x3F));
149 bytes = 4;
150 }
151 else if (c <= MAX_5_BYTE_CHAR)
152 {
153 p[0] = 0xF8;
154 p[1] = (0x80 | ((c >> 18) & 0x0F));
155 p[2] = (0x80 | ((c >> 12) & 0x3F));
156 p[3] = (0x80 | ((c >> 6) & 0x3F));
157 p[4] = (0x80 | (c & 0x3F));
158 bytes = 5;
159 }
160 else
161 {
162 c = CHAR_TO_BYTE8 (c);
163 bytes = BYTE8_STRING (c, p);
164 }
165
166 return bytes;
167 }
168
169
170 int
171 string_char (p, advanced, len)
172 const unsigned char *p;
173 const unsigned char **advanced;
174 int *len;
175 {
176 int c;
177 const unsigned char *saved_p = p;
178
179 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
180 {
181 c = STRING_CHAR_ADVANCE (p);
182 }
183 else if (! (*p & 0x08))
184 {
185 c = ((((p)[0] & 0xF) << 18)
186 | (((p)[1] & 0x3F) << 12)
187 | (((p)[2] & 0x3F) << 6)
188 | ((p)[3] & 0x3F));
189 p += 4;
190 }
191 else
192 {
193 c = ((((p)[1] & 0x3F) << 18)
194 | (((p)[2] & 0x3F) << 12)
195 | (((p)[3] & 0x3F) << 6)
196 | ((p)[4] & 0x3F));
197 p += 5;
198 }
199
200 MAYBE_UNIFY_CHAR (c);
201
202 if (len)
203 *len = p - saved_p;
204 if (advanced)
205 *advanced = p;
206 return c;
207 }
208
209
210 /* Translate character C by translation table TABLE. If C is
211 negative, translate a character specified by CHARSET and CODE. If
212 no translation is found in TABLE, return the untranslated
213 character. */
214
215 int
216 translate_char (table, c)
217 Lisp_Object table;
218 int c;
219 {
220 Lisp_Object ch;
221
222 if (! CHAR_TABLE_P (table))
223 return c;
224 ch = CHAR_TABLE_REF (table, c);
225 if (! CHARACTERP (ch))
226 return c;
227 return XINT (ch);
228 }
229
230 /* Convert the multibyte character C to unibyte 8-bit character based
231 on the current value of charset_unibyte. If dimension of
232 charset_unibyte is more than one, return (C & 0xFF).
233
234 The argument REV_TBL is now ignored. It will be removed in the
235 future. */
236
237 int
238 multibyte_char_to_unibyte (c, rev_tbl)
239 int c;
240 Lisp_Object rev_tbl;
241 {
242 struct charset *charset;
243 unsigned c1;
244
245 if (CHAR_BYTE8_P (c))
246 return CHAR_TO_BYTE8 (c);
247 charset = CHARSET_FROM_ID (charset_unibyte);
248 c1 = ENCODE_CHAR (charset, c);
249 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
250 }
251
252
253 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
254 doc: /* Return non-nil if OBJECT is a character. */)
255 (object, ignore)
256 Lisp_Object object, ignore;
257 {
258 return (CHARACTERP (object) ? Qt : Qnil);
259 }
260
261 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
262 doc: /* Return the character of the maximum code. */)
263 ()
264 {
265 return make_number (MAX_CHAR);
266 }
267
268 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
269 Sunibyte_char_to_multibyte, 1, 1, 0,
270 doc: /* Convert the unibyte character CH to multibyte character.
271 The multibyte character is a result of decoding CH by
272 the current unibyte charset (see `unibyte-charset'). */)
273 (ch)
274 Lisp_Object ch;
275 {
276 int c;
277 struct charset *charset;
278
279 CHECK_CHARACTER (ch);
280 c = XFASTINT (ch);
281 if (c >= 0400)
282 error ("Invalid unibyte character: %d", c);
283 charset = CHARSET_FROM_ID (charset_unibyte);
284 c = DECODE_CHAR (charset, c);
285 if (c < 0)
286 c = BYTE8_TO_CHAR (XFASTINT (ch));
287 return make_number (c);
288 }
289
290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
291 Smultibyte_char_to_unibyte, 1, 1, 0,
292 doc: /* Convert the multibyte character CH to unibyte character.\n\
293 The unibyte character is a result of encoding CH by
294 the current primary charset (value of `charset-primary'). */)
295 (ch)
296 Lisp_Object ch;
297 {
298 int c;
299
300 CHECK_CHARACTER (ch);
301 c = XFASTINT (ch);
302 c = CHAR_TO_BYTE8 (c);
303 return make_number (c);
304 }
305
306 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
307 doc: /* Return 1 regardless of the argument CHAR.
308 This is now an obsolete function. We keep it just for backward compatibility. */)
309 (ch)
310 Lisp_Object ch;
311 {
312 CHECK_CHARACTER (ch);
313 return make_number (1);
314 }
315
316 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
317 doc: /* Return width of CHAR when displayed in the current buffer.
318 The width is measured by how many columns it occupies on the screen.
319 Tab is taken to occupy `tab-width' columns. */)
320 (ch)
321 Lisp_Object ch;
322 {
323 Lisp_Object disp;
324 int c, width;
325 struct Lisp_Char_Table *dp = buffer_display_table ();
326
327 CHECK_CHARACTER (ch);
328 c = XINT (ch);
329
330 /* Get the way the display table would display it. */
331 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
332
333 if (VECTORP (disp))
334 width = ASIZE (disp);
335 else
336 width = CHAR_WIDTH (c);
337
338 return make_number (width);
339 }
340
341 /* Return width of string STR of length LEN when displayed in the
342 current buffer. The width is measured by how many columns it
343 occupies on the screen. If PRECISION > 0, return the width of
344 longest substring that doesn't exceed PRECISION, and set number of
345 characters and bytes of the substring in *NCHARS and *NBYTES
346 respectively. */
347
348 int
349 c_string_width (str, len, precision, nchars, nbytes)
350 unsigned char *str;
351 int precision, *nchars, *nbytes;
352 {
353 int i = 0, i_byte = 0;
354 int width = 0;
355 struct Lisp_Char_Table *dp = buffer_display_table ();
356
357 while (i_byte < len)
358 {
359 int bytes, thiswidth;
360 Lisp_Object val;
361 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
362
363 if (dp)
364 {
365 val = DISP_CHAR_VECTOR (dp, c);
366 if (VECTORP (val))
367 thiswidth = XVECTOR (val)->size;
368 else
369 thiswidth = CHAR_WIDTH (c);
370 }
371 else
372 {
373 thiswidth = CHAR_WIDTH (c);
374 }
375
376 if (precision > 0
377 && (width + thiswidth > precision))
378 {
379 *nchars = i;
380 *nbytes = i_byte;
381 return width;
382 }
383 i++;
384 i_byte += bytes;
385 width += thiswidth;
386 }
387
388 if (precision > 0)
389 {
390 *nchars = i;
391 *nbytes = i_byte;
392 }
393
394 return width;
395 }
396
397 /* Return width of string STR of length LEN when displayed in the
398 current buffer. The width is measured by how many columns it
399 occupies on the screen. */
400
401 int
402 strwidth (str, len)
403 unsigned char *str;
404 int len;
405 {
406 return c_string_width (str, len, -1, NULL, NULL);
407 }
408
409 /* Return width of Lisp string STRING when displayed in the current
410 buffer. The width is measured by how many columns it occupies on
411 the screen while paying attention to compositions. If PRECISION >
412 0, return the width of longest substring that doesn't exceed
413 PRECISION, and set number of characters and bytes of the substring
414 in *NCHARS and *NBYTES respectively. */
415
416 int
417 lisp_string_width (string, precision, nchars, nbytes)
418 Lisp_Object string;
419 int precision, *nchars, *nbytes;
420 {
421 int len = XSTRING (string)->size;
422 unsigned char *str = XSTRING (string)->data;
423 int i = 0, i_byte = 0;
424 int width = 0;
425 struct Lisp_Char_Table *dp = buffer_display_table ();
426
427 while (i < len)
428 {
429 int chars, bytes, thiswidth;
430 Lisp_Object val;
431 int cmp_id;
432 int ignore, end;
433
434 if (find_composition (i, -1, &ignore, &end, &val, string)
435 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
436 >= 0))
437 {
438 thiswidth = composition_table[cmp_id]->width;
439 chars = end - i;
440 bytes = string_char_to_byte (string, end) - i_byte;
441 }
442 else if (dp)
443 {
444 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
445
446 chars = 1;
447 val = DISP_CHAR_VECTOR (dp, c);
448 if (VECTORP (val))
449 thiswidth = XVECTOR (val)->size;
450 else
451 thiswidth = CHAR_WIDTH (c);
452 }
453 else
454 {
455 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
456
457 chars = 1;
458 thiswidth = CHAR_WIDTH (c);
459 }
460
461 if (precision > 0
462 && (width + thiswidth > precision))
463 {
464 *nchars = i;
465 *nbytes = i_byte;
466 return width;
467 }
468 i += chars;
469 i_byte += bytes;
470 width += thiswidth;
471 }
472
473 if (precision > 0)
474 {
475 *nchars = i;
476 *nbytes = i_byte;
477 }
478
479 return width;
480 }
481
482 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
483 doc: /* Return width of STRING when displayed in the current buffer.
484 Width is measured by how many columns it occupies on the screen.
485 When calculating width of a multibyte character in STRING,
486 only the base leading-code is considered; the validity of
487 the following bytes is not checked. Tabs in STRING are always
488 taken to occupy `tab-width' columns. */)
489 (str)
490 Lisp_Object str;
491 {
492 Lisp_Object val;
493
494 CHECK_STRING (str);
495 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
496 return val;
497 }
498
499 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
500 doc: /* Return the direction of CHAR.
501 The returned value is 0 for left-to-right and 1 for right-to-left. */)
502 (ch)
503 Lisp_Object ch;
504 {
505 int c;
506
507 CHECK_CHARACTER (ch);
508 c = XINT (ch);
509 return CHAR_TABLE_REF (Vchar_direction_table, c);
510 }
511
512 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
513 doc: /* Return number of characters between BEG and END.
514 This is now an obsolete function. We keep it just for backward compatibility. */)
515 (beg, end)
516 Lisp_Object beg, end;
517 {
518 int from, to;
519
520 CHECK_NUMBER_COERCE_MARKER (beg);
521 CHECK_NUMBER_COERCE_MARKER (end);
522
523 from = min (XFASTINT (beg), XFASTINT (end));
524 to = max (XFASTINT (beg), XFASTINT (end));
525
526 return make_number (to - from);
527 }
528
529 /* Return the number of characters in the NBYTES bytes at PTR.
530 This works by looking at the contents and checking for multibyte
531 sequences while assuming that there's no invalid sequence.
532 However, if the current buffer has enable-multibyte-characters =
533 nil, we treat each byte as a character. */
534
535 int
536 chars_in_text (ptr, nbytes)
537 unsigned char *ptr;
538 int nbytes;
539 {
540 /* current_buffer is null at early stages of Emacs initialization. */
541 if (current_buffer == 0
542 || NILP (current_buffer->enable_multibyte_characters))
543 return nbytes;
544
545 return multibyte_chars_in_text (ptr, nbytes);
546 }
547
548 /* Return the number of characters in the NBYTES bytes at PTR.
549 This works by looking at the contents and checking for multibyte
550 sequences while assuming that there's no invalid sequence. It
551 ignores enable-multibyte-characters. */
552
553 int
554 multibyte_chars_in_text (ptr, nbytes)
555 unsigned char *ptr;
556 int nbytes;
557 {
558 unsigned char *endp = ptr + nbytes;
559 int chars = 0;
560
561 while (ptr < endp)
562 {
563 int len = MULTIBYTE_LENGTH (ptr, endp);
564
565 if (len == 0)
566 abort ();
567 ptr += len;
568 chars++;
569 }
570
571 return chars;
572 }
573
574 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
575 characters and bytes in it, and store them in *NCHARS and *NBYTES
576 respectively. On counting bytes, pay attention to that 8-bit
577 characters not constructing a valid multibyte sequence are
578 represented by 2-byte in a multibyte text. */
579
580 void
581 parse_str_as_multibyte (str, len, nchars, nbytes)
582 unsigned char *str;
583 int len, *nchars, *nbytes;
584 {
585 unsigned char *endp = str + len;
586 int n, chars = 0, bytes = 0;
587
588 if (len >= MAX_MULTIBYTE_LENGTH)
589 {
590 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
591 while (str < adjusted_endp)
592 {
593 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
594 str += n, bytes += n;
595 else
596 str++, bytes += 2;
597 chars++;
598 }
599 }
600 while (str < endp)
601 {
602 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
603 str += n, bytes += n;
604 else
605 str++, bytes += 2;
606 chars++;
607 }
608
609 *nchars = chars;
610 *nbytes = bytes;
611 return;
612 }
613
614 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
615 It actually converts only such 8-bit characters that don't contruct
616 a multibyte sequence to multibyte forms of Latin-1 characters. If
617 NCHARS is nonzero, set *NCHARS to the number of characters in the
618 text. It is assured that we can use LEN bytes at STR as a work
619 area and that is enough. Return the number of bytes of the
620 resulting text. */
621
622 int
623 str_as_multibyte (str, len, nbytes, nchars)
624 unsigned char *str;
625 int len, nbytes, *nchars;
626 {
627 unsigned char *p = str, *endp = str + nbytes;
628 unsigned char *to;
629 int chars = 0;
630 int n;
631
632 if (nbytes >= MAX_MULTIBYTE_LENGTH)
633 {
634 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
635 while (p < adjusted_endp
636 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
637 p += n, chars++;
638 }
639 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
640 p += n, chars++;
641 if (nchars)
642 *nchars = chars;
643 if (p == endp)
644 return nbytes;
645
646 to = p;
647 nbytes = endp - p;
648 endp = str + len;
649 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
650 p = endp - nbytes;
651
652 if (nbytes >= MAX_MULTIBYTE_LENGTH)
653 {
654 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
655 while (p < adjusted_endp)
656 {
657 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
658 {
659 while (n--)
660 *to++ = *p++;
661 }
662 else
663 {
664 int c = *p++;
665 c = BYTE8_TO_CHAR (c);
666 to += CHAR_STRING (c, to);
667 }
668 }
669 chars++;
670 }
671 while (p < endp)
672 {
673 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
674 {
675 while (n--)
676 *to++ = *p++;
677 }
678 else
679 {
680 int c = *p++;
681 c = BYTE8_TO_CHAR (c);
682 to += CHAR_STRING (c, to);
683 }
684 chars++;
685 }
686 if (nchars)
687 *nchars = chars;
688 return (to - str);
689 }
690
691 /* Parse unibyte string at STR of LEN bytes, and return the number of
692 bytes it may ocupy when converted to multibyte string by
693 `str_to_multibyte'. */
694
695 int
696 parse_str_to_multibyte (str, len)
697 unsigned char *str;
698 int len;
699 {
700 unsigned char *endp = str + len;
701 int bytes;
702
703 for (bytes = 0; str < endp; str++)
704 bytes += (*str < 0x80) ? 1 : 2;
705 return bytes;
706 }
707
708
709 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
710 that contains the same single-byte characters. It actually
711 converts all 8-bit characters to multibyte forms. It is assured
712 that we can use LEN bytes at STR as a work area and that is
713 enough. */
714
715 int
716 str_to_multibyte (str, len, bytes)
717 unsigned char *str;
718 int len, bytes;
719 {
720 unsigned char *p = str, *endp = str + bytes;
721 unsigned char *to;
722
723 while (p < endp && *p < 0x80) p++;
724 if (p == endp)
725 return bytes;
726 to = p;
727 bytes = endp - p;
728 endp = str + len;
729 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
730 p = endp - bytes;
731 while (p < endp)
732 {
733 int c = *p++;
734
735 if (c >= 0x80)
736 c = BYTE8_TO_CHAR (c);
737 to += CHAR_STRING (c, to);
738 }
739 return (to - str);
740 }
741
742 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
743 actually converts characters in the range 0x80..0xFF to
744 unibyte. */
745
746 int
747 str_as_unibyte (str, bytes)
748 unsigned char *str;
749 int bytes;
750 {
751 const unsigned char *p = str, *endp = str + bytes;
752 unsigned char *to;
753 int c, len;
754
755 while (p < endp)
756 {
757 c = *p;
758 len = BYTES_BY_CHAR_HEAD (c);
759 if (CHAR_BYTE8_HEAD_P (c))
760 break;
761 p += len;
762 }
763 to = str + (p - str);
764 while (p < endp)
765 {
766 c = *p;
767 len = BYTES_BY_CHAR_HEAD (c);
768 if (CHAR_BYTE8_HEAD_P (c))
769 {
770 c = STRING_CHAR_ADVANCE (p);
771 *to++ = CHAR_TO_BYTE8 (c);
772 }
773 else
774 {
775 while (len--) *to++ = *p++;
776 }
777 }
778 return (to - str);
779 }
780
781 int
782 string_count_byte8 (string)
783 Lisp_Object string;
784 {
785 int multibyte = STRING_MULTIBYTE (string);
786 int nbytes = STRING_BYTES (XSTRING (string));
787 unsigned char *p = XSTRING (string)->data;
788 unsigned char *pend = p + nbytes;
789 int count = 0;
790 int c, len;
791
792 if (multibyte)
793 while (p < pend)
794 {
795 c = *p;
796 len = BYTES_BY_CHAR_HEAD (c);
797
798 if (CHAR_BYTE8_HEAD_P (c))
799 count++;
800 p += len;
801 }
802 else
803 while (p < pend)
804 {
805 if (*p++ >= 0x80)
806 count++;
807 }
808 return count;
809 }
810
811
812 Lisp_Object
813 string_escape_byte8 (string)
814 Lisp_Object string;
815 {
816 int nchars = XSTRING (string)->size;
817 int nbytes = STRING_BYTES (XSTRING (string));
818 int multibyte = STRING_MULTIBYTE (string);
819 int byte8_count;
820 const unsigned char *src, *src_end;
821 unsigned char *dst;
822 Lisp_Object val;
823 int c, len;
824
825 if (multibyte && nchars == nbytes)
826 return string;
827
828 byte8_count = string_count_byte8 (string);
829
830 if (byte8_count == 0)
831 return string;
832
833 if (multibyte)
834 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
835 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
836 nbytes + byte8_count * 2);
837 else
838 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
839 val = make_uninit_string (nbytes + byte8_count * 3);
840
841 src = XSTRING (string)->data;
842 src_end = src + nbytes;
843 dst = XSTRING (val)->data;
844 if (multibyte)
845 while (src < src_end)
846 {
847 c = *src;
848 len = BYTES_BY_CHAR_HEAD (c);
849
850 if (CHAR_BYTE8_HEAD_P (c))
851 {
852 c = STRING_CHAR_ADVANCE (src);
853 c = CHAR_TO_BYTE8 (c);
854 sprintf ((char *) dst, "\\%03o", c);
855 dst += 4;
856 }
857 else
858 while (len--) *dst++ = *src++;
859 }
860 else
861 while (src < src_end)
862 {
863 c = *src++;
864 if (c >= 0x80)
865 {
866 sprintf ((char *) dst, "\\%03o", c);
867 dst += 4;
868 }
869 else
870 *dst++ = c;
871 }
872 return val;
873 }
874
875 \f
876 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
877 doc: /*
878 Concatenate all the argument characters and make the result a string.
879 usage: (string &rest CHARACTERS) */)
880 (n, args)
881 int n;
882 Lisp_Object *args;
883 {
884 int i;
885 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
886 unsigned char *p = buf;
887 int c;
888
889 for (i = 0; i < n; i++)
890 {
891 CHECK_CHARACTER (args[i]);
892 c = XINT (args[i]);
893 p += CHAR_STRING (c, p);
894 }
895
896 return make_string_from_bytes ((char *) buf, n, p - buf);
897 }
898
899 void
900 init_character_once ()
901 {
902 }
903
904 #ifdef emacs
905
906 void
907 syms_of_character ()
908 {
909 DEFSYM (Qcharacterp, "characterp");
910 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
911
912 staticpro (&Vchar_unify_table);
913 Vchar_unify_table = Qnil;
914
915 defsubr (&Smax_char);
916 defsubr (&Scharacterp);
917 defsubr (&Sunibyte_char_to_multibyte);
918 defsubr (&Smultibyte_char_to_unibyte);
919 defsubr (&Schar_bytes);
920 defsubr (&Schar_width);
921 defsubr (&Sstring_width);
922 defsubr (&Schar_direction);
923 defsubr (&Schars_in_region);
924 defsubr (&Sstring);
925
926 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
927 doc: /*
928 Vector recording all translation tables ever defined.
929 Each element is a pair (SYMBOL . TABLE) relating the table to the
930 symbol naming it. The ID of a translation table is an index into this vector. */);
931 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
932
933 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
934 doc: /*
935 A char-table for characters which invoke auto-filling.
936 Such characters have value t in this table. */);
937 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
938 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
939 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
940
941 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
942 doc: /*
943 A char-table for width (columns) of each character. */);
944 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
945 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
946 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
947 make_number (4));
948
949 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
950 doc: /* A char-table for direction of each character. */);
951 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
952
953 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
954 doc: /* A char-table for each printable character. */);
955 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
956 Fset_char_table_range (Vprintable_chars,
957 Fcons (make_number (32), make_number (126)), Qt);
958 Fset_char_table_range (Vprintable_chars,
959 Fcons (make_number (160),
960 make_number (MAX_5_BYTE_CHAR)), Qt);
961
962 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
963 doc: /* Char table of script symbols.
964 It has one extra slot whose value is a list of script symbols. */);
965
966 /* Intern this now in case it isn't already done.
967 Setting this variable twice is harmless.
968 But don't staticpro it here--that is done in alloc.c. */
969 Qchar_table_extra_slots = intern ("char-table-extra-slots");
970 DEFSYM (Qchar_script_table, "char-script-table");
971 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
972 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
973 }
974
975 #endif /* emacs */