]> code.delx.au - gnu-emacs/blob - src/charset.c
Merge branch 'master' of git.savannah.gnu.org:/srv/git/emacs
[gnu-emacs] / src / charset.c
1 /* Basic character set support.
2
3 Copyright (C) 2001-2016 Free Software Foundation, Inc.
4
5 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H14PRO021
9
10 Copyright (C) 2003, 2004
11 National Institute of Advanced Industrial Science and Technology (AIST)
12 Registration Number H13PRO009
13
14 This file is part of GNU Emacs.
15
16 GNU Emacs is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or (at
19 your option) any later version.
20
21 GNU Emacs is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
28
29 #include <config.h>
30
31 #include <errno.h>
32 #include <stdio.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <sys/types.h>
36 #include <c-ctype.h>
37 #include "lisp.h"
38 #include "character.h"
39 #include "charset.h"
40 #include "coding.h"
41 #include "buffer.h"
42
43 /*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***
44
45 A coded character set ("charset" hereafter) is a meaningful
46 collection (i.e. language, culture, functionality, etc.) of
47 characters. Emacs handles multiple charsets at once. In Emacs Lisp
48 code, a charset is represented by a symbol. In C code, a charset is
49 represented by its ID number or by a pointer to a struct charset.
50
51 The actual information about each charset is stored in two places.
52 Lispy information is stored in the hash table Vcharset_hash_table as
53 a vector (charset attributes). The other information is stored in
54 charset_table as a struct charset.
55
56 */
57
58 /* Hash table that contains attributes of each charset. Keys are
59 charset symbols, and values are vectors of charset attributes. */
60 Lisp_Object Vcharset_hash_table;
61
62 /* Table of struct charset. */
63 struct charset *charset_table;
64
65 static ptrdiff_t charset_table_size;
66 static int charset_table_used;
67
68 /* Special charsets corresponding to symbols. */
69 int charset_ascii;
70 int charset_eight_bit;
71 static int charset_iso_8859_1;
72 int charset_unicode;
73 static int charset_emacs;
74
75 /* The other special charsets. */
76 int charset_jisx0201_roman;
77 int charset_jisx0208_1978;
78 int charset_jisx0208;
79 int charset_ksc5601;
80
81 /* Charset of unibyte characters. */
82 int charset_unibyte;
83
84 /* List of charsets ordered by the priority. */
85 Lisp_Object Vcharset_ordered_list;
86
87 /* Sub-list of Vcharset_ordered_list that contains all non-preferred
88 charsets. */
89 Lisp_Object Vcharset_non_preferred_head;
90
91 /* Incremented every time we change the priority of charsets.
92 Wraps around. */
93 EMACS_UINT charset_ordered_list_tick;
94
95 /* List of iso-2022 charsets. */
96 Lisp_Object Viso_2022_charset_list;
97
98 /* List of emacs-mule charsets. */
99 Lisp_Object Vemacs_mule_charset_list;
100
101 int emacs_mule_charset[256];
102
103 /* Mapping table from ISO2022's charset (specified by DIMENSION,
104 CHARS, and FINAL-CHAR) to Emacs' charset. */
105 int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
106
107 #define CODE_POINT_TO_INDEX(charset, code) \
108 ((charset)->code_linear_p \
109 ? (int) ((code) - (charset)->min_code) \
110 : (((charset)->code_space_mask[(code) >> 24] & 0x8) \
111 && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4) \
112 && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2) \
113 && ((charset)->code_space_mask[(code) & 0xFF] & 0x1)) \
114 ? (int) (((((code) >> 24) - (charset)->code_space[12]) \
115 * (charset)->code_space[11]) \
116 + (((((code) >> 16) & 0xFF) - (charset)->code_space[8]) \
117 * (charset)->code_space[7]) \
118 + (((((code) >> 8) & 0xFF) - (charset)->code_space[4]) \
119 * (charset)->code_space[3]) \
120 + (((code) & 0xFF) - (charset)->code_space[0]) \
121 - ((charset)->char_index_offset)) \
122 : -1)
123
124
125 /* Return the code-point for the character index IDX in CHARSET.
126 IDX should be an unsigned int variable in a valid range (which is
127 always in nonnegative int range too). IDX contains garbage afterwards. */
128
129 #define INDEX_TO_CODE_POINT(charset, idx) \
130 ((charset)->code_linear_p \
131 ? (idx) + (charset)->min_code \
132 : (idx += (charset)->char_index_offset, \
133 (((charset)->code_space[0] + (idx) % (charset)->code_space[2]) \
134 | (((charset)->code_space[4] \
135 + ((idx) / (charset)->code_space[3] % (charset)->code_space[6])) \
136 << 8) \
137 | (((charset)->code_space[8] \
138 + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
139 << 16) \
140 | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11])) \
141 << 24))))
142
143 /* Structure to hold mapping tables for a charset. Used by temacs
144 invoked for dumping. */
145
146 static struct
147 {
148 /* The current charset for which the following tables are setup. */
149 struct charset *current;
150
151 /* 1 iff the following table is used for encoder. */
152 short for_encoder;
153
154 /* When the following table is used for encoding, minimum and
155 maximum character of the current charset. */
156 int min_char, max_char;
157
158 /* A Unicode character corresponding to the code index 0 (i.e. the
159 minimum code-point) of the current charset, or -1 if the code
160 index 0 is not a Unicode character. This is checked when
161 table.encoder[CHAR] is zero. */
162 int zero_index_char;
163
164 union {
165 /* Table mapping code-indices (not code-points) of the current
166 charset to Unicode characters. If decoder[CHAR] is -1, CHAR
167 doesn't belong to the current charset. */
168 int decoder[0x10000];
169 /* Table mapping Unicode characters to code-indices of the current
170 charset. The first 0x10000 elements are for BMP (0..0xFFFF),
171 and the last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
172 (0x20000..0x2FFFF). Note that there is no charset map that
173 uses both SMP and SIP. */
174 unsigned short encoder[0x20000];
175 } table;
176 } *temp_charset_work;
177
178 #define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE) \
179 do { \
180 if ((CODE) == 0) \
181 temp_charset_work->zero_index_char = (C); \
182 else if ((C) < 0x20000) \
183 temp_charset_work->table.encoder[(C)] = (CODE); \
184 else \
185 temp_charset_work->table.encoder[(C) - 0x10000] = (CODE); \
186 } while (0)
187
188 #define GET_TEMP_CHARSET_WORK_ENCODER(C) \
189 ((C) == temp_charset_work->zero_index_char ? 0 \
190 : (C) < 0x20000 ? (temp_charset_work->table.encoder[(C)] \
191 ? (int) temp_charset_work->table.encoder[(C)] : -1) \
192 : temp_charset_work->table.encoder[(C) - 0x10000] \
193 ? temp_charset_work->table.encoder[(C) - 0x10000] : -1)
194
195 #define SET_TEMP_CHARSET_WORK_DECODER(C, CODE) \
196 (temp_charset_work->table.decoder[(CODE)] = (C))
197
198 #define GET_TEMP_CHARSET_WORK_DECODER(CODE) \
199 (temp_charset_work->table.decoder[(CODE)])
200 \f
201
202 /* Set to 1 to warn that a charset map is loaded and thus a buffer
203 text and a string data may be relocated. */
204 bool charset_map_loaded;
205
206 struct charset_map_entries
207 {
208 struct {
209 unsigned from, to;
210 int c;
211 } entry[0x10000];
212 struct charset_map_entries *next;
213 };
214
215 /* Load the mapping information of CHARSET from ENTRIES for
216 initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
217 encoding (CONTROL_FLAG == 2).
218
219 If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
220 and CHARSET->fast_map.
221
222 If CONTROL_FLAG is 1, setup the following tables according to
223 CHARSET->method and inhibit_load_charset_map.
224
225 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
226 ----------------------+--------------------+---------------------------
227 CHARSET_METHOD_MAP | CHARSET->decoder | temp_charset_work->decoder
228 ----------------------+--------------------+---------------------------
229 CHARSET_METHOD_OFFSET | Vchar_unify_table | temp_charset_work->decoder
230
231 If CONTROL_FLAG is 2, setup the following tables.
232
233 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
234 ----------------------+--------------------+---------------------------
235 CHARSET_METHOD_MAP | CHARSET->encoder | temp_charset_work->encoder
236 ----------------------+--------------------+--------------------------
237 CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
238 */
239
240 static void
241 load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
242 {
243 Lisp_Object vec;
244 Lisp_Object table UNINIT;
245 unsigned max_code = CHARSET_MAX_CODE (charset);
246 bool ascii_compatible_p = charset->ascii_compatible_p;
247 int min_char, max_char, nonascii_min_char;
248 int i;
249 unsigned char *fast_map = charset->fast_map;
250
251 if (n_entries <= 0)
252 return;
253
254 if (control_flag)
255 {
256 if (! inhibit_load_charset_map)
257 {
258 if (control_flag == 1)
259 {
260 if (charset->method == CHARSET_METHOD_MAP)
261 {
262 int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
263
264 vec = Fmake_vector (make_number (n), make_number (-1));
265 set_charset_attr (charset, charset_decoder, vec);
266 }
267 else
268 {
269 char_table_set_range (Vchar_unify_table,
270 charset->min_char, charset->max_char,
271 Qnil);
272 }
273 }
274 else
275 {
276 table = Fmake_char_table (Qnil, Qnil);
277 set_charset_attr (charset,
278 (charset->method == CHARSET_METHOD_MAP
279 ? charset_encoder : charset_deunifier),
280 table);
281 }
282 }
283 else
284 {
285 if (! temp_charset_work)
286 temp_charset_work = xmalloc (sizeof *temp_charset_work);
287 if (control_flag == 1)
288 {
289 memset (temp_charset_work->table.decoder, -1,
290 sizeof (int) * 0x10000);
291 }
292 else
293 {
294 memset (temp_charset_work->table.encoder, 0,
295 sizeof (unsigned short) * 0x20000);
296 temp_charset_work->zero_index_char = -1;
297 }
298 temp_charset_work->current = charset;
299 temp_charset_work->for_encoder = (control_flag == 2);
300 control_flag += 2;
301 }
302 charset_map_loaded = 1;
303 }
304
305 min_char = max_char = entries->entry[0].c;
306 nonascii_min_char = MAX_CHAR;
307 for (i = 0; i < n_entries; i++)
308 {
309 unsigned from, to;
310 int from_index, to_index, lim_index;
311 int from_c, to_c;
312 int idx = i % 0x10000;
313
314 if (i > 0 && idx == 0)
315 entries = entries->next;
316 from = entries->entry[idx].from;
317 to = entries->entry[idx].to;
318 from_c = entries->entry[idx].c;
319 from_index = CODE_POINT_TO_INDEX (charset, from);
320 if (from == to)
321 {
322 to_index = from_index;
323 to_c = from_c;
324 }
325 else
326 {
327 to_index = CODE_POINT_TO_INDEX (charset, to);
328 to_c = from_c + (to_index - from_index);
329 }
330 if (from_index < 0 || to_index < 0)
331 continue;
332 lim_index = to_index + 1;
333
334 if (to_c > max_char)
335 max_char = to_c;
336 else if (from_c < min_char)
337 min_char = from_c;
338
339 if (control_flag == 1)
340 {
341 if (charset->method == CHARSET_METHOD_MAP)
342 for (; from_index < lim_index; from_index++, from_c++)
343 ASET (vec, from_index, make_number (from_c));
344 else
345 for (; from_index < lim_index; from_index++, from_c++)
346 CHAR_TABLE_SET (Vchar_unify_table,
347 CHARSET_CODE_OFFSET (charset) + from_index,
348 make_number (from_c));
349 }
350 else if (control_flag == 2)
351 {
352 if (charset->method == CHARSET_METHOD_MAP
353 && CHARSET_COMPACT_CODES_P (charset))
354 for (; from_index < lim_index; from_index++, from_c++)
355 {
356 unsigned code = from_index;
357 code = INDEX_TO_CODE_POINT (charset, code);
358
359 if (NILP (CHAR_TABLE_REF (table, from_c)))
360 CHAR_TABLE_SET (table, from_c, make_number (code));
361 }
362 else
363 for (; from_index < lim_index; from_index++, from_c++)
364 {
365 if (NILP (CHAR_TABLE_REF (table, from_c)))
366 CHAR_TABLE_SET (table, from_c, make_number (from_index));
367 }
368 }
369 else if (control_flag == 3)
370 for (; from_index < lim_index; from_index++, from_c++)
371 SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
372 else if (control_flag == 4)
373 for (; from_index < lim_index; from_index++, from_c++)
374 SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
375 else /* control_flag == 0 */
376 {
377 if (ascii_compatible_p)
378 {
379 if (! ASCII_CHAR_P (from_c))
380 {
381 if (from_c < nonascii_min_char)
382 nonascii_min_char = from_c;
383 }
384 else if (! ASCII_CHAR_P (to_c))
385 {
386 nonascii_min_char = 0x80;
387 }
388 }
389
390 for (; from_c <= to_c; from_c++)
391 CHARSET_FAST_MAP_SET (from_c, fast_map);
392 }
393 }
394
395 if (control_flag == 0)
396 {
397 CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
398 ? nonascii_min_char : min_char);
399 CHARSET_MAX_CHAR (charset) = max_char;
400 }
401 else if (control_flag == 4)
402 {
403 temp_charset_work->min_char = min_char;
404 temp_charset_work->max_char = max_char;
405 }
406 }
407
408
409 /* Read a hexadecimal number (preceded by "0x") from the file FP while
410 paying attention to comment character '#'. */
411
412 static unsigned
413 read_hex (FILE *fp, bool *eof, bool *overflow)
414 {
415 int c;
416 unsigned n;
417
418 while ((c = getc (fp)) != EOF)
419 {
420 if (c == '#')
421 {
422 while ((c = getc (fp)) != EOF && c != '\n');
423 }
424 else if (c == '0')
425 {
426 if ((c = getc (fp)) == EOF || c == 'x')
427 break;
428 }
429 }
430 if (c == EOF)
431 {
432 *eof = 1;
433 return 0;
434 }
435 n = 0;
436 while (c_isxdigit (c = getc (fp)))
437 {
438 if (UINT_MAX >> 4 < n)
439 *overflow = 1;
440 n = ((n << 4)
441 | (c - ('0' <= c && c <= '9' ? '0'
442 : 'A' <= c && c <= 'F' ? 'A' - 10
443 : 'a' - 10)));
444 }
445 if (c != EOF)
446 ungetc (c, fp);
447 return n;
448 }
449
450 /* Return a mapping vector for CHARSET loaded from MAPFILE.
451 Each line of MAPFILE has this form
452 0xAAAA 0xCCCC
453 where 0xAAAA is a code-point and 0xCCCC is the corresponding
454 character code, or this form
455 0xAAAA-0xBBBB 0xCCCC
456 where 0xAAAA and 0xBBBB are code-points specifying a range, and
457 0xCCCC is the first character code of the range.
458
459 The returned vector has this form:
460 [ CODE1 CHAR1 CODE2 CHAR2 .... ]
461 where CODE1 is a code-point or a cons of code-points specifying a
462 range.
463
464 Note that this function uses `openp' to open MAPFILE but ignores
465 `file-name-handler-alist' to avoid running any Lisp code. */
466
467 static void
468 load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
469 int control_flag)
470 {
471 unsigned min_code = CHARSET_MIN_CODE (charset);
472 unsigned max_code = CHARSET_MAX_CODE (charset);
473 int fd;
474 FILE *fp;
475 struct charset_map_entries *head, *entries;
476 int n_entries;
477 AUTO_STRING (map, ".map");
478 AUTO_STRING (txt, ".txt");
479 AUTO_LIST2 (suffixes, map, txt);
480 ptrdiff_t count = SPECPDL_INDEX ();
481 record_unwind_protect_nothing ();
482 specbind (Qfile_name_handler_alist, Qnil);
483 fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false);
484 fp = fd < 0 ? 0 : fdopen (fd, "r");
485 if (!fp)
486 {
487 int open_errno = errno;
488 emacs_close (fd);
489 report_file_errno ("Loading charset map", mapfile, open_errno);
490 }
491 set_unwind_protect_ptr (count, fclose_unwind, fp);
492 unbind_to (count + 1, Qnil);
493
494 /* Use record_xmalloc, as `charset_map_entries' is
495 large (larger than MAX_ALLOCA). */
496 head = record_xmalloc (sizeof *head);
497 entries = head;
498 memset (entries, 0, sizeof (struct charset_map_entries));
499
500 n_entries = 0;
501 while (1)
502 {
503 unsigned from, to, c;
504 int idx;
505 bool eof = 0, overflow = 0;
506
507 from = read_hex (fp, &eof, &overflow);
508 if (eof)
509 break;
510 if (getc (fp) == '-')
511 to = read_hex (fp, &eof, &overflow);
512 else
513 to = from;
514 if (eof)
515 break;
516 c = read_hex (fp, &eof, &overflow);
517 if (eof)
518 break;
519
520 if (overflow)
521 continue;
522 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
523 continue;
524
525 if (n_entries == 0x10000)
526 {
527 entries->next = record_xmalloc (sizeof *entries->next);
528 entries = entries->next;
529 memset (entries, 0, sizeof (struct charset_map_entries));
530 n_entries = 0;
531 }
532 idx = n_entries;
533 entries->entry[idx].from = from;
534 entries->entry[idx].to = to;
535 entries->entry[idx].c = c;
536 n_entries++;
537 }
538 fclose (fp);
539 clear_unwind_protect (count);
540
541 load_charset_map (charset, head, n_entries, control_flag);
542 unbind_to (count, Qnil);
543 }
544
545 static void
546 load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
547 {
548 unsigned min_code = CHARSET_MIN_CODE (charset);
549 unsigned max_code = CHARSET_MAX_CODE (charset);
550 struct charset_map_entries *head, *entries;
551 int n_entries;
552 int len = ASIZE (vec);
553 int i;
554 USE_SAFE_ALLOCA;
555
556 if (len % 2 == 1)
557 {
558 add_to_log ("Failure in loading charset map: %V", vec);
559 return;
560 }
561
562 /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
563 large (larger than MAX_ALLOCA). */
564 head = SAFE_ALLOCA (sizeof *head);
565 entries = head;
566 memset (entries, 0, sizeof (struct charset_map_entries));
567
568 n_entries = 0;
569 for (i = 0; i < len; i += 2)
570 {
571 Lisp_Object val, val2;
572 unsigned from, to;
573 EMACS_INT c;
574 int idx;
575
576 val = AREF (vec, i);
577 if (CONSP (val))
578 {
579 val2 = XCDR (val);
580 val = XCAR (val);
581 from = XFASTINT (val);
582 to = XFASTINT (val2);
583 }
584 else
585 from = to = XFASTINT (val);
586 val = AREF (vec, i + 1);
587 CHECK_NATNUM (val);
588 c = XFASTINT (val);
589
590 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
591 continue;
592
593 if (n_entries > 0 && (n_entries % 0x10000) == 0)
594 {
595 entries->next = SAFE_ALLOCA (sizeof *entries->next);
596 entries = entries->next;
597 memset (entries, 0, sizeof (struct charset_map_entries));
598 }
599 idx = n_entries % 0x10000;
600 entries->entry[idx].from = from;
601 entries->entry[idx].to = to;
602 entries->entry[idx].c = c;
603 n_entries++;
604 }
605
606 load_charset_map (charset, head, n_entries, control_flag);
607 SAFE_FREE ();
608 }
609
610
611 /* Load a mapping table for CHARSET. CONTROL-FLAG tells what kind of
612 map it is (see the comment of load_charset_map for the detail). */
613
614 static void
615 load_charset (struct charset *charset, int control_flag)
616 {
617 Lisp_Object map;
618
619 if (inhibit_load_charset_map
620 && temp_charset_work
621 && charset == temp_charset_work->current
622 && ((control_flag == 2) == temp_charset_work->for_encoder))
623 return;
624
625 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
626 map = CHARSET_MAP (charset);
627 else
628 {
629 if (! CHARSET_UNIFIED_P (charset))
630 emacs_abort ();
631 map = CHARSET_UNIFY_MAP (charset);
632 }
633 if (STRINGP (map))
634 load_charset_map_from_file (charset, map, control_flag);
635 else
636 load_charset_map_from_vector (charset, map, control_flag);
637 }
638
639
640 DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
641 doc: /* Return non-nil if and only if OBJECT is a charset.*/)
642 (Lisp_Object object)
643 {
644 return (CHARSETP (object) ? Qt : Qnil);
645 }
646
647
648 static void
649 map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
650 Lisp_Object function, Lisp_Object arg,
651 unsigned int from, unsigned int to)
652 {
653 int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
654 int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
655 Lisp_Object range = Fcons (Qnil, Qnil);
656 int c, stop;
657
658 c = temp_charset_work->min_char;
659 stop = (temp_charset_work->max_char < 0x20000
660 ? temp_charset_work->max_char : 0xFFFF);
661
662 while (1)
663 {
664 int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);
665
666 if (idx >= from_idx && idx <= to_idx)
667 {
668 if (NILP (XCAR (range)))
669 XSETCAR (range, make_number (c));
670 }
671 else if (! NILP (XCAR (range)))
672 {
673 XSETCDR (range, make_number (c - 1));
674 if (c_function)
675 (*c_function) (arg, range);
676 else
677 call2 (function, range, arg);
678 XSETCAR (range, Qnil);
679 }
680 if (c == stop)
681 {
682 if (c == temp_charset_work->max_char)
683 {
684 if (! NILP (XCAR (range)))
685 {
686 XSETCDR (range, make_number (c));
687 if (c_function)
688 (*c_function) (arg, range);
689 else
690 call2 (function, range, arg);
691 }
692 break;
693 }
694 c = 0x1FFFF;
695 stop = temp_charset_work->max_char;
696 }
697 c++;
698 }
699 }
700
701 void
702 map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
703 Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
704 {
705 Lisp_Object range;
706 bool partial = (from > CHARSET_MIN_CODE (charset)
707 || to < CHARSET_MAX_CODE (charset));
708
709 if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
710 {
711 int from_idx = CODE_POINT_TO_INDEX (charset, from);
712 int to_idx = CODE_POINT_TO_INDEX (charset, to);
713 int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
714 int to_c = to_idx + CHARSET_CODE_OFFSET (charset);
715
716 if (CHARSET_UNIFIED_P (charset))
717 {
718 if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
719 load_charset (charset, 2);
720 if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
721 map_char_table_for_charset (c_function, function,
722 CHARSET_DEUNIFIER (charset), arg,
723 partial ? charset : NULL, from, to);
724 else
725 map_charset_for_dump (c_function, function, arg, from, to);
726 }
727
728 range = Fcons (make_number (from_c), make_number (to_c));
729 if (NILP (function))
730 (*c_function) (arg, range);
731 else
732 call2 (function, range, arg);
733 }
734 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
735 {
736 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
737 load_charset (charset, 2);
738 if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
739 map_char_table_for_charset (c_function, function,
740 CHARSET_ENCODER (charset), arg,
741 partial ? charset : NULL, from, to);
742 else
743 map_charset_for_dump (c_function, function, arg, from, to);
744 }
745 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
746 {
747 Lisp_Object subset_info;
748 int offset;
749
750 subset_info = CHARSET_SUBSET (charset);
751 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
752 offset = XINT (AREF (subset_info, 3));
753 from -= offset;
754 if (from < XFASTINT (AREF (subset_info, 1)))
755 from = XFASTINT (AREF (subset_info, 1));
756 to -= offset;
757 if (to > XFASTINT (AREF (subset_info, 2)))
758 to = XFASTINT (AREF (subset_info, 2));
759 map_charset_chars (c_function, function, arg, charset, from, to);
760 }
761 else /* i.e. CHARSET_METHOD_SUPERSET */
762 {
763 Lisp_Object parents;
764
765 for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
766 parents = XCDR (parents))
767 {
768 int offset;
769 unsigned this_from, this_to;
770
771 charset = CHARSET_FROM_ID (XFASTINT (XCAR (XCAR (parents))));
772 offset = XINT (XCDR (XCAR (parents)));
773 this_from = from > offset ? from - offset : 0;
774 this_to = to > offset ? to - offset : 0;
775 if (this_from < CHARSET_MIN_CODE (charset))
776 this_from = CHARSET_MIN_CODE (charset);
777 if (this_to > CHARSET_MAX_CODE (charset))
778 this_to = CHARSET_MAX_CODE (charset);
779 map_charset_chars (c_function, function, arg, charset,
780 this_from, this_to);
781 }
782 }
783 }
784
785 DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
786 doc: /* Call FUNCTION for all characters in CHARSET.
787 FUNCTION is called with an argument RANGE and the optional 3rd
788 argument ARG.
789
790 RANGE is a cons (FROM . TO), where FROM and TO indicate a range of
791 characters contained in CHARSET.
792
793 The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
794 range of code points (in CHARSET) of target characters. */)
795 (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
796 {
797 struct charset *cs;
798 unsigned from, to;
799
800 CHECK_CHARSET_GET_CHARSET (charset, cs);
801 if (NILP (from_code))
802 from = CHARSET_MIN_CODE (cs);
803 else
804 {
805 from = XINT (from_code);
806 if (from < CHARSET_MIN_CODE (cs))
807 from = CHARSET_MIN_CODE (cs);
808 }
809 if (NILP (to_code))
810 to = CHARSET_MAX_CODE (cs);
811 else
812 {
813 to = XINT (to_code);
814 if (to > CHARSET_MAX_CODE (cs))
815 to = CHARSET_MAX_CODE (cs);
816 }
817 map_charset_chars (NULL, function, arg, cs, from, to);
818 return Qnil;
819 }
820
821
822 /* Define a charset according to the arguments. The Nth argument is
823 the Nth attribute of the charset (the last attribute `charset-id'
824 is not included). See the docstring of `define-charset' for the
825 detail. */
826
827 DEFUN ("define-charset-internal", Fdefine_charset_internal,
828 Sdefine_charset_internal, charset_arg_max, MANY, 0,
829 doc: /* For internal use only.
830 usage: (define-charset-internal ...) */)
831 (ptrdiff_t nargs, Lisp_Object *args)
832 {
833 /* Charset attr vector. */
834 Lisp_Object attrs;
835 Lisp_Object val;
836 EMACS_UINT hash_code;
837 struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
838 int i, j;
839 struct charset charset;
840 int id;
841 int dimension;
842 bool new_definition_p;
843 int nchars;
844
845 if (nargs != charset_arg_max)
846 return Fsignal (Qwrong_number_of_arguments,
847 Fcons (intern ("define-charset-internal"),
848 make_number (nargs)));
849
850 attrs = Fmake_vector (make_number (charset_attr_max), Qnil);
851
852 CHECK_SYMBOL (args[charset_arg_name]);
853 ASET (attrs, charset_name, args[charset_arg_name]);
854
855 val = args[charset_arg_code_space];
856 for (i = 0, dimension = 0, nchars = 1; ; i++)
857 {
858 Lisp_Object min_byte_obj, max_byte_obj;
859 int min_byte, max_byte;
860
861 min_byte_obj = Faref (val, make_number (i * 2));
862 max_byte_obj = Faref (val, make_number (i * 2 + 1));
863 CHECK_RANGED_INTEGER (min_byte_obj, 0, 255);
864 min_byte = XINT (min_byte_obj);
865 CHECK_RANGED_INTEGER (max_byte_obj, min_byte, 255);
866 max_byte = XINT (max_byte_obj);
867 charset.code_space[i * 4] = min_byte;
868 charset.code_space[i * 4 + 1] = max_byte;
869 charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
870 if (max_byte > 0)
871 dimension = i + 1;
872 if (i == 3)
873 break;
874 nchars *= charset.code_space[i * 4 + 2];
875 charset.code_space[i * 4 + 3] = nchars;
876 }
877
878 val = args[charset_arg_dimension];
879 if (NILP (val))
880 charset.dimension = dimension;
881 else
882 {
883 CHECK_RANGED_INTEGER (val, 1, 4);
884 charset.dimension = XINT (val);
885 }
886
887 charset.code_linear_p
888 = (charset.dimension == 1
889 || (charset.code_space[2] == 256
890 && (charset.dimension == 2
891 || (charset.code_space[6] == 256
892 && (charset.dimension == 3
893 || charset.code_space[10] == 256)))));
894
895 if (! charset.code_linear_p)
896 {
897 charset.code_space_mask = xzalloc (256);
898 for (i = 0; i < 4; i++)
899 for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
900 j++)
901 charset.code_space_mask[j] |= (1 << i);
902 }
903
904 charset.iso_chars_96 = charset.code_space[2] == 96;
905
906 charset.min_code = (charset.code_space[0]
907 | (charset.code_space[4] << 8)
908 | (charset.code_space[8] << 16)
909 | ((unsigned) charset.code_space[12] << 24));
910 charset.max_code = (charset.code_space[1]
911 | (charset.code_space[5] << 8)
912 | (charset.code_space[9] << 16)
913 | ((unsigned) charset.code_space[13] << 24));
914 charset.char_index_offset = 0;
915
916 val = args[charset_arg_min_code];
917 if (! NILP (val))
918 {
919 unsigned code = cons_to_unsigned (val, UINT_MAX);
920
921 if (code < charset.min_code
922 || code > charset.max_code)
923 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
924 make_fixnum_or_float (charset.max_code), val);
925 charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
926 charset.min_code = code;
927 }
928
929 val = args[charset_arg_max_code];
930 if (! NILP (val))
931 {
932 unsigned code = cons_to_unsigned (val, UINT_MAX);
933
934 if (code < charset.min_code
935 || code > charset.max_code)
936 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
937 make_fixnum_or_float (charset.max_code), val);
938 charset.max_code = code;
939 }
940
941 charset.compact_codes_p = charset.max_code < 0x10000;
942
943 val = args[charset_arg_invalid_code];
944 if (NILP (val))
945 {
946 if (charset.min_code > 0)
947 charset.invalid_code = 0;
948 else
949 {
950 if (charset.max_code < UINT_MAX)
951 charset.invalid_code = charset.max_code + 1;
952 else
953 error ("Attribute :invalid-code must be specified");
954 }
955 }
956 else
957 charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
958
959 val = args[charset_arg_iso_final];
960 if (NILP (val))
961 charset.iso_final = -1;
962 else
963 {
964 CHECK_NUMBER (val);
965 if (XINT (val) < '0' || XINT (val) > 127)
966 error ("Invalid iso-final-char: %"pI"d", XINT (val));
967 charset.iso_final = XINT (val);
968 }
969
970 val = args[charset_arg_iso_revision];
971 if (NILP (val))
972 charset.iso_revision = -1;
973 else
974 {
975 CHECK_RANGED_INTEGER (val, -1, 63);
976 charset.iso_revision = XINT (val);
977 }
978
979 val = args[charset_arg_emacs_mule_id];
980 if (NILP (val))
981 charset.emacs_mule_id = -1;
982 else
983 {
984 CHECK_NATNUM (val);
985 if ((XINT (val) > 0 && XINT (val) <= 128) || XINT (val) >= 256)
986 error ("Invalid emacs-mule-id: %"pI"d", XINT (val));
987 charset.emacs_mule_id = XINT (val);
988 }
989
990 charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
991
992 charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
993
994 charset.unified_p = 0;
995
996 memset (charset.fast_map, 0, sizeof (charset.fast_map));
997
998 if (! NILP (args[charset_arg_code_offset]))
999 {
1000 val = args[charset_arg_code_offset];
1001 CHECK_CHARACTER (val);
1002
1003 charset.method = CHARSET_METHOD_OFFSET;
1004 charset.code_offset = XINT (val);
1005
1006 i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
1007 if (MAX_CHAR - charset.code_offset < i)
1008 error ("Unsupported max char: %d", charset.max_char);
1009 charset.max_char = i + charset.code_offset;
1010 i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
1011 charset.min_char = i + charset.code_offset;
1012
1013 i = (charset.min_char >> 7) << 7;
1014 for (; i < 0x10000 && i <= charset.max_char; i += 128)
1015 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1016 i = (i >> 12) << 12;
1017 for (; i <= charset.max_char; i += 0x1000)
1018 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1019 if (charset.code_offset == 0 && charset.max_char >= 0x80)
1020 charset.ascii_compatible_p = 1;
1021 }
1022 else if (! NILP (args[charset_arg_map]))
1023 {
1024 val = args[charset_arg_map];
1025 ASET (attrs, charset_map, val);
1026 charset.method = CHARSET_METHOD_MAP;
1027 }
1028 else if (! NILP (args[charset_arg_subset]))
1029 {
1030 Lisp_Object parent;
1031 Lisp_Object parent_min_code, parent_max_code, parent_code_offset;
1032 struct charset *parent_charset;
1033
1034 val = args[charset_arg_subset];
1035 parent = Fcar (val);
1036 CHECK_CHARSET_GET_CHARSET (parent, parent_charset);
1037 parent_min_code = Fnth (make_number (1), val);
1038 CHECK_NATNUM (parent_min_code);
1039 parent_max_code = Fnth (make_number (2), val);
1040 CHECK_NATNUM (parent_max_code);
1041 parent_code_offset = Fnth (make_number (3), val);
1042 CHECK_NUMBER (parent_code_offset);
1043 val = make_uninit_vector (4);
1044 ASET (val, 0, make_number (parent_charset->id));
1045 ASET (val, 1, parent_min_code);
1046 ASET (val, 2, parent_max_code);
1047 ASET (val, 3, parent_code_offset);
1048 ASET (attrs, charset_subset, val);
1049
1050 charset.method = CHARSET_METHOD_SUBSET;
1051 /* Here, we just copy the parent's fast_map. It's not accurate,
1052 but at least it works for quickly detecting which character
1053 DOESN'T belong to this charset. */
1054 memcpy (charset.fast_map, parent_charset->fast_map,
1055 sizeof charset.fast_map);
1056
1057 /* We also copy these for parents. */
1058 charset.min_char = parent_charset->min_char;
1059 charset.max_char = parent_charset->max_char;
1060 }
1061 else if (! NILP (args[charset_arg_superset]))
1062 {
1063 val = args[charset_arg_superset];
1064 charset.method = CHARSET_METHOD_SUPERSET;
1065 val = Fcopy_sequence (val);
1066 ASET (attrs, charset_superset, val);
1067
1068 charset.min_char = MAX_CHAR;
1069 charset.max_char = 0;
1070 for (; ! NILP (val); val = Fcdr (val))
1071 {
1072 Lisp_Object elt, car_part, cdr_part;
1073 int this_id, offset;
1074 struct charset *this_charset;
1075
1076 elt = Fcar (val);
1077 if (CONSP (elt))
1078 {
1079 car_part = XCAR (elt);
1080 cdr_part = XCDR (elt);
1081 CHECK_CHARSET_GET_ID (car_part, this_id);
1082 CHECK_TYPE_RANGED_INTEGER (int, cdr_part);
1083 offset = XINT (cdr_part);
1084 }
1085 else
1086 {
1087 CHECK_CHARSET_GET_ID (elt, this_id);
1088 offset = 0;
1089 }
1090 XSETCAR (val, Fcons (make_number (this_id), make_number (offset)));
1091
1092 this_charset = CHARSET_FROM_ID (this_id);
1093 if (charset.min_char > this_charset->min_char)
1094 charset.min_char = this_charset->min_char;
1095 if (charset.max_char < this_charset->max_char)
1096 charset.max_char = this_charset->max_char;
1097 for (i = 0; i < 190; i++)
1098 charset.fast_map[i] |= this_charset->fast_map[i];
1099 }
1100 }
1101 else
1102 error ("None of :code-offset, :map, :parents are specified");
1103
1104 val = args[charset_arg_unify_map];
1105 if (! NILP (val) && !STRINGP (val))
1106 CHECK_VECTOR (val);
1107 ASET (attrs, charset_unify_map, val);
1108
1109 CHECK_LIST (args[charset_arg_plist]);
1110 ASET (attrs, charset_plist, args[charset_arg_plist]);
1111
1112 charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
1113 &hash_code);
1114 if (charset.hash_index >= 0)
1115 {
1116 new_definition_p = 0;
1117 id = XFASTINT (CHARSET_SYMBOL_ID (args[charset_arg_name]));
1118 set_hash_value_slot (hash_table, charset.hash_index, attrs);
1119 }
1120 else
1121 {
1122 charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
1123 hash_code);
1124 if (charset_table_used == charset_table_size)
1125 {
1126 /* Ensure that charset IDs fit into 'int' as well as into the
1127 restriction imposed by fixnums. Although the 'int' restriction
1128 could be removed, too much other code would need altering; for
1129 example, the IDs are stuffed into struct
1130 coding_system.charbuf[i] entries, which are 'int'. */
1131 int old_size = charset_table_size;
1132 ptrdiff_t new_size = old_size;
1133 struct charset *new_table =
1134 xpalloc (0, &new_size, 1,
1135 min (INT_MAX, MOST_POSITIVE_FIXNUM),
1136 sizeof *charset_table);
1137 memcpy (new_table, charset_table, old_size * sizeof *new_table);
1138 charset_table = new_table;
1139 charset_table_size = new_size;
1140 /* FIXME: This leaks memory, as the old charset_table becomes
1141 unreachable. If the old charset table is charset_table_init
1142 then this leak is intentional; otherwise, it's unclear.
1143 If the latter memory leak is intentional, a
1144 comment should be added to explain this. If not, the old
1145 charset_table should be freed, by passing it as the 1st argument
1146 to xpalloc and removing the memcpy. */
1147 }
1148 id = charset_table_used++;
1149 new_definition_p = 1;
1150 }
1151
1152 ASET (attrs, charset_id, make_number (id));
1153 charset.id = id;
1154 charset_table[id] = charset;
1155
1156 if (charset.method == CHARSET_METHOD_MAP)
1157 {
1158 load_charset (&charset, 0);
1159 charset_table[id] = charset;
1160 }
1161
1162 if (charset.iso_final >= 0)
1163 {
1164 ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
1165 charset.iso_final) = id;
1166 if (new_definition_p)
1167 Viso_2022_charset_list = nconc2 (Viso_2022_charset_list,
1168 list1 (make_number (id)));
1169 if (ISO_CHARSET_TABLE (1, 0, 'J') == id)
1170 charset_jisx0201_roman = id;
1171 else if (ISO_CHARSET_TABLE (2, 0, '@') == id)
1172 charset_jisx0208_1978 = id;
1173 else if (ISO_CHARSET_TABLE (2, 0, 'B') == id)
1174 charset_jisx0208 = id;
1175 else if (ISO_CHARSET_TABLE (2, 0, 'C') == id)
1176 charset_ksc5601 = id;
1177 }
1178
1179 if (charset.emacs_mule_id >= 0)
1180 {
1181 emacs_mule_charset[charset.emacs_mule_id] = id;
1182 if (charset.emacs_mule_id < 0xA0)
1183 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 1;
1184 else
1185 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 2;
1186 if (new_definition_p)
1187 Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
1188 list1 (make_number (id)));
1189 }
1190
1191 if (new_definition_p)
1192 {
1193 Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
1194 if (charset.supplementary_p)
1195 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1196 list1 (make_number (id)));
1197 else
1198 {
1199 Lisp_Object tail;
1200
1201 for (tail = Vcharset_ordered_list; CONSP (tail); tail = XCDR (tail))
1202 {
1203 struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (tail)));
1204
1205 if (cs->supplementary_p)
1206 break;
1207 }
1208 if (EQ (tail, Vcharset_ordered_list))
1209 Vcharset_ordered_list = Fcons (make_number (id),
1210 Vcharset_ordered_list);
1211 else if (NILP (tail))
1212 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1213 list1 (make_number (id)));
1214 else
1215 {
1216 val = Fcons (XCAR (tail), XCDR (tail));
1217 XSETCDR (tail, val);
1218 XSETCAR (tail, make_number (id));
1219 }
1220 }
1221 charset_ordered_list_tick++;
1222 }
1223
1224 return Qnil;
1225 }
1226
1227
1228 /* Same as Fdefine_charset_internal but arguments are more convenient
1229 to call from C (typically in syms_of_charset). This can define a
1230 charset of `offset' method only. Return the ID of the new
1231 charset. */
1232
1233 static int
1234 define_charset_internal (Lisp_Object name,
1235 int dimension,
1236 const char *code_space_chars,
1237 unsigned min_code, unsigned max_code,
1238 int iso_final, int iso_revision, int emacs_mule_id,
1239 bool ascii_compatible, bool supplementary,
1240 int code_offset)
1241 {
1242 const unsigned char *code_space = (const unsigned char *) code_space_chars;
1243 Lisp_Object args[charset_arg_max];
1244 Lisp_Object val;
1245 int i;
1246
1247 args[charset_arg_name] = name;
1248 args[charset_arg_dimension] = make_number (dimension);
1249 val = make_uninit_vector (8);
1250 for (i = 0; i < 8; i++)
1251 ASET (val, i, make_number (code_space[i]));
1252 args[charset_arg_code_space] = val;
1253 args[charset_arg_min_code] = make_number (min_code);
1254 args[charset_arg_max_code] = make_number (max_code);
1255 args[charset_arg_iso_final]
1256 = (iso_final < 0 ? Qnil : make_number (iso_final));
1257 args[charset_arg_iso_revision] = make_number (iso_revision);
1258 args[charset_arg_emacs_mule_id]
1259 = (emacs_mule_id < 0 ? Qnil : make_number (emacs_mule_id));
1260 args[charset_arg_ascii_compatible_p] = ascii_compatible ? Qt : Qnil;
1261 args[charset_arg_supplementary_p] = supplementary ? Qt : Qnil;
1262 args[charset_arg_invalid_code] = Qnil;
1263 args[charset_arg_code_offset] = make_number (code_offset);
1264 args[charset_arg_map] = Qnil;
1265 args[charset_arg_subset] = Qnil;
1266 args[charset_arg_superset] = Qnil;
1267 args[charset_arg_unify_map] = Qnil;
1268
1269 args[charset_arg_plist] =
1270 listn (CONSTYPE_HEAP, 14,
1271 QCname,
1272 args[charset_arg_name],
1273 intern_c_string (":dimension"),
1274 args[charset_arg_dimension],
1275 intern_c_string (":code-space"),
1276 args[charset_arg_code_space],
1277 intern_c_string (":iso-final-char"),
1278 args[charset_arg_iso_final],
1279 intern_c_string (":emacs-mule-id"),
1280 args[charset_arg_emacs_mule_id],
1281 QCascii_compatible_p,
1282 args[charset_arg_ascii_compatible_p],
1283 intern_c_string (":code-offset"),
1284 args[charset_arg_code_offset]);
1285 Fdefine_charset_internal (charset_arg_max, args);
1286
1287 return XINT (CHARSET_SYMBOL_ID (name));
1288 }
1289
1290
1291 DEFUN ("define-charset-alias", Fdefine_charset_alias,
1292 Sdefine_charset_alias, 2, 2, 0,
1293 doc: /* Define ALIAS as an alias for charset CHARSET. */)
1294 (Lisp_Object alias, Lisp_Object charset)
1295 {
1296 Lisp_Object attr;
1297
1298 CHECK_CHARSET_GET_ATTR (charset, attr);
1299 Fputhash (alias, attr, Vcharset_hash_table);
1300 Vcharset_list = Fcons (alias, Vcharset_list);
1301 return Qnil;
1302 }
1303
1304
1305 DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
1306 doc: /* Return the property list of CHARSET. */)
1307 (Lisp_Object charset)
1308 {
1309 Lisp_Object attrs;
1310
1311 CHECK_CHARSET_GET_ATTR (charset, attrs);
1312 return CHARSET_ATTR_PLIST (attrs);
1313 }
1314
1315
1316 DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
1317 doc: /* Set CHARSET's property list to PLIST. */)
1318 (Lisp_Object charset, Lisp_Object plist)
1319 {
1320 Lisp_Object attrs;
1321
1322 CHECK_CHARSET_GET_ATTR (charset, attrs);
1323 ASET (attrs, charset_plist, plist);
1324 return plist;
1325 }
1326
1327
1328 DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 3, 0,
1329 doc: /* Unify characters of CHARSET with Unicode.
1330 This means reading the relevant file and installing the table defined
1331 by CHARSET's `:unify-map' property.
1332
1333 Optional second arg UNIFY-MAP is a file name string or a vector. It has
1334 the same meaning as the `:unify-map' attribute in the function
1335 `define-charset' (which see).
1336
1337 Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET. */)
1338 (Lisp_Object charset, Lisp_Object unify_map, Lisp_Object deunify)
1339 {
1340 int id;
1341 struct charset *cs;
1342
1343 CHECK_CHARSET_GET_ID (charset, id);
1344 cs = CHARSET_FROM_ID (id);
1345 if (NILP (deunify)
1346 ? CHARSET_UNIFIED_P (cs) && ! NILP (CHARSET_DEUNIFIER (cs))
1347 : ! CHARSET_UNIFIED_P (cs))
1348 return Qnil;
1349
1350 CHARSET_UNIFIED_P (cs) = 0;
1351 if (NILP (deunify))
1352 {
1353 if (CHARSET_METHOD (cs) != CHARSET_METHOD_OFFSET
1354 || CHARSET_CODE_OFFSET (cs) < 0x110000)
1355 error ("Can't unify charset: %s", SDATA (SYMBOL_NAME (charset)));
1356 if (NILP (unify_map))
1357 unify_map = CHARSET_UNIFY_MAP (cs);
1358 else
1359 {
1360 if (! STRINGP (unify_map) && ! VECTORP (unify_map))
1361 signal_error ("Bad unify-map", unify_map);
1362 set_charset_attr (cs, charset_unify_map, unify_map);
1363 }
1364 if (NILP (Vchar_unify_table))
1365 Vchar_unify_table = Fmake_char_table (Qnil, Qnil);
1366 char_table_set_range (Vchar_unify_table,
1367 cs->min_char, cs->max_char, charset);
1368 CHARSET_UNIFIED_P (cs) = 1;
1369 }
1370 else if (CHAR_TABLE_P (Vchar_unify_table))
1371 {
1372 unsigned min_code = CHARSET_MIN_CODE (cs);
1373 unsigned max_code = CHARSET_MAX_CODE (cs);
1374 int min_char = DECODE_CHAR (cs, min_code);
1375 int max_char = DECODE_CHAR (cs, max_code);
1376
1377 char_table_set_range (Vchar_unify_table, min_char, max_char, Qnil);
1378 }
1379
1380 return Qnil;
1381 }
1382
1383 /* Check that DIMENSION, CHARS, and FINAL_CHAR specify a valid ISO charset.
1384 Return true if it's a 96-character set, false if 94. */
1385
1386 static bool
1387 check_iso_charset_parameter (Lisp_Object dimension, Lisp_Object chars,
1388 Lisp_Object final_char)
1389 {
1390 CHECK_NUMBER (dimension);
1391 CHECK_NUMBER (chars);
1392 CHECK_CHARACTER (final_char);
1393
1394 if (! (1 <= XINT (dimension) && XINT (dimension) <= 3))
1395 error ("Invalid DIMENSION %"pI"d, it should be 1, 2, or 3",
1396 XINT (dimension));
1397
1398 bool chars_flag = XINT (chars) == 96;
1399 if (! (chars_flag || XINT (chars) == 94))
1400 error ("Invalid CHARS %"pI"d, it should be 94 or 96", XINT (chars));
1401
1402 int final_ch = XFASTINT (final_char);
1403 if (! ('0' <= final_ch && final_ch <= '~'))
1404 error ("Invalid FINAL-CHAR '%c', it should be '0'..'~'", final_ch);
1405
1406 return chars_flag;
1407 }
1408
1409 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
1410 Sget_unused_iso_final_char, 2, 2, 0,
1411 doc: /*
1412 Return an unused ISO final char for a charset of DIMENSION and CHARS.
1413 DIMENSION is the number of bytes to represent a character: 1 or 2.
1414 CHARS is the number of characters in a dimension: 94 or 96.
1415
1416 This final char is for private use, thus the range is `0' (48) .. `?' (63).
1417 If there's no unused final char for the specified kind of charset,
1418 return nil. */)
1419 (Lisp_Object dimension, Lisp_Object chars)
1420 {
1421 bool chars_flag = check_iso_charset_parameter (dimension, chars,
1422 make_number ('0'));
1423 for (int final_char = '0'; final_char <= '?'; final_char++)
1424 if (ISO_CHARSET_TABLE (XINT (dimension), chars_flag, final_char) < 0)
1425 return make_number (final_char);
1426 return Qnil;
1427 }
1428
1429
1430 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
1431 4, 4, 0,
1432 doc: /* Declare an equivalent charset for ISO-2022 decoding.
1433
1434 On decoding by an ISO-2022 base coding system, when a charset
1435 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
1436 if CHARSET is designated instead. */)
1437 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char, Lisp_Object charset)
1438 {
1439 int id;
1440
1441 CHECK_CHARSET_GET_ID (charset, id);
1442 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
1443 ISO_CHARSET_TABLE (XINT (dimension), chars_flag, XFASTINT (final_char)) = id;
1444 return Qnil;
1445 }
1446
1447
1448 /* Return information about charsets in the text at PTR of NBYTES
1449 bytes, which are NCHARS characters. The value is:
1450
1451 0: Each character is represented by one byte. This is always
1452 true for a unibyte string. For a multibyte string, true if
1453 it contains only ASCII characters.
1454
1455 1: No charsets other than ascii, control-1, and latin-1 are
1456 found.
1457
1458 2: Otherwise.
1459 */
1460
1461 int
1462 string_xstring_p (Lisp_Object string)
1463 {
1464 const unsigned char *p = SDATA (string);
1465 const unsigned char *endp = p + SBYTES (string);
1466
1467 if (SCHARS (string) == SBYTES (string))
1468 return 0;
1469
1470 while (p < endp)
1471 {
1472 int c = STRING_CHAR_ADVANCE (p);
1473
1474 if (c >= 0x100)
1475 return 2;
1476 }
1477 return 1;
1478 }
1479
1480
1481 /* Find charsets in the string at PTR of NCHARS and NBYTES.
1482
1483 CHARSETS is a vector. If Nth element is non-nil, it means the
1484 charset whose id is N is already found.
1485
1486 It may lookup a translation table TABLE if supplied. */
1487
1488 static void
1489 find_charsets_in_text (const unsigned char *ptr, ptrdiff_t nchars,
1490 ptrdiff_t nbytes, Lisp_Object charsets,
1491 Lisp_Object table, bool multibyte)
1492 {
1493 const unsigned char *pend = ptr + nbytes;
1494
1495 if (nchars == nbytes)
1496 {
1497 if (multibyte)
1498 ASET (charsets, charset_ascii, Qt);
1499 else
1500 while (ptr < pend)
1501 {
1502 int c = *ptr++;
1503
1504 if (!NILP (table))
1505 c = translate_char (table, c);
1506 if (ASCII_CHAR_P (c))
1507 ASET (charsets, charset_ascii, Qt);
1508 else
1509 ASET (charsets, charset_eight_bit, Qt);
1510 }
1511 }
1512 else
1513 {
1514 while (ptr < pend)
1515 {
1516 int c = STRING_CHAR_ADVANCE (ptr);
1517 struct charset *charset;
1518
1519 if (!NILP (table))
1520 c = translate_char (table, c);
1521 charset = CHAR_CHARSET (c);
1522 ASET (charsets, CHARSET_ID (charset), Qt);
1523 }
1524 }
1525 }
1526
1527 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
1528 2, 3, 0,
1529 doc: /* Return a list of charsets in the region between BEG and END.
1530 BEG and END are buffer positions.
1531 Optional arg TABLE if non-nil is a translation table to look up.
1532
1533 If the current buffer is unibyte, the returned list may contain
1534 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1535 (Lisp_Object beg, Lisp_Object end, Lisp_Object table)
1536 {
1537 Lisp_Object charsets;
1538 ptrdiff_t from, from_byte, to, stop, stop_byte;
1539 int i;
1540 Lisp_Object val;
1541 bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1542
1543 validate_region (&beg, &end);
1544 from = XFASTINT (beg);
1545 stop = to = XFASTINT (end);
1546
1547 if (from < GPT && GPT < to)
1548 {
1549 stop = GPT;
1550 stop_byte = GPT_BYTE;
1551 }
1552 else
1553 stop_byte = CHAR_TO_BYTE (stop);
1554
1555 from_byte = CHAR_TO_BYTE (from);
1556
1557 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1558 while (1)
1559 {
1560 find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
1561 stop_byte - from_byte, charsets, table,
1562 multibyte);
1563 if (stop < to)
1564 {
1565 from = stop, from_byte = stop_byte;
1566 stop = to, stop_byte = CHAR_TO_BYTE (stop);
1567 }
1568 else
1569 break;
1570 }
1571
1572 val = Qnil;
1573 for (i = charset_table_used - 1; i >= 0; i--)
1574 if (!NILP (AREF (charsets, i)))
1575 val = Fcons (CHARSET_NAME (charset_table + i), val);
1576 return val;
1577 }
1578
1579 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1580 1, 2, 0,
1581 doc: /* Return a list of charsets in STR.
1582 Optional arg TABLE if non-nil is a translation table to look up.
1583
1584 If STR is unibyte, the returned list may contain
1585 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1586 (Lisp_Object str, Lisp_Object table)
1587 {
1588 Lisp_Object charsets;
1589 int i;
1590 Lisp_Object val;
1591
1592 CHECK_STRING (str);
1593
1594 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1595 find_charsets_in_text (SDATA (str), SCHARS (str), SBYTES (str),
1596 charsets, table,
1597 STRING_MULTIBYTE (str));
1598 val = Qnil;
1599 for (i = charset_table_used - 1; i >= 0; i--)
1600 if (!NILP (AREF (charsets, i)))
1601 val = Fcons (CHARSET_NAME (charset_table + i), val);
1602 return val;
1603 }
1604
1605 \f
1606
1607 /* Return a unified character code for C (>= 0x110000). VAL is a
1608 value of Vchar_unify_table for C; i.e. it is nil, an integer, or a
1609 charset symbol. */
1610 static int
1611 maybe_unify_char (int c, Lisp_Object val)
1612 {
1613 struct charset *charset;
1614
1615 if (INTEGERP (val))
1616 return XFASTINT (val);
1617 if (NILP (val))
1618 return c;
1619
1620 CHECK_CHARSET_GET_CHARSET (val, charset);
1621 #ifdef REL_ALLOC
1622 /* The call to load_charset below can allocate memory, which screws
1623 callers of this function through STRING_CHAR_* macros that hold C
1624 pointers to buffer text, if REL_ALLOC is used. */
1625 r_alloc_inhibit_buffer_relocation (1);
1626 #endif
1627 load_charset (charset, 1);
1628 if (! inhibit_load_charset_map)
1629 {
1630 val = CHAR_TABLE_REF (Vchar_unify_table, c);
1631 if (! NILP (val))
1632 c = XFASTINT (val);
1633 }
1634 else
1635 {
1636 int code_index = c - CHARSET_CODE_OFFSET (charset);
1637 int unified = GET_TEMP_CHARSET_WORK_DECODER (code_index);
1638
1639 if (unified > 0)
1640 c = unified;
1641 }
1642 #ifdef REL_ALLOC
1643 r_alloc_inhibit_buffer_relocation (0);
1644 #endif
1645 return c;
1646 }
1647
1648
1649 /* Return a character corresponding to the code-point CODE of
1650 CHARSET. */
1651
1652 int
1653 decode_char (struct charset *charset, unsigned int code)
1654 {
1655 int c, char_index;
1656 enum charset_method method = CHARSET_METHOD (charset);
1657
1658 if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
1659 return -1;
1660
1661 if (method == CHARSET_METHOD_SUBSET)
1662 {
1663 Lisp_Object subset_info;
1664
1665 subset_info = CHARSET_SUBSET (charset);
1666 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1667 code -= XINT (AREF (subset_info, 3));
1668 if (code < XFASTINT (AREF (subset_info, 1))
1669 || code > XFASTINT (AREF (subset_info, 2)))
1670 c = -1;
1671 else
1672 c = DECODE_CHAR (charset, code);
1673 }
1674 else if (method == CHARSET_METHOD_SUPERSET)
1675 {
1676 Lisp_Object parents;
1677
1678 parents = CHARSET_SUPERSET (charset);
1679 c = -1;
1680 for (; CONSP (parents); parents = XCDR (parents))
1681 {
1682 int id = XINT (XCAR (XCAR (parents)));
1683 int code_offset = XINT (XCDR (XCAR (parents)));
1684 unsigned this_code = code - code_offset;
1685
1686 charset = CHARSET_FROM_ID (id);
1687 if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1688 break;
1689 }
1690 }
1691 else
1692 {
1693 char_index = CODE_POINT_TO_INDEX (charset, code);
1694 if (char_index < 0)
1695 return -1;
1696
1697 if (method == CHARSET_METHOD_MAP)
1698 {
1699 Lisp_Object decoder;
1700
1701 decoder = CHARSET_DECODER (charset);
1702 if (! VECTORP (decoder))
1703 {
1704 load_charset (charset, 1);
1705 decoder = CHARSET_DECODER (charset);
1706 }
1707 if (VECTORP (decoder))
1708 c = XINT (AREF (decoder, char_index));
1709 else
1710 c = GET_TEMP_CHARSET_WORK_DECODER (char_index);
1711 }
1712 else /* method == CHARSET_METHOD_OFFSET */
1713 {
1714 c = char_index + CHARSET_CODE_OFFSET (charset);
1715 if (CHARSET_UNIFIED_P (charset)
1716 && MAX_UNICODE_CHAR < c && c <= MAX_5_BYTE_CHAR)
1717 {
1718 /* Unify C with a Unicode character if possible. */
1719 Lisp_Object val = CHAR_TABLE_REF (Vchar_unify_table, c);
1720 c = maybe_unify_char (c, val);
1721 }
1722 }
1723 }
1724
1725 return c;
1726 }
1727
1728 /* Variable used temporarily by the macro ENCODE_CHAR. */
1729 Lisp_Object charset_work;
1730
1731 /* Return a code-point of C in CHARSET. If C doesn't belong to
1732 CHARSET, return CHARSET_INVALID_CODE (CHARSET). If STRICT is true,
1733 use CHARSET's strict_max_char instead of max_char. */
1734
1735 unsigned
1736 encode_char (struct charset *charset, int c)
1737 {
1738 unsigned code;
1739 enum charset_method method = CHARSET_METHOD (charset);
1740
1741 if (CHARSET_UNIFIED_P (charset))
1742 {
1743 Lisp_Object deunifier;
1744 int code_index = -1;
1745
1746 deunifier = CHARSET_DEUNIFIER (charset);
1747 if (! CHAR_TABLE_P (deunifier))
1748 {
1749 load_charset (charset, 2);
1750 deunifier = CHARSET_DEUNIFIER (charset);
1751 }
1752 if (CHAR_TABLE_P (deunifier))
1753 {
1754 Lisp_Object deunified = CHAR_TABLE_REF (deunifier, c);
1755
1756 if (INTEGERP (deunified))
1757 code_index = XINT (deunified);
1758 }
1759 else
1760 {
1761 code_index = GET_TEMP_CHARSET_WORK_ENCODER (c);
1762 }
1763 if (code_index >= 0)
1764 c = CHARSET_CODE_OFFSET (charset) + code_index;
1765 }
1766
1767 if (method == CHARSET_METHOD_SUBSET)
1768 {
1769 Lisp_Object subset_info;
1770 struct charset *this_charset;
1771
1772 subset_info = CHARSET_SUBSET (charset);
1773 this_charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1774 code = ENCODE_CHAR (this_charset, c);
1775 if (code == CHARSET_INVALID_CODE (this_charset)
1776 || code < XFASTINT (AREF (subset_info, 1))
1777 || code > XFASTINT (AREF (subset_info, 2)))
1778 return CHARSET_INVALID_CODE (charset);
1779 code += XINT (AREF (subset_info, 3));
1780 return code;
1781 }
1782
1783 if (method == CHARSET_METHOD_SUPERSET)
1784 {
1785 Lisp_Object parents;
1786
1787 parents = CHARSET_SUPERSET (charset);
1788 for (; CONSP (parents); parents = XCDR (parents))
1789 {
1790 int id = XINT (XCAR (XCAR (parents)));
1791 int code_offset = XINT (XCDR (XCAR (parents)));
1792 struct charset *this_charset = CHARSET_FROM_ID (id);
1793
1794 code = ENCODE_CHAR (this_charset, c);
1795 if (code != CHARSET_INVALID_CODE (this_charset))
1796 return code + code_offset;
1797 }
1798 return CHARSET_INVALID_CODE (charset);
1799 }
1800
1801 if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1802 || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1803 return CHARSET_INVALID_CODE (charset);
1804
1805 if (method == CHARSET_METHOD_MAP)
1806 {
1807 Lisp_Object encoder;
1808 Lisp_Object val;
1809
1810 encoder = CHARSET_ENCODER (charset);
1811 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1812 {
1813 load_charset (charset, 2);
1814 encoder = CHARSET_ENCODER (charset);
1815 }
1816 if (CHAR_TABLE_P (encoder))
1817 {
1818 val = CHAR_TABLE_REF (encoder, c);
1819 if (NILP (val))
1820 return CHARSET_INVALID_CODE (charset);
1821 code = XINT (val);
1822 if (! CHARSET_COMPACT_CODES_P (charset))
1823 code = INDEX_TO_CODE_POINT (charset, code);
1824 }
1825 else
1826 {
1827 code = GET_TEMP_CHARSET_WORK_ENCODER (c);
1828 code = INDEX_TO_CODE_POINT (charset, code);
1829 }
1830 }
1831 else /* method == CHARSET_METHOD_OFFSET */
1832 {
1833 unsigned code_index = c - CHARSET_CODE_OFFSET (charset);
1834
1835 code = INDEX_TO_CODE_POINT (charset, code_index);
1836 }
1837
1838 return code;
1839 }
1840
1841
1842 DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 2, 0,
1843 doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1844 Return nil if CODE-POINT is not valid in CHARSET.
1845
1846 CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE). */)
1847 (Lisp_Object charset, Lisp_Object code_point)
1848 {
1849 int c, id;
1850 unsigned code;
1851 struct charset *charsetp;
1852
1853 CHECK_CHARSET_GET_ID (charset, id);
1854 code = cons_to_unsigned (code_point, UINT_MAX);
1855 charsetp = CHARSET_FROM_ID (id);
1856 c = DECODE_CHAR (charsetp, code);
1857 return (c >= 0 ? make_number (c) : Qnil);
1858 }
1859
1860
1861 DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 2, 0,
1862 doc: /* Encode the character CH into a code-point of CHARSET.
1863 Return nil if CHARSET doesn't include CH. */)
1864 (Lisp_Object ch, Lisp_Object charset)
1865 {
1866 int c, id;
1867 unsigned code;
1868 struct charset *charsetp;
1869
1870 CHECK_CHARSET_GET_ID (charset, id);
1871 CHECK_CHARACTER (ch);
1872 c = XFASTINT (ch);
1873 charsetp = CHARSET_FROM_ID (id);
1874 code = ENCODE_CHAR (charsetp, c);
1875 if (code == CHARSET_INVALID_CODE (charsetp))
1876 return Qnil;
1877 return INTEGER_TO_CONS (code);
1878 }
1879
1880
1881 DEFUN ("make-char", Fmake_char, Smake_char, 1, 5, 0,
1882 doc:
1883 /* Return a character of CHARSET whose position codes are CODEn.
1884
1885 CODE1 through CODE4 are optional, but if you don't supply sufficient
1886 position codes, it is assumed that the minimum code in each dimension
1887 is specified. */)
1888 (Lisp_Object charset, Lisp_Object code1, Lisp_Object code2, Lisp_Object code3, Lisp_Object code4)
1889 {
1890 int id, dimension;
1891 struct charset *charsetp;
1892 unsigned code;
1893 int c;
1894
1895 CHECK_CHARSET_GET_ID (charset, id);
1896 charsetp = CHARSET_FROM_ID (id);
1897
1898 dimension = CHARSET_DIMENSION (charsetp);
1899 if (NILP (code1))
1900 code = (CHARSET_ASCII_COMPATIBLE_P (charsetp)
1901 ? 0 : CHARSET_MIN_CODE (charsetp));
1902 else
1903 {
1904 CHECK_NATNUM (code1);
1905 if (XFASTINT (code1) >= 0x100)
1906 args_out_of_range (make_number (0xFF), code1);
1907 code = XFASTINT (code1);
1908
1909 if (dimension > 1)
1910 {
1911 code <<= 8;
1912 if (NILP (code2))
1913 code |= charsetp->code_space[(dimension - 2) * 4];
1914 else
1915 {
1916 CHECK_NATNUM (code2);
1917 if (XFASTINT (code2) >= 0x100)
1918 args_out_of_range (make_number (0xFF), code2);
1919 code |= XFASTINT (code2);
1920 }
1921
1922 if (dimension > 2)
1923 {
1924 code <<= 8;
1925 if (NILP (code3))
1926 code |= charsetp->code_space[(dimension - 3) * 4];
1927 else
1928 {
1929 CHECK_NATNUM (code3);
1930 if (XFASTINT (code3) >= 0x100)
1931 args_out_of_range (make_number (0xFF), code3);
1932 code |= XFASTINT (code3);
1933 }
1934
1935 if (dimension > 3)
1936 {
1937 code <<= 8;
1938 if (NILP (code4))
1939 code |= charsetp->code_space[0];
1940 else
1941 {
1942 CHECK_NATNUM (code4);
1943 if (XFASTINT (code4) >= 0x100)
1944 args_out_of_range (make_number (0xFF), code4);
1945 code |= XFASTINT (code4);
1946 }
1947 }
1948 }
1949 }
1950 }
1951
1952 if (CHARSET_ISO_FINAL (charsetp) >= 0)
1953 code &= 0x7F7F7F7F;
1954 c = DECODE_CHAR (charsetp, code);
1955 if (c < 0)
1956 error ("Invalid code(s)");
1957 return make_number (c);
1958 }
1959
1960
1961 /* Return the first charset in CHARSET_LIST that contains C.
1962 CHARSET_LIST is a list of charset IDs. If it is nil, use
1963 Vcharset_ordered_list. */
1964
1965 struct charset *
1966 char_charset (int c, Lisp_Object charset_list, unsigned int *code_return)
1967 {
1968 bool maybe_null = 0;
1969
1970 if (NILP (charset_list))
1971 charset_list = Vcharset_ordered_list;
1972 else
1973 maybe_null = 1;
1974
1975 while (CONSP (charset_list))
1976 {
1977 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
1978 unsigned code = ENCODE_CHAR (charset, c);
1979
1980 if (code != CHARSET_INVALID_CODE (charset))
1981 {
1982 if (code_return)
1983 *code_return = code;
1984 return charset;
1985 }
1986 charset_list = XCDR (charset_list);
1987 if (! maybe_null
1988 && c <= MAX_UNICODE_CHAR
1989 && EQ (charset_list, Vcharset_non_preferred_head))
1990 return CHARSET_FROM_ID (charset_unicode);
1991 }
1992 return (maybe_null ? NULL
1993 : c <= MAX_5_BYTE_CHAR ? CHARSET_FROM_ID (charset_emacs)
1994 : CHARSET_FROM_ID (charset_eight_bit));
1995 }
1996
1997
1998 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1999 doc:
2000 /*Return list of charset and one to four position-codes of CH.
2001 The charset is decided by the current priority order of charsets.
2002 A position-code is a byte value of each dimension of the code-point of
2003 CH in the charset. */)
2004 (Lisp_Object ch)
2005 {
2006 struct charset *charset;
2007 int c, dimension;
2008 unsigned code;
2009 Lisp_Object val;
2010
2011 CHECK_CHARACTER (ch);
2012 c = XFASTINT (ch);
2013 charset = CHAR_CHARSET (c);
2014 if (! charset)
2015 emacs_abort ();
2016 code = ENCODE_CHAR (charset, c);
2017 if (code == CHARSET_INVALID_CODE (charset))
2018 emacs_abort ();
2019 dimension = CHARSET_DIMENSION (charset);
2020 for (val = Qnil; dimension > 0; dimension--)
2021 {
2022 val = Fcons (make_number (code & 0xFF), val);
2023 code >>= 8;
2024 }
2025 return Fcons (CHARSET_NAME (charset), val);
2026 }
2027
2028
2029 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 2, 0,
2030 doc: /* Return the charset of highest priority that contains CH.
2031 ASCII characters are an exception: for them, this function always
2032 returns `ascii'.
2033 If optional 2nd arg RESTRICTION is non-nil, it is a list of charsets
2034 from which to find the charset. It may also be a coding system. In
2035 that case, find the charset from what supported by that coding system. */)
2036 (Lisp_Object ch, Lisp_Object restriction)
2037 {
2038 struct charset *charset;
2039
2040 CHECK_CHARACTER (ch);
2041 if (NILP (restriction))
2042 charset = CHAR_CHARSET (XINT (ch));
2043 else
2044 {
2045 if (CONSP (restriction))
2046 {
2047 int c = XFASTINT (ch);
2048
2049 for (; CONSP (restriction); restriction = XCDR (restriction))
2050 {
2051 struct charset *rcharset;
2052
2053 CHECK_CHARSET_GET_CHARSET (XCAR (restriction), rcharset);
2054 if (ENCODE_CHAR (rcharset, c) != CHARSET_INVALID_CODE (rcharset))
2055 return XCAR (restriction);
2056 }
2057 return Qnil;
2058 }
2059 restriction = coding_system_charset_list (restriction);
2060 charset = char_charset (XINT (ch), restriction, NULL);
2061 if (! charset)
2062 return Qnil;
2063 }
2064 return (CHARSET_NAME (charset));
2065 }
2066
2067
2068 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
2069 doc: /*
2070 Return charset of a character in the current buffer at position POS.
2071 If POS is nil, it defaults to the current point.
2072 If POS is out of range, the value is nil. */)
2073 (Lisp_Object pos)
2074 {
2075 Lisp_Object ch;
2076 struct charset *charset;
2077
2078 ch = Fchar_after (pos);
2079 if (! INTEGERP (ch))
2080 return ch;
2081 charset = CHAR_CHARSET (XINT (ch));
2082 return (CHARSET_NAME (charset));
2083 }
2084
2085
2086 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2087 doc: /*
2088 Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
2089
2090 ISO 2022's designation sequence (escape sequence) distinguishes charsets
2091 by their DIMENSION, CHARS, and FINAL-CHAR,
2092 whereas Emacs distinguishes them by charset symbol.
2093 See the documentation of the function `charset-info' for the meanings of
2094 DIMENSION, CHARS, and FINAL-CHAR. */)
2095 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
2096 {
2097 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
2098 int id = ISO_CHARSET_TABLE (XINT (dimension), chars_flag,
2099 XFASTINT (final_char));
2100 return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
2101 }
2102
2103
2104 DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
2105 0, 0, 0,
2106 doc: /*
2107 Internal use only.
2108 Clear temporary charset mapping tables.
2109 It should be called only from temacs invoked for dumping. */)
2110 (void)
2111 {
2112 if (temp_charset_work)
2113 {
2114 xfree (temp_charset_work);
2115 temp_charset_work = NULL;
2116 }
2117
2118 if (CHAR_TABLE_P (Vchar_unify_table))
2119 Foptimize_char_table (Vchar_unify_table, Qnil);
2120
2121 return Qnil;
2122 }
2123
2124 DEFUN ("charset-priority-list", Fcharset_priority_list,
2125 Scharset_priority_list, 0, 1, 0,
2126 doc: /* Return the list of charsets ordered by priority.
2127 HIGHESTP non-nil means just return the highest priority one. */)
2128 (Lisp_Object highestp)
2129 {
2130 Lisp_Object val = Qnil, list = Vcharset_ordered_list;
2131
2132 if (!NILP (highestp))
2133 return CHARSET_NAME (CHARSET_FROM_ID (XINT (Fcar (list))));
2134
2135 while (!NILP (list))
2136 {
2137 val = Fcons (CHARSET_NAME (CHARSET_FROM_ID (XINT (XCAR (list)))), val);
2138 list = XCDR (list);
2139 }
2140 return Fnreverse (val);
2141 }
2142
2143 DEFUN ("set-charset-priority", Fset_charset_priority, Sset_charset_priority,
2144 1, MANY, 0,
2145 doc: /* Assign higher priority to the charsets given as arguments.
2146 usage: (set-charset-priority &rest charsets) */)
2147 (ptrdiff_t nargs, Lisp_Object *args)
2148 {
2149 Lisp_Object new_head, old_list;
2150 Lisp_Object list_2022, list_emacs_mule;
2151 ptrdiff_t i;
2152 int id;
2153
2154 old_list = Fcopy_sequence (Vcharset_ordered_list);
2155 new_head = Qnil;
2156 for (i = 0; i < nargs; i++)
2157 {
2158 CHECK_CHARSET_GET_ID (args[i], id);
2159 if (! NILP (Fmemq (make_number (id), old_list)))
2160 {
2161 old_list = Fdelq (make_number (id), old_list);
2162 new_head = Fcons (make_number (id), new_head);
2163 }
2164 }
2165 Vcharset_non_preferred_head = old_list;
2166 Vcharset_ordered_list = CALLN (Fnconc, Fnreverse (new_head), old_list);
2167
2168 charset_ordered_list_tick++;
2169
2170 charset_unibyte = -1;
2171 for (old_list = Vcharset_ordered_list, list_2022 = list_emacs_mule = Qnil;
2172 CONSP (old_list); old_list = XCDR (old_list))
2173 {
2174 if (! NILP (Fmemq (XCAR (old_list), Viso_2022_charset_list)))
2175 list_2022 = Fcons (XCAR (old_list), list_2022);
2176 if (! NILP (Fmemq (XCAR (old_list), Vemacs_mule_charset_list)))
2177 list_emacs_mule = Fcons (XCAR (old_list), list_emacs_mule);
2178 if (charset_unibyte < 0)
2179 {
2180 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (old_list)));
2181
2182 if (CHARSET_DIMENSION (charset) == 1
2183 && CHARSET_ASCII_COMPATIBLE_P (charset)
2184 && CHARSET_MAX_CHAR (charset) >= 0x80)
2185 charset_unibyte = CHARSET_ID (charset);
2186 }
2187 }
2188 Viso_2022_charset_list = Fnreverse (list_2022);
2189 Vemacs_mule_charset_list = Fnreverse (list_emacs_mule);
2190 if (charset_unibyte < 0)
2191 charset_unibyte = charset_iso_8859_1;
2192
2193 return Qnil;
2194 }
2195
2196 DEFUN ("charset-id-internal", Fcharset_id_internal, Scharset_id_internal,
2197 0, 1, 0,
2198 doc: /* Internal use only.
2199 Return charset identification number of CHARSET. */)
2200 (Lisp_Object charset)
2201 {
2202 int id;
2203
2204 CHECK_CHARSET_GET_ID (charset, id);
2205 return make_number (id);
2206 }
2207
2208 struct charset_sort_data
2209 {
2210 Lisp_Object charset;
2211 int id;
2212 ptrdiff_t priority;
2213 };
2214
2215 static int
2216 charset_compare (const void *d1, const void *d2)
2217 {
2218 const struct charset_sort_data *data1 = d1, *data2 = d2;
2219 if (data1->priority != data2->priority)
2220 return data1->priority < data2->priority ? -1 : 1;
2221 return 0;
2222 }
2223
2224 DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
2225 doc: /* Sort charset list CHARSETS by a priority of each charset.
2226 Return the sorted list. CHARSETS is modified by side effects.
2227 See also `charset-priority-list' and `set-charset-priority'. */)
2228 (Lisp_Object charsets)
2229 {
2230 Lisp_Object len = Flength (charsets);
2231 ptrdiff_t n = XFASTINT (len), i, j;
2232 int done;
2233 Lisp_Object tail, elt, attrs;
2234 struct charset_sort_data *sort_data;
2235 int id, min_id = INT_MAX, max_id = INT_MIN;
2236 USE_SAFE_ALLOCA;
2237
2238 if (n == 0)
2239 return Qnil;
2240 SAFE_NALLOCA (sort_data, 1, n);
2241 for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
2242 {
2243 elt = XCAR (tail);
2244 CHECK_CHARSET_GET_ATTR (elt, attrs);
2245 sort_data[i].charset = elt;
2246 sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs));
2247 if (id < min_id)
2248 min_id = id;
2249 if (id > max_id)
2250 max_id = id;
2251 }
2252 for (done = 0, tail = Vcharset_ordered_list, i = 0;
2253 done < n && CONSP (tail); tail = XCDR (tail), i++)
2254 {
2255 elt = XCAR (tail);
2256 id = XFASTINT (elt);
2257 if (id >= min_id && id <= max_id)
2258 for (j = 0; j < n; j++)
2259 if (sort_data[j].id == id)
2260 {
2261 sort_data[j].priority = i;
2262 done++;
2263 }
2264 }
2265 qsort (sort_data, n, sizeof *sort_data, charset_compare);
2266 for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
2267 XSETCAR (tail, sort_data[i].charset);
2268 SAFE_FREE ();
2269 return charsets;
2270 }
2271
2272 \f
2273 void
2274 init_charset (void)
2275 {
2276 Lisp_Object tempdir;
2277 tempdir = Fexpand_file_name (build_string ("charsets"), Vdata_directory);
2278 if (! file_accessible_directory_p (tempdir))
2279 {
2280 /* This used to be non-fatal (dir_warning), but it should not
2281 happen, and if it does sooner or later it will cause some
2282 obscure problem (eg bug#6401), so better abort. */
2283 fprintf (stderr, "Error: charsets directory not found:\n\
2284 %s\n\
2285 Emacs will not function correctly without the character map files.\n%s\
2286 Please check your installation!\n",
2287 SDATA (tempdir),
2288 egetenv("EMACSDATA") ? "The EMACSDATA environment \
2289 variable is set, maybe it has the wrong value?\n" : "");
2290 exit (1);
2291 }
2292
2293 Vcharset_map_path = list1 (tempdir);
2294 }
2295
2296
2297 void
2298 init_charset_once (void)
2299 {
2300 int i, j, k;
2301
2302 for (i = 0; i < ISO_MAX_DIMENSION; i++)
2303 for (j = 0; j < ISO_MAX_CHARS; j++)
2304 for (k = 0; k < ISO_MAX_FINAL; k++)
2305 iso_charset_table[i][j][k] = -1;
2306
2307 for (i = 0; i < 256; i++)
2308 emacs_mule_charset[i] = -1;
2309
2310 charset_jisx0201_roman = -1;
2311 charset_jisx0208_1978 = -1;
2312 charset_jisx0208 = -1;
2313 charset_ksc5601 = -1;
2314 }
2315
2316 #ifdef emacs
2317
2318 /* Allocate an initial charset table that is large enough to handle
2319 Emacs while it is bootstrapping. As of September 2011, the size
2320 needs to be at least 166; make it a bit bigger to allow for future
2321 expansion.
2322
2323 Don't make the value so small that the table is reallocated during
2324 bootstrapping, as glibc malloc calls larger than just under 64 KiB
2325 during an initial bootstrap wreak havoc after dumping; see the
2326 M_MMAP_THRESHOLD value in alloc.c, plus there is a extra overhead
2327 internal to glibc malloc and perhaps to Emacs malloc debugging. */
2328 static struct charset charset_table_init[180];
2329
2330 void
2331 syms_of_charset (void)
2332 {
2333 DEFSYM (Qcharsetp, "charsetp");
2334
2335 /* Special charset symbols. */
2336 DEFSYM (Qascii, "ascii");
2337 DEFSYM (Qunicode, "unicode");
2338 DEFSYM (Qemacs, "emacs");
2339 DEFSYM (Qeight_bit, "eight-bit");
2340 DEFSYM (Qiso_8859_1, "iso-8859-1");
2341
2342 staticpro (&Vcharset_ordered_list);
2343 Vcharset_ordered_list = Qnil;
2344
2345 staticpro (&Viso_2022_charset_list);
2346 Viso_2022_charset_list = Qnil;
2347
2348 staticpro (&Vemacs_mule_charset_list);
2349 Vemacs_mule_charset_list = Qnil;
2350
2351 staticpro (&Vcharset_hash_table);
2352 Vcharset_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
2353
2354 charset_table = charset_table_init;
2355 charset_table_size = ARRAYELTS (charset_table_init);
2356 charset_table_used = 0;
2357
2358 defsubr (&Scharsetp);
2359 defsubr (&Smap_charset_chars);
2360 defsubr (&Sdefine_charset_internal);
2361 defsubr (&Sdefine_charset_alias);
2362 defsubr (&Scharset_plist);
2363 defsubr (&Sset_charset_plist);
2364 defsubr (&Sunify_charset);
2365 defsubr (&Sget_unused_iso_final_char);
2366 defsubr (&Sdeclare_equiv_charset);
2367 defsubr (&Sfind_charset_region);
2368 defsubr (&Sfind_charset_string);
2369 defsubr (&Sdecode_char);
2370 defsubr (&Sencode_char);
2371 defsubr (&Ssplit_char);
2372 defsubr (&Smake_char);
2373 defsubr (&Schar_charset);
2374 defsubr (&Scharset_after);
2375 defsubr (&Siso_charset);
2376 defsubr (&Sclear_charset_maps);
2377 defsubr (&Scharset_priority_list);
2378 defsubr (&Sset_charset_priority);
2379 defsubr (&Scharset_id_internal);
2380 defsubr (&Ssort_charsets);
2381
2382 DEFVAR_LISP ("charset-map-path", Vcharset_map_path,
2383 doc: /* List of directories to search for charset map files. */);
2384 Vcharset_map_path = Qnil;
2385
2386 DEFVAR_BOOL ("inhibit-load-charset-map", inhibit_load_charset_map,
2387 doc: /* Inhibit loading of charset maps. Used when dumping Emacs. */);
2388 inhibit_load_charset_map = 0;
2389
2390 DEFVAR_LISP ("charset-list", Vcharset_list,
2391 doc: /* List of all charsets ever defined. */);
2392 Vcharset_list = Qnil;
2393
2394 DEFVAR_LISP ("current-iso639-language", Vcurrent_iso639_language,
2395 doc: /* ISO639 language mnemonic symbol for the current language environment.
2396 If the current language environment is for multiple languages (e.g. "Latin-1"),
2397 the value may be a list of mnemonics. */);
2398 Vcurrent_iso639_language = Qnil;
2399
2400 charset_ascii
2401 = define_charset_internal (Qascii, 1, "\x00\x7F\0\0\0\0\0",
2402 0, 127, 'B', -1, 0, 1, 0, 0);
2403 charset_iso_8859_1
2404 = define_charset_internal (Qiso_8859_1, 1, "\x00\xFF\0\0\0\0\0",
2405 0, 255, -1, -1, -1, 1, 0, 0);
2406 charset_unicode
2407 = define_charset_internal (Qunicode, 3, "\x00\xFF\x00\xFF\x00\x10\0",
2408 0, MAX_UNICODE_CHAR, -1, 0, -1, 1, 0, 0);
2409 charset_emacs
2410 = define_charset_internal (Qemacs, 3, "\x00\xFF\x00\xFF\x00\x3F\0",
2411 0, MAX_5_BYTE_CHAR, -1, 0, -1, 1, 1, 0);
2412 charset_eight_bit
2413 = define_charset_internal (Qeight_bit, 1, "\x80\xFF\0\0\0\0\0",
2414 128, 255, -1, 0, -1, 0, 1,
2415 MAX_5_BYTE_CHAR + 1);
2416 charset_unibyte = charset_iso_8859_1;
2417 }
2418
2419 #endif /* emacs */