]> code.delx.au - gnu-emacs/blob - src/coding.c
Merge from emacs-24; up to 2012-12-06T01:39:03Z!monnier@iro.umontreal.ca
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return true if the byte sequence conforms to XXX.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
158 {
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
165
166 while (1)
167 {
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
176 }
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
180
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
185 }
186 #endif
187
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
194
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
199
200 Below is the template of these functions. */
201
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
205 {
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
239 }
240 #endif
241
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
248
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
253
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
257
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
262 {
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
280 }
281 #endif
282
283 \f
284 /*** 1. Preamble ***/
285
286 #include <config.h>
287 #include <stdio.h>
288
289 #include "lisp.h"
290 #include "character.h"
291 #include "buffer.h"
292 #include "charset.h"
293 #include "ccl.h"
294 #include "composite.h"
295 #include "coding.h"
296 #include "window.h"
297 #include "frame.h"
298 #include "termhooks.h"
299
300 Lisp_Object Vcoding_system_hash_table;
301
302 static Lisp_Object Qcoding_system, Qeol_type;
303 static Lisp_Object Qcoding_aliases;
304 Lisp_Object Qunix, Qdos;
305 static Lisp_Object Qmac;
306 Lisp_Object Qbuffer_file_coding_system;
307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 static Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qutf_8;
311 static Lisp_Object Qiso_2022;
312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313 static Lisp_Object Qbig, Qlittle;
314 static Lisp_Object Qcoding_system_history;
315 static Lisp_Object Qvalid_codes;
316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319 static Lisp_Object QCascii_compatible_p;
320
321 Lisp_Object Qcall_process, Qcall_process_region;
322 Lisp_Object Qstart_process, Qopen_network_stream;
323 static Lisp_Object Qtarget_idx;
324
325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
327
328 /* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330 static Lisp_Object Qcoding_system_define_form;
331
332 /* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336 static Lisp_Object system_eol_type;
337
338 #ifdef emacs
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 #if defined (WINDOWSNT) || defined (CYGWIN)
348 static Lisp_Object Qutf_16le;
349 #endif
350
351 /* Coding-systems are handed between Emacs Lisp programs and C internal
352 routines by the following three variables. */
353 /* Coding system to be used to encode text for terminal display when
354 terminal coding system is nil. */
355 struct coding_system safe_terminal_coding;
356
357 #endif /* emacs */
358
359 Lisp_Object Qtranslation_table;
360 Lisp_Object Qtranslation_table_id;
361 static Lisp_Object Qtranslation_table_for_decode;
362 static Lisp_Object Qtranslation_table_for_encode;
363
364 /* Two special coding systems. */
365 static Lisp_Object Vsjis_coding_system;
366 static Lisp_Object Vbig5_coding_system;
367
368 /* ISO2022 section */
369
370 #define CODING_ISO_INITIAL(coding, reg) \
371 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
372 coding_attr_iso_initial), \
373 reg)))
374
375
376 #define CODING_ISO_REQUEST(coding, charset_id) \
377 (((charset_id) <= (coding)->max_charset_id \
378 ? ((coding)->safe_charsets[charset_id] != 255 \
379 ? (coding)->safe_charsets[charset_id] \
380 : -1) \
381 : -1))
382
383
384 #define CODING_ISO_FLAGS(coding) \
385 ((coding)->spec.iso_2022.flags)
386 #define CODING_ISO_DESIGNATION(coding, reg) \
387 ((coding)->spec.iso_2022.current_designation[reg])
388 #define CODING_ISO_INVOCATION(coding, plane) \
389 ((coding)->spec.iso_2022.current_invocation[plane])
390 #define CODING_ISO_SINGLE_SHIFTING(coding) \
391 ((coding)->spec.iso_2022.single_shifting)
392 #define CODING_ISO_BOL(coding) \
393 ((coding)->spec.iso_2022.bol)
394 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
395 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
396 #define CODING_ISO_CMP_STATUS(coding) \
397 (&(coding)->spec.iso_2022.cmp_status)
398 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
399 ((coding)->spec.iso_2022.ctext_extended_segment_len)
400 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
401 ((coding)->spec.iso_2022.embedded_utf_8)
402
403 /* Control characters of ISO2022. */
404 /* code */ /* function */
405 #define ISO_CODE_SO 0x0E /* shift-out */
406 #define ISO_CODE_SI 0x0F /* shift-in */
407 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
408 #define ISO_CODE_ESC 0x1B /* escape */
409 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
410 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
411 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
412
413 /* All code (1-byte) of ISO2022 is classified into one of the
414 followings. */
415 enum iso_code_class_type
416 {
417 ISO_control_0, /* Control codes in the range
418 0x00..0x1F and 0x7F, except for the
419 following 5 codes. */
420 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
421 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
422 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
423 ISO_escape, /* ISO_CODE_ESC (0x1B) */
424 ISO_control_1, /* Control codes in the range
425 0x80..0x9F, except for the
426 following 3 codes. */
427 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
428 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
429 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
430 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
431 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
432 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
433 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
434 };
435
436 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
437 `iso-flags' attribute of an iso2022 coding system. */
438
439 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
440 instead of the correct short-form sequence (e.g. ESC $ A). */
441 #define CODING_ISO_FLAG_LONG_FORM 0x0001
442
443 /* If set, reset graphic planes and registers at end-of-line to the
444 initial state. */
445 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
446
447 /* If set, reset graphic planes and registers before any control
448 characters to the initial state. */
449 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
450
451 /* If set, encode by 7-bit environment. */
452 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
453
454 /* If set, use locking-shift function. */
455 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
456
457 /* If set, use single-shift function. Overwrite
458 CODING_ISO_FLAG_LOCKING_SHIFT. */
459 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
460
461 /* If set, use designation escape sequence. */
462 #define CODING_ISO_FLAG_DESIGNATION 0x0040
463
464 /* If set, produce revision number sequence. */
465 #define CODING_ISO_FLAG_REVISION 0x0080
466
467 /* If set, produce ISO6429's direction specifying sequence. */
468 #define CODING_ISO_FLAG_DIRECTION 0x0100
469
470 /* If set, assume designation states are reset at beginning of line on
471 output. */
472 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
473
474 /* If set, designation sequence should be placed at beginning of line
475 on output. */
476 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
477
478 /* If set, do not encode unsafe characters on output. */
479 #define CODING_ISO_FLAG_SAFE 0x0800
480
481 /* If set, extra latin codes (128..159) are accepted as a valid code
482 on input. */
483 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
484
485 #define CODING_ISO_FLAG_COMPOSITION 0x2000
486
487 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
488
489 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
490
491 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
492
493 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
494
495 /* A character to be produced on output if encoding of the original
496 character is prohibited by CODING_ISO_FLAG_SAFE. */
497 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
498
499 /* UTF-8 section */
500 #define CODING_UTF_8_BOM(coding) \
501 ((coding)->spec.utf_8_bom)
502
503 /* UTF-16 section */
504 #define CODING_UTF_16_BOM(coding) \
505 ((coding)->spec.utf_16.bom)
506
507 #define CODING_UTF_16_ENDIAN(coding) \
508 ((coding)->spec.utf_16.endian)
509
510 #define CODING_UTF_16_SURROGATE(coding) \
511 ((coding)->spec.utf_16.surrogate)
512
513
514 /* CCL section */
515 #define CODING_CCL_DECODER(coding) \
516 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
517 #define CODING_CCL_ENCODER(coding) \
518 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
519 #define CODING_CCL_VALIDS(coding) \
520 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
521
522 /* Index for each coding category in `coding_categories' */
523
524 enum coding_category
525 {
526 coding_category_iso_7,
527 coding_category_iso_7_tight,
528 coding_category_iso_8_1,
529 coding_category_iso_8_2,
530 coding_category_iso_7_else,
531 coding_category_iso_8_else,
532 coding_category_utf_8_auto,
533 coding_category_utf_8_nosig,
534 coding_category_utf_8_sig,
535 coding_category_utf_16_auto,
536 coding_category_utf_16_be,
537 coding_category_utf_16_le,
538 coding_category_utf_16_be_nosig,
539 coding_category_utf_16_le_nosig,
540 coding_category_charset,
541 coding_category_sjis,
542 coding_category_big5,
543 coding_category_ccl,
544 coding_category_emacs_mule,
545 /* All above are targets of code detection. */
546 coding_category_raw_text,
547 coding_category_undecided,
548 coding_category_max
549 };
550
551 /* Definitions of flag bits used in detect_coding_XXXX. */
552 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
553 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
554 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
555 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
556 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
557 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
558 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
559 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
560 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
561 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
562 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
563 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
564 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
565 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
566 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
567 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
568 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
569 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
570 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
571 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
572
573 /* This value is returned if detect_coding_mask () find nothing other
574 than ASCII characters. */
575 #define CATEGORY_MASK_ANY \
576 (CATEGORY_MASK_ISO_7 \
577 | CATEGORY_MASK_ISO_7_TIGHT \
578 | CATEGORY_MASK_ISO_8_1 \
579 | CATEGORY_MASK_ISO_8_2 \
580 | CATEGORY_MASK_ISO_7_ELSE \
581 | CATEGORY_MASK_ISO_8_ELSE \
582 | CATEGORY_MASK_UTF_8_AUTO \
583 | CATEGORY_MASK_UTF_8_NOSIG \
584 | CATEGORY_MASK_UTF_8_SIG \
585 | CATEGORY_MASK_UTF_16_AUTO \
586 | CATEGORY_MASK_UTF_16_BE \
587 | CATEGORY_MASK_UTF_16_LE \
588 | CATEGORY_MASK_UTF_16_BE_NOSIG \
589 | CATEGORY_MASK_UTF_16_LE_NOSIG \
590 | CATEGORY_MASK_CHARSET \
591 | CATEGORY_MASK_SJIS \
592 | CATEGORY_MASK_BIG5 \
593 | CATEGORY_MASK_CCL \
594 | CATEGORY_MASK_EMACS_MULE)
595
596
597 #define CATEGORY_MASK_ISO_7BIT \
598 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
599
600 #define CATEGORY_MASK_ISO_8BIT \
601 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
602
603 #define CATEGORY_MASK_ISO_ELSE \
604 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
605
606 #define CATEGORY_MASK_ISO_ESCAPE \
607 (CATEGORY_MASK_ISO_7 \
608 | CATEGORY_MASK_ISO_7_TIGHT \
609 | CATEGORY_MASK_ISO_7_ELSE \
610 | CATEGORY_MASK_ISO_8_ELSE)
611
612 #define CATEGORY_MASK_ISO \
613 ( CATEGORY_MASK_ISO_7BIT \
614 | CATEGORY_MASK_ISO_8BIT \
615 | CATEGORY_MASK_ISO_ELSE)
616
617 #define CATEGORY_MASK_UTF_16 \
618 (CATEGORY_MASK_UTF_16_AUTO \
619 | CATEGORY_MASK_UTF_16_BE \
620 | CATEGORY_MASK_UTF_16_LE \
621 | CATEGORY_MASK_UTF_16_BE_NOSIG \
622 | CATEGORY_MASK_UTF_16_LE_NOSIG)
623
624 #define CATEGORY_MASK_UTF_8 \
625 (CATEGORY_MASK_UTF_8_AUTO \
626 | CATEGORY_MASK_UTF_8_NOSIG \
627 | CATEGORY_MASK_UTF_8_SIG)
628
629 /* Table of coding categories (Lisp symbols). This variable is for
630 internal use only. */
631 static Lisp_Object Vcoding_category_table;
632
633 /* Table of coding-categories ordered by priority. */
634 static enum coding_category coding_priorities[coding_category_max];
635
636 /* Nth element is a coding context for the coding system bound to the
637 Nth coding category. */
638 static struct coding_system coding_categories[coding_category_max];
639
640 /*** Commonly used macros and functions ***/
641
642 #ifndef min
643 #define min(a, b) ((a) < (b) ? (a) : (b))
644 #endif
645 #ifndef max
646 #define max(a, b) ((a) > (b) ? (a) : (b))
647 #endif
648
649 #define CODING_GET_INFO(coding, attrs, charset_list) \
650 do { \
651 (attrs) = CODING_ID_ATTRS ((coding)->id); \
652 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
653 } while (0)
654
655
656 /* Safely get one byte from the source text pointed by SRC which ends
657 at SRC_END, and set C to that byte. If there are not enough bytes
658 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
659 and a multibyte character is found at SRC, set C to the
660 negative value of the character code. The caller should declare
661 and set these variables appropriately in advance:
662 src, src_end, multibytep */
663
664 #define ONE_MORE_BYTE(c) \
665 do { \
666 if (src == src_end) \
667 { \
668 if (src_base < src) \
669 record_conversion_result \
670 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
671 goto no_more_source; \
672 } \
673 c = *src++; \
674 if (multibytep && (c & 0x80)) \
675 { \
676 if ((c & 0xFE) == 0xC0) \
677 c = ((c & 1) << 6) | *src++; \
678 else \
679 { \
680 src--; \
681 c = - string_char (src, &src, NULL); \
682 record_conversion_result \
683 (coding, CODING_RESULT_INVALID_SRC); \
684 } \
685 } \
686 consumed_chars++; \
687 } while (0)
688
689 /* Safely get two bytes from the source text pointed by SRC which ends
690 at SRC_END, and set C1 and C2 to those bytes while skipping the
691 heading multibyte characters. If there are not enough bytes in the
692 source, it jumps to 'no_more_source'. If MULTIBYTEP and
693 a multibyte character is found for C2, set C2 to the negative value
694 of the character code. The caller should declare and set these
695 variables appropriately in advance:
696 src, src_end, multibytep
697 It is intended that this macro is used in detect_coding_utf_16. */
698
699 #define TWO_MORE_BYTES(c1, c2) \
700 do { \
701 do { \
702 if (src == src_end) \
703 goto no_more_source; \
704 c1 = *src++; \
705 if (multibytep && (c1 & 0x80)) \
706 { \
707 if ((c1 & 0xFE) == 0xC0) \
708 c1 = ((c1 & 1) << 6) | *src++; \
709 else \
710 { \
711 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
712 c1 = -1; \
713 } \
714 } \
715 } while (c1 < 0); \
716 if (src == src_end) \
717 goto no_more_source; \
718 c2 = *src++; \
719 if (multibytep && (c2 & 0x80)) \
720 { \
721 if ((c2 & 0xFE) == 0xC0) \
722 c2 = ((c2 & 1) << 6) | *src++; \
723 else \
724 c2 = -1; \
725 } \
726 } while (0)
727
728
729 /* Store a byte C in the place pointed by DST and increment DST to the
730 next free point, and increment PRODUCED_CHARS. The caller should
731 assure that C is 0..127, and declare and set the variable `dst'
732 appropriately in advance.
733 */
734
735
736 #define EMIT_ONE_ASCII_BYTE(c) \
737 do { \
738 produced_chars++; \
739 *dst++ = (c); \
740 } while (0)
741
742
743 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
744
745 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
746 do { \
747 produced_chars += 2; \
748 *dst++ = (c1), *dst++ = (c2); \
749 } while (0)
750
751
752 /* Store a byte C in the place pointed by DST and increment DST to the
753 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
754 store in an appropriate multibyte form. The caller should
755 declare and set the variables `dst' and `multibytep' appropriately
756 in advance. */
757
758 #define EMIT_ONE_BYTE(c) \
759 do { \
760 produced_chars++; \
761 if (multibytep) \
762 { \
763 unsigned ch = (c); \
764 if (ch >= 0x80) \
765 ch = BYTE8_TO_CHAR (ch); \
766 CHAR_STRING_ADVANCE (ch, dst); \
767 } \
768 else \
769 *dst++ = (c); \
770 } while (0)
771
772
773 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
774
775 #define EMIT_TWO_BYTES(c1, c2) \
776 do { \
777 produced_chars += 2; \
778 if (multibytep) \
779 { \
780 unsigned ch; \
781 \
782 ch = (c1); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 ch = (c2); \
787 if (ch >= 0x80) \
788 ch = BYTE8_TO_CHAR (ch); \
789 CHAR_STRING_ADVANCE (ch, dst); \
790 } \
791 else \
792 { \
793 *dst++ = (c1); \
794 *dst++ = (c2); \
795 } \
796 } while (0)
797
798
799 #define EMIT_THREE_BYTES(c1, c2, c3) \
800 do { \
801 EMIT_ONE_BYTE (c1); \
802 EMIT_TWO_BYTES (c2, c3); \
803 } while (0)
804
805
806 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
807 do { \
808 EMIT_TWO_BYTES (c1, c2); \
809 EMIT_TWO_BYTES (c3, c4); \
810 } while (0)
811
812
813 static void
814 record_conversion_result (struct coding_system *coding,
815 enum coding_result_code result)
816 {
817 coding->result = result;
818 switch (result)
819 {
820 case CODING_RESULT_INSUFFICIENT_SRC:
821 Vlast_code_conversion_error = Qinsufficient_source;
822 break;
823 case CODING_RESULT_INCONSISTENT_EOL:
824 Vlast_code_conversion_error = Qinconsistent_eol;
825 break;
826 case CODING_RESULT_INVALID_SRC:
827 Vlast_code_conversion_error = Qinvalid_source;
828 break;
829 case CODING_RESULT_INTERRUPT:
830 Vlast_code_conversion_error = Qinterrupted;
831 break;
832 case CODING_RESULT_INSUFFICIENT_MEM:
833 Vlast_code_conversion_error = Qinsufficient_memory;
834 break;
835 case CODING_RESULT_INSUFFICIENT_DST:
836 /* Don't record this error in Vlast_code_conversion_error
837 because it happens just temporarily and is resolved when the
838 whole conversion is finished. */
839 break;
840 case CODING_RESULT_SUCCESS:
841 break;
842 default:
843 Vlast_code_conversion_error = intern ("Unknown error");
844 }
845 }
846
847 /* These wrapper macros are used to preserve validity of pointers into
848 buffer text across calls to decode_char, encode_char, etc, which
849 could cause relocation of buffers if it loads a charset map,
850 because loading a charset map allocates large structures. */
851
852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
853 do { \
854 ptrdiff_t offset; \
855 \
856 charset_map_loaded = 0; \
857 c = DECODE_CHAR (charset, code); \
858 if (charset_map_loaded \
859 && (offset = coding_change_source (coding))) \
860 { \
861 src += offset; \
862 src_base += offset; \
863 src_end += offset; \
864 } \
865 } while (0)
866
867 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
868 do { \
869 ptrdiff_t offset; \
870 \
871 charset_map_loaded = 0; \
872 code = ENCODE_CHAR (charset, c); \
873 if (charset_map_loaded \
874 && (offset = coding_change_destination (coding))) \
875 { \
876 dst += offset; \
877 dst_end += offset; \
878 } \
879 } while (0)
880
881 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
882 do { \
883 ptrdiff_t offset; \
884 \
885 charset_map_loaded = 0; \
886 charset = char_charset (c, charset_list, code_return); \
887 if (charset_map_loaded \
888 && (offset = coding_change_destination (coding))) \
889 { \
890 dst += offset; \
891 dst_end += offset; \
892 } \
893 } while (0)
894
895 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
896 do { \
897 ptrdiff_t offset; \
898 \
899 charset_map_loaded = 0; \
900 result = CHAR_CHARSET_P (c, charset); \
901 if (charset_map_loaded \
902 && (offset = coding_change_destination (coding))) \
903 { \
904 dst += offset; \
905 dst_end += offset; \
906 } \
907 } while (0)
908
909
910 /* If there are at least BYTES length of room at dst, allocate memory
911 for coding->destination and update dst and dst_end. We don't have
912 to take care of coding->source which will be relocated. It is
913 handled by calling coding_set_source in encode_coding. */
914
915 #define ASSURE_DESTINATION(bytes) \
916 do { \
917 if (dst + (bytes) >= dst_end) \
918 { \
919 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
920 \
921 dst = alloc_destination (coding, more_bytes, dst); \
922 dst_end = coding->destination + coding->dst_bytes; \
923 } \
924 } while (0)
925
926
927 /* Store multibyte form of the character C in P, and advance P to the
928 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
929 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
930 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
931
932 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
933
934 /* Return the character code of character whose multibyte form is at
935 P, and advance P to the end of the multibyte form. This used to be
936 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
937 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
938
939 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
940
941 /* Set coding->source from coding->src_object. */
942
943 static void
944 coding_set_source (struct coding_system *coding)
945 {
946 if (BUFFERP (coding->src_object))
947 {
948 struct buffer *buf = XBUFFER (coding->src_object);
949
950 if (coding->src_pos < 0)
951 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
952 else
953 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
954 }
955 else if (STRINGP (coding->src_object))
956 {
957 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
958 }
959 else
960 {
961 /* Otherwise, the source is C string and is never relocated
962 automatically. Thus we don't have to update anything. */
963 }
964 }
965
966
967 /* Set coding->source from coding->src_object, and return how many
968 bytes coding->source was changed. */
969
970 static ptrdiff_t
971 coding_change_source (struct coding_system *coding)
972 {
973 const unsigned char *orig = coding->source;
974 coding_set_source (coding);
975 return coding->source - orig;
976 }
977
978
979 /* Set coding->destination from coding->dst_object. */
980
981 static void
982 coding_set_destination (struct coding_system *coding)
983 {
984 if (BUFFERP (coding->dst_object))
985 {
986 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
987 {
988 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
989 coding->dst_bytes = (GAP_END_ADDR
990 - (coding->src_bytes - coding->consumed)
991 - coding->destination);
992 }
993 else
994 {
995 /* We are sure that coding->dst_pos_byte is before the gap
996 of the buffer. */
997 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
998 + coding->dst_pos_byte - BEG_BYTE);
999 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1000 - coding->destination);
1001 }
1002 }
1003 else
1004 {
1005 /* Otherwise, the destination is C string and is never relocated
1006 automatically. Thus we don't have to update anything. */
1007 }
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object, and return how
1012 many bytes coding->destination was changed. */
1013
1014 static ptrdiff_t
1015 coding_change_destination (struct coding_system *coding)
1016 {
1017 const unsigned char *orig = coding->destination;
1018 coding_set_destination (coding);
1019 return coding->destination - orig;
1020 }
1021
1022
1023 static void
1024 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1025 {
1026 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1027 string_overflow ();
1028 coding->destination = xrealloc (coding->destination,
1029 coding->dst_bytes + bytes);
1030 coding->dst_bytes += bytes;
1031 }
1032
1033 static void
1034 coding_alloc_by_making_gap (struct coding_system *coding,
1035 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1036 {
1037 if (EQ (coding->src_object, coding->dst_object))
1038 {
1039 /* The gap may contain the produced data at the head and not-yet
1040 consumed data at the tail. To preserve those data, we at
1041 first make the gap size to zero, then increase the gap
1042 size. */
1043 ptrdiff_t add = GAP_SIZE;
1044
1045 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1046 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1047 make_gap (bytes);
1048 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1049 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1050 }
1051 else
1052 {
1053 Lisp_Object this_buffer;
1054
1055 this_buffer = Fcurrent_buffer ();
1056 set_buffer_internal (XBUFFER (coding->dst_object));
1057 make_gap (bytes);
1058 set_buffer_internal (XBUFFER (this_buffer));
1059 }
1060 }
1061
1062
1063 static unsigned char *
1064 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1065 unsigned char *dst)
1066 {
1067 ptrdiff_t offset = dst - coding->destination;
1068
1069 if (BUFFERP (coding->dst_object))
1070 {
1071 struct buffer *buf = XBUFFER (coding->dst_object);
1072
1073 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1074 }
1075 else
1076 coding_alloc_by_realloc (coding, nbytes);
1077 coding_set_destination (coding);
1078 dst = coding->destination + offset;
1079 return dst;
1080 }
1081
1082 /** Macros for annotations. */
1083
1084 /* An annotation data is stored in the array coding->charbuf in this
1085 format:
1086 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1087 LENGTH is the number of elements in the annotation.
1088 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1089 NCHARS is the number of characters in the text annotated.
1090
1091 The format of the following elements depend on ANNOTATION_MASK.
1092
1093 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1094 follows:
1095 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1096
1097 NBYTES is the number of bytes specified in the header part of
1098 old-style emacs-mule encoding, or 0 for the other kind of
1099 composition.
1100
1101 METHOD is one of enum composition_method.
1102
1103 Optional COMPOSITION-COMPONENTS are characters and composition
1104 rules.
1105
1106 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1107 follows.
1108
1109 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1110 recover from an invalid annotation, and should be skipped by
1111 produce_annotation. */
1112
1113 /* Maximum length of the header of annotation data. */
1114 #define MAX_ANNOTATION_LENGTH 5
1115
1116 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1117 do { \
1118 *(buf)++ = -(len); \
1119 *(buf)++ = (mask); \
1120 *(buf)++ = (nchars); \
1121 coding->annotated = 1; \
1122 } while (0);
1123
1124 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1125 do { \
1126 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1127 *buf++ = nbytes; \
1128 *buf++ = method; \
1129 } while (0)
1130
1131
1132 #define ADD_CHARSET_DATA(buf, nchars, id) \
1133 do { \
1134 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1135 *buf++ = id; \
1136 } while (0)
1137
1138 \f
1139 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1140
1141
1142
1143 \f
1144 /*** 3. UTF-8 ***/
1145
1146 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1147 Return true if a text is encoded in UTF-8. */
1148
1149 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1150 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1151 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1152 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1153 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1154 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1155
1156 #define UTF_8_BOM_1 0xEF
1157 #define UTF_8_BOM_2 0xBB
1158 #define UTF_8_BOM_3 0xBF
1159
1160 static bool
1161 detect_coding_utf_8 (struct coding_system *coding,
1162 struct coding_detection_info *detect_info)
1163 {
1164 const unsigned char *src = coding->source, *src_base;
1165 const unsigned char *src_end = coding->source + coding->src_bytes;
1166 bool multibytep = coding->src_multibyte;
1167 ptrdiff_t consumed_chars = 0;
1168 bool bom_found = 0;
1169 bool found = 0;
1170
1171 detect_info->checked |= CATEGORY_MASK_UTF_8;
1172 /* A coding system of this category is always ASCII compatible. */
1173 src += coding->head_ascii;
1174
1175 while (1)
1176 {
1177 int c, c1, c2, c3, c4;
1178
1179 src_base = src;
1180 ONE_MORE_BYTE (c);
1181 if (c < 0 || UTF_8_1_OCTET_P (c))
1182 continue;
1183 ONE_MORE_BYTE (c1);
1184 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1185 break;
1186 if (UTF_8_2_OCTET_LEADING_P (c))
1187 {
1188 found = 1;
1189 continue;
1190 }
1191 ONE_MORE_BYTE (c2);
1192 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1193 break;
1194 if (UTF_8_3_OCTET_LEADING_P (c))
1195 {
1196 found = 1;
1197 if (src_base == coding->source
1198 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1199 bom_found = 1;
1200 continue;
1201 }
1202 ONE_MORE_BYTE (c3);
1203 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1204 break;
1205 if (UTF_8_4_OCTET_LEADING_P (c))
1206 {
1207 found = 1;
1208 continue;
1209 }
1210 ONE_MORE_BYTE (c4);
1211 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1212 break;
1213 if (UTF_8_5_OCTET_LEADING_P (c))
1214 {
1215 found = 1;
1216 continue;
1217 }
1218 break;
1219 }
1220 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1221 return 0;
1222
1223 no_more_source:
1224 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1225 {
1226 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1227 return 0;
1228 }
1229 if (bom_found)
1230 {
1231 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1232 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1233 }
1234 else
1235 {
1236 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1237 if (found)
1238 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1239 }
1240 return 1;
1241 }
1242
1243
1244 static void
1245 decode_coding_utf_8 (struct coding_system *coding)
1246 {
1247 const unsigned char *src = coding->source + coding->consumed;
1248 const unsigned char *src_end = coding->source + coding->src_bytes;
1249 const unsigned char *src_base;
1250 int *charbuf = coding->charbuf + coding->charbuf_used;
1251 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1252 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1253 bool multibytep = coding->src_multibyte;
1254 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1255 bool eol_dos
1256 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1257 int byte_after_cr = -1;
1258
1259 if (bom != utf_without_bom)
1260 {
1261 int c1, c2, c3;
1262
1263 src_base = src;
1264 ONE_MORE_BYTE (c1);
1265 if (! UTF_8_3_OCTET_LEADING_P (c1))
1266 src = src_base;
1267 else
1268 {
1269 ONE_MORE_BYTE (c2);
1270 if (! UTF_8_EXTRA_OCTET_P (c2))
1271 src = src_base;
1272 else
1273 {
1274 ONE_MORE_BYTE (c3);
1275 if (! UTF_8_EXTRA_OCTET_P (c3))
1276 src = src_base;
1277 else
1278 {
1279 if ((c1 != UTF_8_BOM_1)
1280 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1281 src = src_base;
1282 else
1283 CODING_UTF_8_BOM (coding) = utf_without_bom;
1284 }
1285 }
1286 }
1287 }
1288 CODING_UTF_8_BOM (coding) = utf_without_bom;
1289
1290 while (1)
1291 {
1292 int c, c1, c2, c3, c4, c5;
1293
1294 src_base = src;
1295 consumed_chars_base = consumed_chars;
1296
1297 if (charbuf >= charbuf_end)
1298 {
1299 if (byte_after_cr >= 0)
1300 src_base--;
1301 break;
1302 }
1303
1304 if (byte_after_cr >= 0)
1305 c1 = byte_after_cr, byte_after_cr = -1;
1306 else
1307 ONE_MORE_BYTE (c1);
1308 if (c1 < 0)
1309 {
1310 c = - c1;
1311 }
1312 else if (UTF_8_1_OCTET_P (c1))
1313 {
1314 if (eol_dos && c1 == '\r')
1315 ONE_MORE_BYTE (byte_after_cr);
1316 c = c1;
1317 }
1318 else
1319 {
1320 ONE_MORE_BYTE (c2);
1321 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1322 goto invalid_code;
1323 if (UTF_8_2_OCTET_LEADING_P (c1))
1324 {
1325 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1326 /* Reject overlong sequences here and below. Encoders
1327 producing them are incorrect, they can be misleading,
1328 and they mess up read/write invariance. */
1329 if (c < 128)
1330 goto invalid_code;
1331 }
1332 else
1333 {
1334 ONE_MORE_BYTE (c3);
1335 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1336 goto invalid_code;
1337 if (UTF_8_3_OCTET_LEADING_P (c1))
1338 {
1339 c = (((c1 & 0xF) << 12)
1340 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1341 if (c < 0x800
1342 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1343 goto invalid_code;
1344 }
1345 else
1346 {
1347 ONE_MORE_BYTE (c4);
1348 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1349 goto invalid_code;
1350 if (UTF_8_4_OCTET_LEADING_P (c1))
1351 {
1352 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1353 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1354 if (c < 0x10000)
1355 goto invalid_code;
1356 }
1357 else
1358 {
1359 ONE_MORE_BYTE (c5);
1360 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1361 goto invalid_code;
1362 if (UTF_8_5_OCTET_LEADING_P (c1))
1363 {
1364 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1365 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1366 | (c5 & 0x3F));
1367 if ((c > MAX_CHAR) || (c < 0x200000))
1368 goto invalid_code;
1369 }
1370 else
1371 goto invalid_code;
1372 }
1373 }
1374 }
1375 }
1376
1377 *charbuf++ = c;
1378 continue;
1379
1380 invalid_code:
1381 src = src_base;
1382 consumed_chars = consumed_chars_base;
1383 ONE_MORE_BYTE (c);
1384 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1385 coding->errors++;
1386 }
1387
1388 no_more_source:
1389 coding->consumed_char += consumed_chars_base;
1390 coding->consumed = src_base - coding->source;
1391 coding->charbuf_used = charbuf - coding->charbuf;
1392 }
1393
1394
1395 static bool
1396 encode_coding_utf_8 (struct coding_system *coding)
1397 {
1398 bool multibytep = coding->dst_multibyte;
1399 int *charbuf = coding->charbuf;
1400 int *charbuf_end = charbuf + coding->charbuf_used;
1401 unsigned char *dst = coding->destination + coding->produced;
1402 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1403 ptrdiff_t produced_chars = 0;
1404 int c;
1405
1406 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1407 {
1408 ASSURE_DESTINATION (3);
1409 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1410 CODING_UTF_8_BOM (coding) = utf_without_bom;
1411 }
1412
1413 if (multibytep)
1414 {
1415 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1416
1417 while (charbuf < charbuf_end)
1418 {
1419 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1420
1421 ASSURE_DESTINATION (safe_room);
1422 c = *charbuf++;
1423 if (CHAR_BYTE8_P (c))
1424 {
1425 c = CHAR_TO_BYTE8 (c);
1426 EMIT_ONE_BYTE (c);
1427 }
1428 else
1429 {
1430 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1431 for (p = str; p < pend; p++)
1432 EMIT_ONE_BYTE (*p);
1433 }
1434 }
1435 }
1436 else
1437 {
1438 int safe_room = MAX_MULTIBYTE_LENGTH;
1439
1440 while (charbuf < charbuf_end)
1441 {
1442 ASSURE_DESTINATION (safe_room);
1443 c = *charbuf++;
1444 if (CHAR_BYTE8_P (c))
1445 *dst++ = CHAR_TO_BYTE8 (c);
1446 else
1447 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1448 produced_chars++;
1449 }
1450 }
1451 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1452 coding->produced_char += produced_chars;
1453 coding->produced = dst - coding->destination;
1454 return 0;
1455 }
1456
1457
1458 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1459 Return true if a text is encoded in one of UTF-16 based coding systems. */
1460
1461 #define UTF_16_HIGH_SURROGATE_P(val) \
1462 (((val) & 0xFC00) == 0xD800)
1463
1464 #define UTF_16_LOW_SURROGATE_P(val) \
1465 (((val) & 0xFC00) == 0xDC00)
1466
1467
1468 static bool
1469 detect_coding_utf_16 (struct coding_system *coding,
1470 struct coding_detection_info *detect_info)
1471 {
1472 const unsigned char *src = coding->source;
1473 const unsigned char *src_end = coding->source + coding->src_bytes;
1474 bool multibytep = coding->src_multibyte;
1475 int c1, c2;
1476
1477 detect_info->checked |= CATEGORY_MASK_UTF_16;
1478 if (coding->mode & CODING_MODE_LAST_BLOCK
1479 && (coding->src_chars & 1))
1480 {
1481 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1482 return 0;
1483 }
1484
1485 TWO_MORE_BYTES (c1, c2);
1486 if ((c1 == 0xFF) && (c2 == 0xFE))
1487 {
1488 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1489 | CATEGORY_MASK_UTF_16_AUTO);
1490 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1491 | CATEGORY_MASK_UTF_16_BE_NOSIG
1492 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1493 }
1494 else if ((c1 == 0xFE) && (c2 == 0xFF))
1495 {
1496 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1497 | CATEGORY_MASK_UTF_16_AUTO);
1498 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1499 | CATEGORY_MASK_UTF_16_BE_NOSIG
1500 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1501 }
1502 else if (c2 < 0)
1503 {
1504 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1505 return 0;
1506 }
1507 else
1508 {
1509 /* We check the dispersion of Eth and Oth bytes where E is even and
1510 O is odd. If both are high, we assume binary data.*/
1511 unsigned char e[256], o[256];
1512 unsigned e_num = 1, o_num = 1;
1513
1514 memset (e, 0, 256);
1515 memset (o, 0, 256);
1516 e[c1] = 1;
1517 o[c2] = 1;
1518
1519 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1520 |CATEGORY_MASK_UTF_16_BE
1521 | CATEGORY_MASK_UTF_16_LE);
1522
1523 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1524 != CATEGORY_MASK_UTF_16)
1525 {
1526 TWO_MORE_BYTES (c1, c2);
1527 if (c2 < 0)
1528 break;
1529 if (! e[c1])
1530 {
1531 e[c1] = 1;
1532 e_num++;
1533 if (e_num >= 128)
1534 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1535 }
1536 if (! o[c2])
1537 {
1538 o[c2] = 1;
1539 o_num++;
1540 if (o_num >= 128)
1541 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1542 }
1543 }
1544 return 0;
1545 }
1546
1547 no_more_source:
1548 return 1;
1549 }
1550
1551 static void
1552 decode_coding_utf_16 (struct coding_system *coding)
1553 {
1554 const unsigned char *src = coding->source + coding->consumed;
1555 const unsigned char *src_end = coding->source + coding->src_bytes;
1556 const unsigned char *src_base;
1557 int *charbuf = coding->charbuf + coding->charbuf_used;
1558 /* We may produces at most 3 chars in one loop. */
1559 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1560 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1561 bool multibytep = coding->src_multibyte;
1562 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1563 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1564 int surrogate = CODING_UTF_16_SURROGATE (coding);
1565 bool eol_dos
1566 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1567 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1568
1569 if (bom == utf_with_bom)
1570 {
1571 int c, c1, c2;
1572
1573 src_base = src;
1574 ONE_MORE_BYTE (c1);
1575 ONE_MORE_BYTE (c2);
1576 c = (c1 << 8) | c2;
1577
1578 if (endian == utf_16_big_endian
1579 ? c != 0xFEFF : c != 0xFFFE)
1580 {
1581 /* The first two bytes are not BOM. Treat them as bytes
1582 for a normal character. */
1583 src = src_base;
1584 coding->errors++;
1585 }
1586 CODING_UTF_16_BOM (coding) = utf_without_bom;
1587 }
1588 else if (bom == utf_detect_bom)
1589 {
1590 /* We have already tried to detect BOM and failed in
1591 detect_coding. */
1592 CODING_UTF_16_BOM (coding) = utf_without_bom;
1593 }
1594
1595 while (1)
1596 {
1597 int c, c1, c2;
1598
1599 src_base = src;
1600 consumed_chars_base = consumed_chars;
1601
1602 if (charbuf >= charbuf_end)
1603 {
1604 if (byte_after_cr1 >= 0)
1605 src_base -= 2;
1606 break;
1607 }
1608
1609 if (byte_after_cr1 >= 0)
1610 c1 = byte_after_cr1, byte_after_cr1 = -1;
1611 else
1612 ONE_MORE_BYTE (c1);
1613 if (c1 < 0)
1614 {
1615 *charbuf++ = -c1;
1616 continue;
1617 }
1618 if (byte_after_cr2 >= 0)
1619 c2 = byte_after_cr2, byte_after_cr2 = -1;
1620 else
1621 ONE_MORE_BYTE (c2);
1622 if (c2 < 0)
1623 {
1624 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1625 *charbuf++ = -c2;
1626 continue;
1627 }
1628 c = (endian == utf_16_big_endian
1629 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1630
1631 if (surrogate)
1632 {
1633 if (! UTF_16_LOW_SURROGATE_P (c))
1634 {
1635 if (endian == utf_16_big_endian)
1636 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1637 else
1638 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1639 *charbuf++ = c1;
1640 *charbuf++ = c2;
1641 coding->errors++;
1642 if (UTF_16_HIGH_SURROGATE_P (c))
1643 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1644 else
1645 *charbuf++ = c;
1646 }
1647 else
1648 {
1649 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1650 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1651 *charbuf++ = 0x10000 + c;
1652 }
1653 }
1654 else
1655 {
1656 if (UTF_16_HIGH_SURROGATE_P (c))
1657 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1658 else
1659 {
1660 if (eol_dos && c == '\r')
1661 {
1662 ONE_MORE_BYTE (byte_after_cr1);
1663 ONE_MORE_BYTE (byte_after_cr2);
1664 }
1665 *charbuf++ = c;
1666 }
1667 }
1668 }
1669
1670 no_more_source:
1671 coding->consumed_char += consumed_chars_base;
1672 coding->consumed = src_base - coding->source;
1673 coding->charbuf_used = charbuf - coding->charbuf;
1674 }
1675
1676 static bool
1677 encode_coding_utf_16 (struct coding_system *coding)
1678 {
1679 bool multibytep = coding->dst_multibyte;
1680 int *charbuf = coding->charbuf;
1681 int *charbuf_end = charbuf + coding->charbuf_used;
1682 unsigned char *dst = coding->destination + coding->produced;
1683 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1684 int safe_room = 8;
1685 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1686 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1687 ptrdiff_t produced_chars = 0;
1688 int c;
1689
1690 if (bom != utf_without_bom)
1691 {
1692 ASSURE_DESTINATION (safe_room);
1693 if (big_endian)
1694 EMIT_TWO_BYTES (0xFE, 0xFF);
1695 else
1696 EMIT_TWO_BYTES (0xFF, 0xFE);
1697 CODING_UTF_16_BOM (coding) = utf_without_bom;
1698 }
1699
1700 while (charbuf < charbuf_end)
1701 {
1702 ASSURE_DESTINATION (safe_room);
1703 c = *charbuf++;
1704 if (c > MAX_UNICODE_CHAR)
1705 c = coding->default_char;
1706
1707 if (c < 0x10000)
1708 {
1709 if (big_endian)
1710 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1711 else
1712 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1713 }
1714 else
1715 {
1716 int c1, c2;
1717
1718 c -= 0x10000;
1719 c1 = (c >> 10) + 0xD800;
1720 c2 = (c & 0x3FF) + 0xDC00;
1721 if (big_endian)
1722 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1723 else
1724 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1725 }
1726 }
1727 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1728 coding->produced = dst - coding->destination;
1729 coding->produced_char += produced_chars;
1730 return 0;
1731 }
1732
1733 \f
1734 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1735
1736 /* Emacs' internal format for representation of multiple character
1737 sets is a kind of multi-byte encoding, i.e. characters are
1738 represented by variable-length sequences of one-byte codes.
1739
1740 ASCII characters and control characters (e.g. `tab', `newline') are
1741 represented by one-byte sequences which are their ASCII codes, in
1742 the range 0x00 through 0x7F.
1743
1744 8-bit characters of the range 0x80..0x9F are represented by
1745 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1746 code + 0x20).
1747
1748 8-bit characters of the range 0xA0..0xFF are represented by
1749 one-byte sequences which are their 8-bit code.
1750
1751 The other characters are represented by a sequence of `base
1752 leading-code', optional `extended leading-code', and one or two
1753 `position-code's. The length of the sequence is determined by the
1754 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1755 whereas extended leading-code and position-code take the range 0xA0
1756 through 0xFF. See `charset.h' for more details about leading-code
1757 and position-code.
1758
1759 --- CODE RANGE of Emacs' internal format ---
1760 character set range
1761 ------------- -----
1762 ascii 0x00..0x7F
1763 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1764 eight-bit-graphic 0xA0..0xBF
1765 ELSE 0x81..0x9D + [0xA0..0xFF]+
1766 ---------------------------------------------
1767
1768 As this is the internal character representation, the format is
1769 usually not used externally (i.e. in a file or in a data sent to a
1770 process). But, it is possible to have a text externally in this
1771 format (i.e. by encoding by the coding system `emacs-mule').
1772
1773 In that case, a sequence of one-byte codes has a slightly different
1774 form.
1775
1776 At first, all characters in eight-bit-control are represented by
1777 one-byte sequences which are their 8-bit code.
1778
1779 Next, character composition data are represented by the byte
1780 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1781 where,
1782 METHOD is 0xF2 plus one of composition method (enum
1783 composition_method),
1784
1785 BYTES is 0xA0 plus a byte length of this composition data,
1786
1787 CHARS is 0xA0 plus a number of characters composed by this
1788 data,
1789
1790 COMPONENTs are characters of multibyte form or composition
1791 rules encoded by two-byte of ASCII codes.
1792
1793 In addition, for backward compatibility, the following formats are
1794 also recognized as composition data on decoding.
1795
1796 0x80 MSEQ ...
1797 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1798
1799 Here,
1800 MSEQ is a multibyte form but in these special format:
1801 ASCII: 0xA0 ASCII_CODE+0x80,
1802 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1803 RULE is a one byte code of the range 0xA0..0xF0 that
1804 represents a composition rule.
1805 */
1806
1807 char emacs_mule_bytes[256];
1808
1809
1810 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1811 Return true if a text is encoded in 'emacs-mule'. */
1812
1813 static bool
1814 detect_coding_emacs_mule (struct coding_system *coding,
1815 struct coding_detection_info *detect_info)
1816 {
1817 const unsigned char *src = coding->source, *src_base;
1818 const unsigned char *src_end = coding->source + coding->src_bytes;
1819 bool multibytep = coding->src_multibyte;
1820 ptrdiff_t consumed_chars = 0;
1821 int c;
1822 int found = 0;
1823
1824 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1825 /* A coding system of this category is always ASCII compatible. */
1826 src += coding->head_ascii;
1827
1828 while (1)
1829 {
1830 src_base = src;
1831 ONE_MORE_BYTE (c);
1832 if (c < 0)
1833 continue;
1834 if (c == 0x80)
1835 {
1836 /* Perhaps the start of composite character. We simply skip
1837 it because analyzing it is too heavy for detecting. But,
1838 at least, we check that the composite character
1839 constitutes of more than 4 bytes. */
1840 const unsigned char *src_start;
1841
1842 repeat:
1843 src_start = src;
1844 do
1845 {
1846 ONE_MORE_BYTE (c);
1847 }
1848 while (c >= 0xA0);
1849
1850 if (src - src_start <= 4)
1851 break;
1852 found = CATEGORY_MASK_EMACS_MULE;
1853 if (c == 0x80)
1854 goto repeat;
1855 }
1856
1857 if (c < 0x80)
1858 {
1859 if (c < 0x20
1860 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1861 break;
1862 }
1863 else
1864 {
1865 int more_bytes = emacs_mule_bytes[c] - 1;
1866
1867 while (more_bytes > 0)
1868 {
1869 ONE_MORE_BYTE (c);
1870 if (c < 0xA0)
1871 {
1872 src--; /* Unread the last byte. */
1873 break;
1874 }
1875 more_bytes--;
1876 }
1877 if (more_bytes != 0)
1878 break;
1879 found = CATEGORY_MASK_EMACS_MULE;
1880 }
1881 }
1882 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1883 return 0;
1884
1885 no_more_source:
1886 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1887 {
1888 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1889 return 0;
1890 }
1891 detect_info->found |= found;
1892 return 1;
1893 }
1894
1895
1896 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1897 character. If CMP_STATUS indicates that we must expect MSEQ or
1898 RULE described above, decode it and return the negative value of
1899 the decoded character or rule. If an invalid byte is found, return
1900 -1. If SRC is too short, return -2. */
1901
1902 static int
1903 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1904 int *nbytes, int *nchars, int *id,
1905 struct composition_status *cmp_status)
1906 {
1907 const unsigned char *src_end = coding->source + coding->src_bytes;
1908 const unsigned char *src_base = src;
1909 bool multibytep = coding->src_multibyte;
1910 int charset_ID;
1911 unsigned code;
1912 int c;
1913 int consumed_chars = 0;
1914 bool mseq_found = 0;
1915
1916 ONE_MORE_BYTE (c);
1917 if (c < 0)
1918 {
1919 c = -c;
1920 charset_ID = emacs_mule_charset[0];
1921 }
1922 else
1923 {
1924 if (c >= 0xA0)
1925 {
1926 if (cmp_status->state != COMPOSING_NO
1927 && cmp_status->old_form)
1928 {
1929 if (cmp_status->state == COMPOSING_CHAR)
1930 {
1931 if (c == 0xA0)
1932 {
1933 ONE_MORE_BYTE (c);
1934 c -= 0x80;
1935 if (c < 0)
1936 goto invalid_code;
1937 }
1938 else
1939 c -= 0x20;
1940 mseq_found = 1;
1941 }
1942 else
1943 {
1944 *nbytes = src - src_base;
1945 *nchars = consumed_chars;
1946 return -c;
1947 }
1948 }
1949 else
1950 goto invalid_code;
1951 }
1952
1953 switch (emacs_mule_bytes[c])
1954 {
1955 case 2:
1956 if ((charset_ID = emacs_mule_charset[c]) < 0)
1957 goto invalid_code;
1958 ONE_MORE_BYTE (c);
1959 if (c < 0xA0)
1960 goto invalid_code;
1961 code = c & 0x7F;
1962 break;
1963
1964 case 3:
1965 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1966 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1967 {
1968 ONE_MORE_BYTE (c);
1969 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1970 goto invalid_code;
1971 ONE_MORE_BYTE (c);
1972 if (c < 0xA0)
1973 goto invalid_code;
1974 code = c & 0x7F;
1975 }
1976 else
1977 {
1978 if ((charset_ID = emacs_mule_charset[c]) < 0)
1979 goto invalid_code;
1980 ONE_MORE_BYTE (c);
1981 if (c < 0xA0)
1982 goto invalid_code;
1983 code = (c & 0x7F) << 8;
1984 ONE_MORE_BYTE (c);
1985 if (c < 0xA0)
1986 goto invalid_code;
1987 code |= c & 0x7F;
1988 }
1989 break;
1990
1991 case 4:
1992 ONE_MORE_BYTE (c);
1993 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1994 goto invalid_code;
1995 ONE_MORE_BYTE (c);
1996 if (c < 0xA0)
1997 goto invalid_code;
1998 code = (c & 0x7F) << 8;
1999 ONE_MORE_BYTE (c);
2000 if (c < 0xA0)
2001 goto invalid_code;
2002 code |= c & 0x7F;
2003 break;
2004
2005 case 1:
2006 code = c;
2007 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2008 break;
2009
2010 default:
2011 emacs_abort ();
2012 }
2013 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2014 CHARSET_FROM_ID (charset_ID), code, c);
2015 if (c < 0)
2016 goto invalid_code;
2017 }
2018 *nbytes = src - src_base;
2019 *nchars = consumed_chars;
2020 if (id)
2021 *id = charset_ID;
2022 return (mseq_found ? -c : c);
2023
2024 no_more_source:
2025 return -2;
2026
2027 invalid_code:
2028 return -1;
2029 }
2030
2031
2032 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2033
2034 /* Handle these composition sequence ('|': the end of header elements,
2035 BYTES and CHARS >= 0xA0):
2036
2037 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2038 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2039 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2040
2041 and these old form:
2042
2043 (4) relative composition: 0x80 | MSEQ ... MSEQ
2044 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2045
2046 When the starter 0x80 and the following header elements are found,
2047 this annotation header is produced.
2048
2049 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2050
2051 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2052 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2053
2054 Then, upon reading the following elements, these codes are produced
2055 until the composition end is found:
2056
2057 (1) CHAR ... CHAR
2058 (2) ALT ... ALT CHAR ... CHAR
2059 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2060 (4) CHAR ... CHAR
2061 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2062
2063 When the composition end is found, LENGTH and NCHARS in the
2064 annotation header is updated as below:
2065
2066 (1) LENGTH: unchanged, NCHARS: unchanged
2067 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2069 (4) LENGTH: unchanged, NCHARS: number of CHARs
2070 (5) LENGTH: unchanged, NCHARS: number of CHARs
2071
2072 If an error is found while composing, the annotation header is
2073 changed to the original composition header (plus filler -1s) as
2074 below:
2075
2076 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2077 (5) [ 0x80 0xFF -1 -1- -1 ]
2078
2079 and the sequence [ -2 DECODED-RULE ] is changed to the original
2080 byte sequence as below:
2081 o the original byte sequence is B: [ B -1 ]
2082 o the original byte sequence is B1 B2: [ B1 B2 ]
2083
2084 Most of the routines are implemented by macros because many
2085 variables and labels in the caller decode_coding_emacs_mule must be
2086 accessible, and they are usually called just once (thus doesn't
2087 increase the size of compiled object). */
2088
2089 /* Decode a composition rule represented by C as a component of
2090 composition sequence of Emacs 20 style. Set RULE to the decoded
2091 rule. */
2092
2093 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2094 do { \
2095 int gref, nref; \
2096 \
2097 c -= 0xA0; \
2098 if (c < 0 || c >= 81) \
2099 goto invalid_code; \
2100 gref = c / 9, nref = c % 9; \
2101 if (gref == 4) gref = 10; \
2102 if (nref == 4) nref = 10; \
2103 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2104 } while (0)
2105
2106
2107 /* Decode a composition rule represented by C and the following byte
2108 at SRC as a component of composition sequence of Emacs 21 style.
2109 Set RULE to the decoded rule. */
2110
2111 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2112 do { \
2113 int gref, nref; \
2114 \
2115 gref = c - 0x20; \
2116 if (gref < 0 || gref >= 81) \
2117 goto invalid_code; \
2118 ONE_MORE_BYTE (c); \
2119 nref = c - 0x20; \
2120 if (nref < 0 || nref >= 81) \
2121 goto invalid_code; \
2122 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2123 } while (0)
2124
2125
2126 /* Start of Emacs 21 style format. The first three bytes at SRC are
2127 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2128 byte length of this composition information, CHARS is the number of
2129 characters composed by this composition. */
2130
2131 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2132 do { \
2133 enum composition_method method = c - 0xF2; \
2134 int nbytes, nchars; \
2135 \
2136 ONE_MORE_BYTE (c); \
2137 if (c < 0) \
2138 goto invalid_code; \
2139 nbytes = c - 0xA0; \
2140 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2141 goto invalid_code; \
2142 ONE_MORE_BYTE (c); \
2143 nchars = c - 0xA0; \
2144 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2145 goto invalid_code; \
2146 cmp_status->old_form = 0; \
2147 cmp_status->method = method; \
2148 if (method == COMPOSITION_RELATIVE) \
2149 cmp_status->state = COMPOSING_CHAR; \
2150 else \
2151 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2152 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2153 cmp_status->nchars = nchars; \
2154 cmp_status->ncomps = nbytes - 4; \
2155 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2156 } while (0)
2157
2158
2159 /* Start of Emacs 20 style format for relative composition. */
2160
2161 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2162 do { \
2163 cmp_status->old_form = 1; \
2164 cmp_status->method = COMPOSITION_RELATIVE; \
2165 cmp_status->state = COMPOSING_CHAR; \
2166 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2167 cmp_status->nchars = cmp_status->ncomps = 0; \
2168 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2169 } while (0)
2170
2171
2172 /* Start of Emacs 20 style format for rule-base composition. */
2173
2174 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2175 do { \
2176 cmp_status->old_form = 1; \
2177 cmp_status->method = COMPOSITION_WITH_RULE; \
2178 cmp_status->state = COMPOSING_CHAR; \
2179 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2180 cmp_status->nchars = cmp_status->ncomps = 0; \
2181 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2182 } while (0)
2183
2184
2185 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2186 do { \
2187 const unsigned char *current_src = src; \
2188 \
2189 ONE_MORE_BYTE (c); \
2190 if (c < 0) \
2191 goto invalid_code; \
2192 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2193 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2194 DECODE_EMACS_MULE_21_COMPOSITION (); \
2195 else if (c < 0xA0) \
2196 goto invalid_code; \
2197 else if (c < 0xC0) \
2198 { \
2199 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2200 /* Re-read C as a composition component. */ \
2201 src = current_src; \
2202 } \
2203 else if (c == 0xFF) \
2204 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2205 else \
2206 goto invalid_code; \
2207 } while (0)
2208
2209 #define EMACS_MULE_COMPOSITION_END() \
2210 do { \
2211 int idx = - cmp_status->length; \
2212 \
2213 if (cmp_status->old_form) \
2214 charbuf[idx + 2] = cmp_status->nchars; \
2215 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2216 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2217 cmp_status->state = COMPOSING_NO; \
2218 } while (0)
2219
2220
2221 static int
2222 emacs_mule_finish_composition (int *charbuf,
2223 struct composition_status *cmp_status)
2224 {
2225 int idx = - cmp_status->length;
2226 int new_chars;
2227
2228 if (cmp_status->old_form && cmp_status->nchars > 0)
2229 {
2230 charbuf[idx + 2] = cmp_status->nchars;
2231 new_chars = 0;
2232 if (cmp_status->method == COMPOSITION_WITH_RULE
2233 && cmp_status->state == COMPOSING_CHAR)
2234 {
2235 /* The last rule was invalid. */
2236 int rule = charbuf[-1] + 0xA0;
2237
2238 charbuf[-2] = BYTE8_TO_CHAR (rule);
2239 charbuf[-1] = -1;
2240 new_chars = 1;
2241 }
2242 }
2243 else
2244 {
2245 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2246
2247 if (cmp_status->method == COMPOSITION_WITH_RULE)
2248 {
2249 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2250 charbuf[idx++] = -3;
2251 charbuf[idx++] = 0;
2252 new_chars = 1;
2253 }
2254 else
2255 {
2256 int nchars = charbuf[idx + 1] + 0xA0;
2257 int nbytes = charbuf[idx + 2] + 0xA0;
2258
2259 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2260 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2261 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2262 charbuf[idx++] = -1;
2263 new_chars = 4;
2264 }
2265 }
2266 cmp_status->state = COMPOSING_NO;
2267 return new_chars;
2268 }
2269
2270 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2271 do { \
2272 if (cmp_status->state != COMPOSING_NO) \
2273 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2274 } while (0)
2275
2276
2277 static void
2278 decode_coding_emacs_mule (struct coding_system *coding)
2279 {
2280 const unsigned char *src = coding->source + coding->consumed;
2281 const unsigned char *src_end = coding->source + coding->src_bytes;
2282 const unsigned char *src_base;
2283 int *charbuf = coding->charbuf + coding->charbuf_used;
2284 /* We may produce two annotations (charset and composition) in one
2285 loop and one more charset annotation at the end. */
2286 int *charbuf_end
2287 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2288 /* We can produce up to 2 characters in a loop. */
2289 - 1;
2290 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2291 bool multibytep = coding->src_multibyte;
2292 ptrdiff_t char_offset = coding->produced_char;
2293 ptrdiff_t last_offset = char_offset;
2294 int last_id = charset_ascii;
2295 bool eol_dos
2296 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2297 int byte_after_cr = -1;
2298 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2299
2300 if (cmp_status->state != COMPOSING_NO)
2301 {
2302 int i;
2303
2304 if (charbuf_end - charbuf < cmp_status->length)
2305 emacs_abort ();
2306 for (i = 0; i < cmp_status->length; i++)
2307 *charbuf++ = cmp_status->carryover[i];
2308 coding->annotated = 1;
2309 }
2310
2311 while (1)
2312 {
2313 int c, id IF_LINT (= 0);
2314
2315 src_base = src;
2316 consumed_chars_base = consumed_chars;
2317
2318 if (charbuf >= charbuf_end)
2319 {
2320 if (byte_after_cr >= 0)
2321 src_base--;
2322 break;
2323 }
2324
2325 if (byte_after_cr >= 0)
2326 c = byte_after_cr, byte_after_cr = -1;
2327 else
2328 ONE_MORE_BYTE (c);
2329
2330 if (c < 0 || c == 0x80)
2331 {
2332 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2333 if (c < 0)
2334 {
2335 *charbuf++ = -c;
2336 char_offset++;
2337 }
2338 else
2339 DECODE_EMACS_MULE_COMPOSITION_START ();
2340 continue;
2341 }
2342
2343 if (c < 0x80)
2344 {
2345 if (eol_dos && c == '\r')
2346 ONE_MORE_BYTE (byte_after_cr);
2347 id = charset_ascii;
2348 if (cmp_status->state != COMPOSING_NO)
2349 {
2350 if (cmp_status->old_form)
2351 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2352 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2353 cmp_status->ncomps--;
2354 }
2355 }
2356 else
2357 {
2358 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2359 /* emacs_mule_char can load a charset map from a file, which
2360 allocates a large structure and might cause buffer text
2361 to be relocated as result. Thus, we need to remember the
2362 original pointer to buffer text, and fix up all related
2363 pointers after the call. */
2364 const unsigned char *orig = coding->source;
2365 ptrdiff_t offset;
2366
2367 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2368 cmp_status);
2369 offset = coding->source - orig;
2370 if (offset)
2371 {
2372 src += offset;
2373 src_base += offset;
2374 src_end += offset;
2375 }
2376 if (c < 0)
2377 {
2378 if (c == -1)
2379 goto invalid_code;
2380 if (c == -2)
2381 break;
2382 }
2383 src = src_base + nbytes;
2384 consumed_chars = consumed_chars_base + nchars;
2385 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2386 cmp_status->ncomps -= nchars;
2387 }
2388
2389 /* Now if C >= 0, we found a normally encoded character, if C <
2390 0, we found an old-style composition component character or
2391 rule. */
2392
2393 if (cmp_status->state == COMPOSING_NO)
2394 {
2395 if (last_id != id)
2396 {
2397 if (last_id != charset_ascii)
2398 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2399 last_id);
2400 last_id = id;
2401 last_offset = char_offset;
2402 }
2403 *charbuf++ = c;
2404 char_offset++;
2405 }
2406 else if (cmp_status->state == COMPOSING_CHAR)
2407 {
2408 if (cmp_status->old_form)
2409 {
2410 if (c >= 0)
2411 {
2412 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2413 *charbuf++ = c;
2414 char_offset++;
2415 }
2416 else
2417 {
2418 *charbuf++ = -c;
2419 cmp_status->nchars++;
2420 cmp_status->length++;
2421 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2422 EMACS_MULE_COMPOSITION_END ();
2423 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2424 cmp_status->state = COMPOSING_RULE;
2425 }
2426 }
2427 else
2428 {
2429 *charbuf++ = c;
2430 cmp_status->length++;
2431 cmp_status->nchars--;
2432 if (cmp_status->nchars == 0)
2433 EMACS_MULE_COMPOSITION_END ();
2434 }
2435 }
2436 else if (cmp_status->state == COMPOSING_RULE)
2437 {
2438 int rule;
2439
2440 if (c >= 0)
2441 {
2442 EMACS_MULE_COMPOSITION_END ();
2443 *charbuf++ = c;
2444 char_offset++;
2445 }
2446 else
2447 {
2448 c = -c;
2449 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2450 if (rule < 0)
2451 goto invalid_code;
2452 *charbuf++ = -2;
2453 *charbuf++ = rule;
2454 cmp_status->length += 2;
2455 cmp_status->state = COMPOSING_CHAR;
2456 }
2457 }
2458 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2459 {
2460 *charbuf++ = c;
2461 cmp_status->length++;
2462 if (cmp_status->ncomps == 0)
2463 cmp_status->state = COMPOSING_CHAR;
2464 else if (cmp_status->ncomps > 0)
2465 {
2466 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2467 cmp_status->state = COMPOSING_COMPONENT_RULE;
2468 }
2469 else
2470 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2471 }
2472 else /* COMPOSING_COMPONENT_RULE */
2473 {
2474 int rule;
2475
2476 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2477 if (rule < 0)
2478 goto invalid_code;
2479 *charbuf++ = -2;
2480 *charbuf++ = rule;
2481 cmp_status->length += 2;
2482 cmp_status->ncomps--;
2483 if (cmp_status->ncomps > 0)
2484 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2485 else
2486 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2487 }
2488 continue;
2489
2490 invalid_code:
2491 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2492 src = src_base;
2493 consumed_chars = consumed_chars_base;
2494 ONE_MORE_BYTE (c);
2495 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2496 char_offset++;
2497 coding->errors++;
2498 }
2499
2500 no_more_source:
2501 if (cmp_status->state != COMPOSING_NO)
2502 {
2503 if (coding->mode & CODING_MODE_LAST_BLOCK)
2504 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2505 else
2506 {
2507 int i;
2508
2509 charbuf -= cmp_status->length;
2510 for (i = 0; i < cmp_status->length; i++)
2511 cmp_status->carryover[i] = charbuf[i];
2512 }
2513 }
2514 if (last_id != charset_ascii)
2515 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2516 coding->consumed_char += consumed_chars_base;
2517 coding->consumed = src_base - coding->source;
2518 coding->charbuf_used = charbuf - coding->charbuf;
2519 }
2520
2521
2522 #define EMACS_MULE_LEADING_CODES(id, codes) \
2523 do { \
2524 if (id < 0xA0) \
2525 codes[0] = id, codes[1] = 0; \
2526 else if (id < 0xE0) \
2527 codes[0] = 0x9A, codes[1] = id; \
2528 else if (id < 0xF0) \
2529 codes[0] = 0x9B, codes[1] = id; \
2530 else if (id < 0xF5) \
2531 codes[0] = 0x9C, codes[1] = id; \
2532 else \
2533 codes[0] = 0x9D, codes[1] = id; \
2534 } while (0);
2535
2536
2537 static bool
2538 encode_coding_emacs_mule (struct coding_system *coding)
2539 {
2540 bool multibytep = coding->dst_multibyte;
2541 int *charbuf = coding->charbuf;
2542 int *charbuf_end = charbuf + coding->charbuf_used;
2543 unsigned char *dst = coding->destination + coding->produced;
2544 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2545 int safe_room = 8;
2546 ptrdiff_t produced_chars = 0;
2547 Lisp_Object attrs, charset_list;
2548 int c;
2549 int preferred_charset_id = -1;
2550
2551 CODING_GET_INFO (coding, attrs, charset_list);
2552 if (! EQ (charset_list, Vemacs_mule_charset_list))
2553 {
2554 charset_list = Vemacs_mule_charset_list;
2555 ASET (attrs, coding_attr_charset_list, charset_list);
2556 }
2557
2558 while (charbuf < charbuf_end)
2559 {
2560 ASSURE_DESTINATION (safe_room);
2561 c = *charbuf++;
2562
2563 if (c < 0)
2564 {
2565 /* Handle an annotation. */
2566 switch (*charbuf)
2567 {
2568 case CODING_ANNOTATE_COMPOSITION_MASK:
2569 /* Not yet implemented. */
2570 break;
2571 case CODING_ANNOTATE_CHARSET_MASK:
2572 preferred_charset_id = charbuf[3];
2573 if (preferred_charset_id >= 0
2574 && NILP (Fmemq (make_number (preferred_charset_id),
2575 charset_list)))
2576 preferred_charset_id = -1;
2577 break;
2578 default:
2579 emacs_abort ();
2580 }
2581 charbuf += -c - 1;
2582 continue;
2583 }
2584
2585 if (ASCII_CHAR_P (c))
2586 EMIT_ONE_ASCII_BYTE (c);
2587 else if (CHAR_BYTE8_P (c))
2588 {
2589 c = CHAR_TO_BYTE8 (c);
2590 EMIT_ONE_BYTE (c);
2591 }
2592 else
2593 {
2594 struct charset *charset;
2595 unsigned code;
2596 int dimension;
2597 int emacs_mule_id;
2598 unsigned char leading_codes[2];
2599
2600 if (preferred_charset_id >= 0)
2601 {
2602 bool result;
2603
2604 charset = CHARSET_FROM_ID (preferred_charset_id);
2605 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2606 if (result)
2607 code = ENCODE_CHAR (charset, c);
2608 else
2609 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2610 &code, charset);
2611 }
2612 else
2613 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2614 &code, charset);
2615 if (! charset)
2616 {
2617 c = coding->default_char;
2618 if (ASCII_CHAR_P (c))
2619 {
2620 EMIT_ONE_ASCII_BYTE (c);
2621 continue;
2622 }
2623 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2624 &code, charset);
2625 }
2626 dimension = CHARSET_DIMENSION (charset);
2627 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2628 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2629 EMIT_ONE_BYTE (leading_codes[0]);
2630 if (leading_codes[1])
2631 EMIT_ONE_BYTE (leading_codes[1]);
2632 if (dimension == 1)
2633 EMIT_ONE_BYTE (code | 0x80);
2634 else
2635 {
2636 code |= 0x8080;
2637 EMIT_ONE_BYTE (code >> 8);
2638 EMIT_ONE_BYTE (code & 0xFF);
2639 }
2640 }
2641 }
2642 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2643 coding->produced_char += produced_chars;
2644 coding->produced = dst - coding->destination;
2645 return 0;
2646 }
2647
2648 \f
2649 /*** 7. ISO2022 handlers ***/
2650
2651 /* The following note describes the coding system ISO2022 briefly.
2652 Since the intention of this note is to help understand the
2653 functions in this file, some parts are NOT ACCURATE or are OVERLY
2654 SIMPLIFIED. For thorough understanding, please refer to the
2655 original document of ISO2022. This is equivalent to the standard
2656 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2657
2658 ISO2022 provides many mechanisms to encode several character sets
2659 in 7-bit and 8-bit environments. For 7-bit environments, all text
2660 is encoded using bytes less than 128. This may make the encoded
2661 text a little bit longer, but the text passes more easily through
2662 several types of gateway, some of which strip off the MSB (Most
2663 Significant Bit).
2664
2665 There are two kinds of character sets: control character sets and
2666 graphic character sets. The former contain control characters such
2667 as `newline' and `escape' to provide control functions (control
2668 functions are also provided by escape sequences). The latter
2669 contain graphic characters such as 'A' and '-'. Emacs recognizes
2670 two control character sets and many graphic character sets.
2671
2672 Graphic character sets are classified into one of the following
2673 four classes, according to the number of bytes (DIMENSION) and
2674 number of characters in one dimension (CHARS) of the set:
2675 - DIMENSION1_CHARS94
2676 - DIMENSION1_CHARS96
2677 - DIMENSION2_CHARS94
2678 - DIMENSION2_CHARS96
2679
2680 In addition, each character set is assigned an identification tag,
2681 unique for each set, called the "final character" (denoted as <F>
2682 hereafter). The <F> of each character set is decided by ECMA(*)
2683 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2684 (0x30..0x3F are for private use only).
2685
2686 Note (*): ECMA = European Computer Manufacturers Association
2687
2688 Here are examples of graphic character sets [NAME(<F>)]:
2689 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2690 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2691 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2692 o DIMENSION2_CHARS96 -- none for the moment
2693
2694 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2695 C0 [0x00..0x1F] -- control character plane 0
2696 GL [0x20..0x7F] -- graphic character plane 0
2697 C1 [0x80..0x9F] -- control character plane 1
2698 GR [0xA0..0xFF] -- graphic character plane 1
2699
2700 A control character set is directly designated and invoked to C0 or
2701 C1 by an escape sequence. The most common case is that:
2702 - ISO646's control character set is designated/invoked to C0, and
2703 - ISO6429's control character set is designated/invoked to C1,
2704 and usually these designations/invocations are omitted in encoded
2705 text. In a 7-bit environment, only C0 can be used, and a control
2706 character for C1 is encoded by an appropriate escape sequence to
2707 fit into the environment. All control characters for C1 are
2708 defined to have corresponding escape sequences.
2709
2710 A graphic character set is at first designated to one of four
2711 graphic registers (G0 through G3), then these graphic registers are
2712 invoked to GL or GR. These designations and invocations can be
2713 done independently. The most common case is that G0 is invoked to
2714 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2715 these invocations and designations are omitted in encoded text.
2716 In a 7-bit environment, only GL can be used.
2717
2718 When a graphic character set of CHARS94 is invoked to GL, codes
2719 0x20 and 0x7F of the GL area work as control characters SPACE and
2720 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2721 be used.
2722
2723 There are two ways of invocation: locking-shift and single-shift.
2724 With locking-shift, the invocation lasts until the next different
2725 invocation, whereas with single-shift, the invocation affects the
2726 following character only and doesn't affect the locking-shift
2727 state. Invocations are done by the following control characters or
2728 escape sequences:
2729
2730 ----------------------------------------------------------------------
2731 abbrev function cntrl escape seq description
2732 ----------------------------------------------------------------------
2733 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2734 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2735 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2736 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2737 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2738 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2739 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2740 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2741 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2742 ----------------------------------------------------------------------
2743 (*) These are not used by any known coding system.
2744
2745 Control characters for these functions are defined by macros
2746 ISO_CODE_XXX in `coding.h'.
2747
2748 Designations are done by the following escape sequences:
2749 ----------------------------------------------------------------------
2750 escape sequence description
2751 ----------------------------------------------------------------------
2752 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2753 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2754 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2755 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2756 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2757 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2758 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2759 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2760 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2761 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2762 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2763 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2764 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2765 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2766 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2767 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2768 ----------------------------------------------------------------------
2769
2770 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2771 of dimension 1, chars 94, and final character <F>, etc...
2772
2773 Note (*): Although these designations are not allowed in ISO2022,
2774 Emacs accepts them on decoding, and produces them on encoding
2775 CHARS96 character sets in a coding system which is characterized as
2776 7-bit environment, non-locking-shift, and non-single-shift.
2777
2778 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2779 '(' must be omitted. We refer to this as "short-form" hereafter.
2780
2781 Now you may notice that there are a lot of ways of encoding the
2782 same multilingual text in ISO2022. Actually, there exist many
2783 coding systems such as Compound Text (used in X11's inter client
2784 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2785 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2786 localized platforms), and all of these are variants of ISO2022.
2787
2788 In addition to the above, Emacs handles two more kinds of escape
2789 sequences: ISO6429's direction specification and Emacs' private
2790 sequence for specifying character composition.
2791
2792 ISO6429's direction specification takes the following form:
2793 o CSI ']' -- end of the current direction
2794 o CSI '0' ']' -- end of the current direction
2795 o CSI '1' ']' -- start of left-to-right text
2796 o CSI '2' ']' -- start of right-to-left text
2797 The control character CSI (0x9B: control sequence introducer) is
2798 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2799
2800 Character composition specification takes the following form:
2801 o ESC '0' -- start relative composition
2802 o ESC '1' -- end composition
2803 o ESC '2' -- start rule-base composition (*)
2804 o ESC '3' -- start relative composition with alternate chars (**)
2805 o ESC '4' -- start rule-base composition with alternate chars (**)
2806 Since these are not standard escape sequences of any ISO standard,
2807 the use of them with these meanings is restricted to Emacs only.
2808
2809 (*) This form is used only in Emacs 20.7 and older versions,
2810 but newer versions can safely decode it.
2811 (**) This form is used only in Emacs 21.1 and newer versions,
2812 and older versions can't decode it.
2813
2814 Here's a list of example usages of these composition escape
2815 sequences (categorized by `enum composition_method').
2816
2817 COMPOSITION_RELATIVE:
2818 ESC 0 CHAR [ CHAR ] ESC 1
2819 COMPOSITION_WITH_RULE:
2820 ESC 2 CHAR [ RULE CHAR ] ESC 1
2821 COMPOSITION_WITH_ALTCHARS:
2822 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2823 COMPOSITION_WITH_RULE_ALTCHARS:
2824 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2825
2826 static enum iso_code_class_type iso_code_class[256];
2827
2828 #define SAFE_CHARSET_P(coding, id) \
2829 ((id) <= (coding)->max_charset_id \
2830 && (coding)->safe_charsets[id] != 255)
2831
2832 static void
2833 setup_iso_safe_charsets (Lisp_Object attrs)
2834 {
2835 Lisp_Object charset_list, safe_charsets;
2836 Lisp_Object request;
2837 Lisp_Object reg_usage;
2838 Lisp_Object tail;
2839 EMACS_INT reg94, reg96;
2840 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2841 int max_charset_id;
2842
2843 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2844 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2845 && ! EQ (charset_list, Viso_2022_charset_list))
2846 {
2847 charset_list = Viso_2022_charset_list;
2848 ASET (attrs, coding_attr_charset_list, charset_list);
2849 ASET (attrs, coding_attr_safe_charsets, Qnil);
2850 }
2851
2852 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2853 return;
2854
2855 max_charset_id = 0;
2856 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2857 {
2858 int id = XINT (XCAR (tail));
2859 if (max_charset_id < id)
2860 max_charset_id = id;
2861 }
2862
2863 safe_charsets = make_uninit_string (max_charset_id + 1);
2864 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2865 request = AREF (attrs, coding_attr_iso_request);
2866 reg_usage = AREF (attrs, coding_attr_iso_usage);
2867 reg94 = XINT (XCAR (reg_usage));
2868 reg96 = XINT (XCDR (reg_usage));
2869
2870 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2871 {
2872 Lisp_Object id;
2873 Lisp_Object reg;
2874 struct charset *charset;
2875
2876 id = XCAR (tail);
2877 charset = CHARSET_FROM_ID (XINT (id));
2878 reg = Fcdr (Fassq (id, request));
2879 if (! NILP (reg))
2880 SSET (safe_charsets, XINT (id), XINT (reg));
2881 else if (charset->iso_chars_96)
2882 {
2883 if (reg96 < 4)
2884 SSET (safe_charsets, XINT (id), reg96);
2885 }
2886 else
2887 {
2888 if (reg94 < 4)
2889 SSET (safe_charsets, XINT (id), reg94);
2890 }
2891 }
2892 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2893 }
2894
2895
2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2897 Return true if a text is encoded in one of ISO-2022 based coding
2898 systems. */
2899
2900 static bool
2901 detect_coding_iso_2022 (struct coding_system *coding,
2902 struct coding_detection_info *detect_info)
2903 {
2904 const unsigned char *src = coding->source, *src_base = src;
2905 const unsigned char *src_end = coding->source + coding->src_bytes;
2906 bool multibytep = coding->src_multibyte;
2907 bool single_shifting = 0;
2908 int id;
2909 int c, c1;
2910 ptrdiff_t consumed_chars = 0;
2911 int i;
2912 int rejected = 0;
2913 int found = 0;
2914 int composition_count = -1;
2915
2916 detect_info->checked |= CATEGORY_MASK_ISO;
2917
2918 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2919 {
2920 struct coding_system *this = &(coding_categories[i]);
2921 Lisp_Object attrs, val;
2922
2923 if (this->id < 0)
2924 continue;
2925 attrs = CODING_ID_ATTRS (this->id);
2926 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2927 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2928 setup_iso_safe_charsets (attrs);
2929 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2930 this->max_charset_id = SCHARS (val) - 1;
2931 this->safe_charsets = SDATA (val);
2932 }
2933
2934 /* A coding system of this category is always ASCII compatible. */
2935 src += coding->head_ascii;
2936
2937 while (rejected != CATEGORY_MASK_ISO)
2938 {
2939 src_base = src;
2940 ONE_MORE_BYTE (c);
2941 switch (c)
2942 {
2943 case ISO_CODE_ESC:
2944 if (inhibit_iso_escape_detection)
2945 break;
2946 single_shifting = 0;
2947 ONE_MORE_BYTE (c);
2948 if (c == 'N' || c == 'O')
2949 {
2950 /* ESC <Fe> for SS2 or SS3. */
2951 single_shifting = 1;
2952 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2953 }
2954 else if (c == '1')
2955 {
2956 /* End of composition. */
2957 if (composition_count < 0
2958 || composition_count > MAX_COMPOSITION_COMPONENTS)
2959 /* Invalid */
2960 break;
2961 composition_count = -1;
2962 found |= CATEGORY_MASK_ISO;
2963 }
2964 else if (c >= '0' && c <= '4')
2965 {
2966 /* ESC <Fp> for start/end composition. */
2967 composition_count = 0;
2968 }
2969 else
2970 {
2971 if (c >= '(' && c <= '/')
2972 {
2973 /* Designation sequence for a charset of dimension 1. */
2974 ONE_MORE_BYTE (c1);
2975 if (c1 < ' ' || c1 >= 0x80
2976 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2977 /* Invalid designation sequence. Just ignore. */
2978 break;
2979 }
2980 else if (c == '$')
2981 {
2982 /* Designation sequence for a charset of dimension 2. */
2983 ONE_MORE_BYTE (c);
2984 if (c >= '@' && c <= 'B')
2985 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2986 id = iso_charset_table[1][0][c];
2987 else if (c >= '(' && c <= '/')
2988 {
2989 ONE_MORE_BYTE (c1);
2990 if (c1 < ' ' || c1 >= 0x80
2991 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2992 /* Invalid designation sequence. Just ignore. */
2993 break;
2994 }
2995 else
2996 /* Invalid designation sequence. Just ignore it. */
2997 break;
2998 }
2999 else
3000 {
3001 /* Invalid escape sequence. Just ignore it. */
3002 break;
3003 }
3004
3005 /* We found a valid designation sequence for CHARSET. */
3006 rejected |= CATEGORY_MASK_ISO_8BIT;
3007 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3008 id))
3009 found |= CATEGORY_MASK_ISO_7;
3010 else
3011 rejected |= CATEGORY_MASK_ISO_7;
3012 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3013 id))
3014 found |= CATEGORY_MASK_ISO_7_TIGHT;
3015 else
3016 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3017 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3018 id))
3019 found |= CATEGORY_MASK_ISO_7_ELSE;
3020 else
3021 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3022 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3023 id))
3024 found |= CATEGORY_MASK_ISO_8_ELSE;
3025 else
3026 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3027 }
3028 break;
3029
3030 case ISO_CODE_SO:
3031 case ISO_CODE_SI:
3032 /* Locking shift out/in. */
3033 if (inhibit_iso_escape_detection)
3034 break;
3035 single_shifting = 0;
3036 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3037 break;
3038
3039 case ISO_CODE_CSI:
3040 /* Control sequence introducer. */
3041 single_shifting = 0;
3042 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3043 found |= CATEGORY_MASK_ISO_8_ELSE;
3044 goto check_extra_latin;
3045
3046 case ISO_CODE_SS2:
3047 case ISO_CODE_SS3:
3048 /* Single shift. */
3049 if (inhibit_iso_escape_detection)
3050 break;
3051 single_shifting = 0;
3052 rejected |= CATEGORY_MASK_ISO_7BIT;
3053 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3054 & CODING_ISO_FLAG_SINGLE_SHIFT)
3055 {
3056 found |= CATEGORY_MASK_ISO_8_1;
3057 single_shifting = 1;
3058 }
3059 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3060 & CODING_ISO_FLAG_SINGLE_SHIFT)
3061 {
3062 found |= CATEGORY_MASK_ISO_8_2;
3063 single_shifting = 1;
3064 }
3065 if (single_shifting)
3066 break;
3067 check_extra_latin:
3068 if (! VECTORP (Vlatin_extra_code_table)
3069 || NILP (AREF (Vlatin_extra_code_table, c)))
3070 {
3071 rejected = CATEGORY_MASK_ISO;
3072 break;
3073 }
3074 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3075 & CODING_ISO_FLAG_LATIN_EXTRA)
3076 found |= CATEGORY_MASK_ISO_8_1;
3077 else
3078 rejected |= CATEGORY_MASK_ISO_8_1;
3079 rejected |= CATEGORY_MASK_ISO_8_2;
3080 break;
3081
3082 default:
3083 if (c < 0)
3084 continue;
3085 if (c < 0x80)
3086 {
3087 if (composition_count >= 0)
3088 composition_count++;
3089 single_shifting = 0;
3090 break;
3091 }
3092 if (c >= 0xA0)
3093 {
3094 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3095 found |= CATEGORY_MASK_ISO_8_1;
3096 /* Check the length of succeeding codes of the range
3097 0xA0..0FF. If the byte length is even, we include
3098 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3099 only when we are not single shifting. */
3100 if (! single_shifting
3101 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3102 {
3103 int len = 1;
3104 while (src < src_end)
3105 {
3106 src_base = src;
3107 ONE_MORE_BYTE (c);
3108 if (c < 0xA0)
3109 {
3110 src = src_base;
3111 break;
3112 }
3113 len++;
3114 }
3115
3116 if (len & 1 && src < src_end)
3117 {
3118 rejected |= CATEGORY_MASK_ISO_8_2;
3119 if (composition_count >= 0)
3120 composition_count += len;
3121 }
3122 else
3123 {
3124 found |= CATEGORY_MASK_ISO_8_2;
3125 if (composition_count >= 0)
3126 composition_count += len / 2;
3127 }
3128 }
3129 break;
3130 }
3131 }
3132 }
3133 detect_info->rejected |= CATEGORY_MASK_ISO;
3134 return 0;
3135
3136 no_more_source:
3137 detect_info->rejected |= rejected;
3138 detect_info->found |= (found & ~rejected);
3139 return 1;
3140 }
3141
3142
3143 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3144 escape sequence should be kept. */
3145 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3146 do { \
3147 int id, prev; \
3148 \
3149 if (final < '0' || final >= 128 \
3150 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3151 || !SAFE_CHARSET_P (coding, id)) \
3152 { \
3153 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3154 chars_96 = -1; \
3155 break; \
3156 } \
3157 prev = CODING_ISO_DESIGNATION (coding, reg); \
3158 if (id == charset_jisx0201_roman) \
3159 { \
3160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3161 id = charset_ascii; \
3162 } \
3163 else if (id == charset_jisx0208_1978) \
3164 { \
3165 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3166 id = charset_jisx0208; \
3167 } \
3168 CODING_ISO_DESIGNATION (coding, reg) = id; \
3169 /* If there was an invalid designation to REG previously, and this \
3170 designation is ASCII to REG, we should keep this designation \
3171 sequence. */ \
3172 if (prev == -2 && id == charset_ascii) \
3173 chars_96 = -1; \
3174 } while (0)
3175
3176
3177 /* Handle these composition sequence (ALT: alternate char):
3178
3179 (1) relative composition: ESC 0 CHAR ... ESC 1
3180 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3181 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3182 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3183
3184 When the start sequence (ESC 0/2/3/4) is found, this annotation
3185 header is produced.
3186
3187 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3188
3189 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3190 produced until the end sequence (ESC 1) is found:
3191
3192 (1) CHAR ... CHAR
3193 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3194 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3195 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3196
3197 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3198 annotation header is updated as below:
3199
3200 (1) LENGTH: unchanged, NCHARS: number of CHARs
3201 (2) LENGTH: unchanged, NCHARS: number of CHARs
3202 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3203 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3204
3205 If an error is found while composing, the annotation header is
3206 changed to:
3207
3208 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3209
3210 and the sequence [ -2 DECODED-RULE ] is changed to the original
3211 byte sequence as below:
3212 o the original byte sequence is B: [ B -1 ]
3213 o the original byte sequence is B1 B2: [ B1 B2 ]
3214 and the sequence [ -1 -1 ] is changed to the original byte
3215 sequence:
3216 [ ESC '0' ]
3217 */
3218
3219 /* Decode a composition rule C1 and maybe one more byte from the
3220 source, and set RULE to the encoded composition rule. If the rule
3221 is invalid, goto invalid_code. */
3222
3223 #define DECODE_COMPOSITION_RULE(rule) \
3224 do { \
3225 rule = c1 - 32; \
3226 if (rule < 0) \
3227 goto invalid_code; \
3228 if (rule < 81) /* old format (before ver.21) */ \
3229 { \
3230 int gref = (rule) / 9; \
3231 int nref = (rule) % 9; \
3232 if (gref == 4) gref = 10; \
3233 if (nref == 4) nref = 10; \
3234 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3235 } \
3236 else /* new format (after ver.21) */ \
3237 { \
3238 int b; \
3239 \
3240 ONE_MORE_BYTE (b); \
3241 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3242 goto invalid_code; \
3243 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3244 rule += 0x100; /* Distinguish it from the old format. */ \
3245 } \
3246 } while (0)
3247
3248 #define ENCODE_COMPOSITION_RULE(rule) \
3249 do { \
3250 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3251 \
3252 if (rule < 0x100) /* old format */ \
3253 { \
3254 if (gref == 10) gref = 4; \
3255 if (nref == 10) nref = 4; \
3256 charbuf[idx] = 32 + gref * 9 + nref; \
3257 charbuf[idx + 1] = -1; \
3258 new_chars++; \
3259 } \
3260 else /* new format */ \
3261 { \
3262 charbuf[idx] = 32 + 81 + gref; \
3263 charbuf[idx + 1] = 32 + nref; \
3264 new_chars += 2; \
3265 } \
3266 } while (0)
3267
3268 /* Finish the current composition as invalid. */
3269
3270 static int
3271 finish_composition (int *charbuf, struct composition_status *cmp_status)
3272 {
3273 int idx = - cmp_status->length;
3274 int new_chars;
3275
3276 /* Recover the original ESC sequence */
3277 charbuf[idx++] = ISO_CODE_ESC;
3278 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3279 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3280 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3281 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3282 : '4');
3283 charbuf[idx++] = -2;
3284 charbuf[idx++] = 0;
3285 charbuf[idx++] = -1;
3286 new_chars = cmp_status->nchars;
3287 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3288 for (; idx < 0; idx++)
3289 {
3290 int elt = charbuf[idx];
3291
3292 if (elt == -2)
3293 {
3294 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3295 idx++;
3296 }
3297 else if (elt == -1)
3298 {
3299 charbuf[idx++] = ISO_CODE_ESC;
3300 charbuf[idx] = '0';
3301 new_chars += 2;
3302 }
3303 }
3304 cmp_status->state = COMPOSING_NO;
3305 return new_chars;
3306 }
3307
3308 /* If characters are under composition, finish the composition. */
3309 #define MAYBE_FINISH_COMPOSITION() \
3310 do { \
3311 if (cmp_status->state != COMPOSING_NO) \
3312 char_offset += finish_composition (charbuf, cmp_status); \
3313 } while (0)
3314
3315 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3316
3317 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3318 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3319 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3320 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3321
3322 Produce this annotation sequence now:
3323
3324 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3325 */
3326
3327 #define DECODE_COMPOSITION_START(c1) \
3328 do { \
3329 if (c1 == '0' \
3330 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3331 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3332 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3333 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3334 { \
3335 *charbuf++ = -1; \
3336 *charbuf++= -1; \
3337 cmp_status->state = COMPOSING_CHAR; \
3338 cmp_status->length += 2; \
3339 } \
3340 else \
3341 { \
3342 MAYBE_FINISH_COMPOSITION (); \
3343 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3344 : c1 == '2' ? COMPOSITION_WITH_RULE \
3345 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3346 : COMPOSITION_WITH_RULE_ALTCHARS); \
3347 cmp_status->state \
3348 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3349 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3350 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3351 cmp_status->nchars = cmp_status->ncomps = 0; \
3352 coding->annotated = 1; \
3353 } \
3354 } while (0)
3355
3356
3357 /* Handle composition end sequence ESC 1. */
3358
3359 #define DECODE_COMPOSITION_END() \
3360 do { \
3361 if (cmp_status->nchars == 0 \
3362 || ((cmp_status->state == COMPOSING_CHAR) \
3363 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3364 { \
3365 MAYBE_FINISH_COMPOSITION (); \
3366 goto invalid_code; \
3367 } \
3368 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3369 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3370 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3371 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3372 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3373 char_offset += cmp_status->nchars; \
3374 cmp_status->state = COMPOSING_NO; \
3375 } while (0)
3376
3377 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3378
3379 #define STORE_COMPOSITION_RULE(rule) \
3380 do { \
3381 *charbuf++ = -2; \
3382 *charbuf++ = rule; \
3383 cmp_status->length += 2; \
3384 cmp_status->state--; \
3385 } while (0)
3386
3387 /* Store a composed char or a component char C in charbuf, and update
3388 cmp_status. */
3389
3390 #define STORE_COMPOSITION_CHAR(c) \
3391 do { \
3392 *charbuf++ = (c); \
3393 cmp_status->length++; \
3394 if (cmp_status->state == COMPOSING_CHAR) \
3395 cmp_status->nchars++; \
3396 else \
3397 cmp_status->ncomps++; \
3398 if (cmp_status->method == COMPOSITION_WITH_RULE \
3399 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3400 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3401 cmp_status->state++; \
3402 } while (0)
3403
3404
3405 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3406
3407 static void
3408 decode_coding_iso_2022 (struct coding_system *coding)
3409 {
3410 const unsigned char *src = coding->source + coding->consumed;
3411 const unsigned char *src_end = coding->source + coding->src_bytes;
3412 const unsigned char *src_base;
3413 int *charbuf = coding->charbuf + coding->charbuf_used;
3414 /* We may produce two annotations (charset and composition) in one
3415 loop and one more charset annotation at the end. */
3416 int *charbuf_end
3417 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3418 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3419 bool multibytep = coding->src_multibyte;
3420 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3421 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3422 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3423 int charset_id_2, charset_id_3;
3424 struct charset *charset;
3425 int c;
3426 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3427 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3428 ptrdiff_t char_offset = coding->produced_char;
3429 ptrdiff_t last_offset = char_offset;
3430 int last_id = charset_ascii;
3431 bool eol_dos
3432 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3433 int byte_after_cr = -1;
3434 int i;
3435
3436 setup_iso_safe_charsets (attrs);
3437 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3438
3439 if (cmp_status->state != COMPOSING_NO)
3440 {
3441 if (charbuf_end - charbuf < cmp_status->length)
3442 emacs_abort ();
3443 for (i = 0; i < cmp_status->length; i++)
3444 *charbuf++ = cmp_status->carryover[i];
3445 coding->annotated = 1;
3446 }
3447
3448 while (1)
3449 {
3450 int c1, c2, c3;
3451
3452 src_base = src;
3453 consumed_chars_base = consumed_chars;
3454
3455 if (charbuf >= charbuf_end)
3456 {
3457 if (byte_after_cr >= 0)
3458 src_base--;
3459 break;
3460 }
3461
3462 if (byte_after_cr >= 0)
3463 c1 = byte_after_cr, byte_after_cr = -1;
3464 else
3465 ONE_MORE_BYTE (c1);
3466 if (c1 < 0)
3467 goto invalid_code;
3468
3469 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3470 {
3471 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3472 char_offset++;
3473 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3474 continue;
3475 }
3476
3477 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3478 {
3479 if (c1 == ISO_CODE_ESC)
3480 {
3481 if (src + 1 >= src_end)
3482 goto no_more_source;
3483 *charbuf++ = ISO_CODE_ESC;
3484 char_offset++;
3485 if (src[0] == '%' && src[1] == '@')
3486 {
3487 src += 2;
3488 consumed_chars += 2;
3489 char_offset += 2;
3490 /* We are sure charbuf can contain two more chars. */
3491 *charbuf++ = '%';
3492 *charbuf++ = '@';
3493 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3494 }
3495 }
3496 else
3497 {
3498 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3499 char_offset++;
3500 }
3501 continue;
3502 }
3503
3504 if ((cmp_status->state == COMPOSING_RULE
3505 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3506 && c1 != ISO_CODE_ESC)
3507 {
3508 int rule;
3509
3510 DECODE_COMPOSITION_RULE (rule);
3511 STORE_COMPOSITION_RULE (rule);
3512 continue;
3513 }
3514
3515 /* We produce at most one character. */
3516 switch (iso_code_class [c1])
3517 {
3518 case ISO_0x20_or_0x7F:
3519 if (charset_id_0 < 0
3520 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3521 /* This is SPACE or DEL. */
3522 charset = CHARSET_FROM_ID (charset_ascii);
3523 else
3524 charset = CHARSET_FROM_ID (charset_id_0);
3525 break;
3526
3527 case ISO_graphic_plane_0:
3528 if (charset_id_0 < 0)
3529 charset = CHARSET_FROM_ID (charset_ascii);
3530 else
3531 charset = CHARSET_FROM_ID (charset_id_0);
3532 break;
3533
3534 case ISO_0xA0_or_0xFF:
3535 if (charset_id_1 < 0
3536 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3537 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3538 goto invalid_code;
3539 /* This is a graphic character, we fall down ... */
3540
3541 case ISO_graphic_plane_1:
3542 if (charset_id_1 < 0)
3543 goto invalid_code;
3544 charset = CHARSET_FROM_ID (charset_id_1);
3545 break;
3546
3547 case ISO_control_0:
3548 if (eol_dos && c1 == '\r')
3549 ONE_MORE_BYTE (byte_after_cr);
3550 MAYBE_FINISH_COMPOSITION ();
3551 charset = CHARSET_FROM_ID (charset_ascii);
3552 break;
3553
3554 case ISO_control_1:
3555 goto invalid_code;
3556
3557 case ISO_shift_out:
3558 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3559 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3560 goto invalid_code;
3561 CODING_ISO_INVOCATION (coding, 0) = 1;
3562 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3563 continue;
3564
3565 case ISO_shift_in:
3566 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3567 goto invalid_code;
3568 CODING_ISO_INVOCATION (coding, 0) = 0;
3569 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3570 continue;
3571
3572 case ISO_single_shift_2_7:
3573 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3574 goto invalid_code;
3575 case ISO_single_shift_2:
3576 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577 goto invalid_code;
3578 /* SS2 is handled as an escape sequence of ESC 'N' */
3579 c1 = 'N';
3580 goto label_escape_sequence;
3581
3582 case ISO_single_shift_3:
3583 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3584 goto invalid_code;
3585 /* SS2 is handled as an escape sequence of ESC 'O' */
3586 c1 = 'O';
3587 goto label_escape_sequence;
3588
3589 case ISO_control_sequence_introducer:
3590 /* CSI is handled as an escape sequence of ESC '[' ... */
3591 c1 = '[';
3592 goto label_escape_sequence;
3593
3594 case ISO_escape:
3595 ONE_MORE_BYTE (c1);
3596 label_escape_sequence:
3597 /* Escape sequences handled here are invocation,
3598 designation, direction specification, and character
3599 composition specification. */
3600 switch (c1)
3601 {
3602 case '&': /* revision of following character set */
3603 ONE_MORE_BYTE (c1);
3604 if (!(c1 >= '@' && c1 <= '~'))
3605 goto invalid_code;
3606 ONE_MORE_BYTE (c1);
3607 if (c1 != ISO_CODE_ESC)
3608 goto invalid_code;
3609 ONE_MORE_BYTE (c1);
3610 goto label_escape_sequence;
3611
3612 case '$': /* designation of 2-byte character set */
3613 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3614 goto invalid_code;
3615 {
3616 int reg, chars96;
3617
3618 ONE_MORE_BYTE (c1);
3619 if (c1 >= '@' && c1 <= 'B')
3620 { /* designation of JISX0208.1978, GB2312.1980,
3621 or JISX0208.1980 */
3622 reg = 0, chars96 = 0;
3623 }
3624 else if (c1 >= 0x28 && c1 <= 0x2B)
3625 { /* designation of DIMENSION2_CHARS94 character set */
3626 reg = c1 - 0x28, chars96 = 0;
3627 ONE_MORE_BYTE (c1);
3628 }
3629 else if (c1 >= 0x2C && c1 <= 0x2F)
3630 { /* designation of DIMENSION2_CHARS96 character set */
3631 reg = c1 - 0x2C, chars96 = 1;
3632 ONE_MORE_BYTE (c1);
3633 }
3634 else
3635 goto invalid_code;
3636 DECODE_DESIGNATION (reg, 2, chars96, c1);
3637 /* We must update these variables now. */
3638 if (reg == 0)
3639 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640 else if (reg == 1)
3641 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3642 if (chars96 < 0)
3643 goto invalid_code;
3644 }
3645 continue;
3646
3647 case 'n': /* invocation of locking-shift-2 */
3648 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3649 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3650 goto invalid_code;
3651 CODING_ISO_INVOCATION (coding, 0) = 2;
3652 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3653 continue;
3654
3655 case 'o': /* invocation of locking-shift-3 */
3656 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3657 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3658 goto invalid_code;
3659 CODING_ISO_INVOCATION (coding, 0) = 3;
3660 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3661 continue;
3662
3663 case 'N': /* invocation of single-shift-2 */
3664 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3665 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3666 goto invalid_code;
3667 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3668 if (charset_id_2 < 0)
3669 charset = CHARSET_FROM_ID (charset_ascii);
3670 else
3671 charset = CHARSET_FROM_ID (charset_id_2);
3672 ONE_MORE_BYTE (c1);
3673 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3674 goto invalid_code;
3675 break;
3676
3677 case 'O': /* invocation of single-shift-3 */
3678 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3679 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3680 goto invalid_code;
3681 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3682 if (charset_id_3 < 0)
3683 charset = CHARSET_FROM_ID (charset_ascii);
3684 else
3685 charset = CHARSET_FROM_ID (charset_id_3);
3686 ONE_MORE_BYTE (c1);
3687 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3688 goto invalid_code;
3689 break;
3690
3691 case '0': case '2': case '3': case '4': /* start composition */
3692 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3693 goto invalid_code;
3694 if (last_id != charset_ascii)
3695 {
3696 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3697 last_id = charset_ascii;
3698 last_offset = char_offset;
3699 }
3700 DECODE_COMPOSITION_START (c1);
3701 continue;
3702
3703 case '1': /* end composition */
3704 if (cmp_status->state == COMPOSING_NO)
3705 goto invalid_code;
3706 DECODE_COMPOSITION_END ();
3707 continue;
3708
3709 case '[': /* specification of direction */
3710 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3711 goto invalid_code;
3712 /* For the moment, nested direction is not supported.
3713 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3714 left-to-right, and nonzero means right-to-left. */
3715 ONE_MORE_BYTE (c1);
3716 switch (c1)
3717 {
3718 case ']': /* end of the current direction */
3719 coding->mode &= ~CODING_MODE_DIRECTION;
3720
3721 case '0': /* end of the current direction */
3722 case '1': /* start of left-to-right direction */
3723 ONE_MORE_BYTE (c1);
3724 if (c1 == ']')
3725 coding->mode &= ~CODING_MODE_DIRECTION;
3726 else
3727 goto invalid_code;
3728 break;
3729
3730 case '2': /* start of right-to-left direction */
3731 ONE_MORE_BYTE (c1);
3732 if (c1 == ']')
3733 coding->mode |= CODING_MODE_DIRECTION;
3734 else
3735 goto invalid_code;
3736 break;
3737
3738 default:
3739 goto invalid_code;
3740 }
3741 continue;
3742
3743 case '%':
3744 ONE_MORE_BYTE (c1);
3745 if (c1 == '/')
3746 {
3747 /* CTEXT extended segment:
3748 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3749 We keep these bytes as is for the moment.
3750 They may be decoded by post-read-conversion. */
3751 int dim, M, L;
3752 int size;
3753
3754 ONE_MORE_BYTE (dim);
3755 if (dim < '0' || dim > '4')
3756 goto invalid_code;
3757 ONE_MORE_BYTE (M);
3758 if (M < 128)
3759 goto invalid_code;
3760 ONE_MORE_BYTE (L);
3761 if (L < 128)
3762 goto invalid_code;
3763 size = ((M - 128) * 128) + (L - 128);
3764 if (charbuf + 6 > charbuf_end)
3765 goto break_loop;
3766 *charbuf++ = ISO_CODE_ESC;
3767 *charbuf++ = '%';
3768 *charbuf++ = '/';
3769 *charbuf++ = dim;
3770 *charbuf++ = BYTE8_TO_CHAR (M);
3771 *charbuf++ = BYTE8_TO_CHAR (L);
3772 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3773 }
3774 else if (c1 == 'G')
3775 {
3776 /* XFree86 extension for embedding UTF-8 in CTEXT:
3777 ESC % G --UTF-8-BYTES-- ESC % @
3778 We keep these bytes as is for the moment.
3779 They may be decoded by post-read-conversion. */
3780 if (charbuf + 3 > charbuf_end)
3781 goto break_loop;
3782 *charbuf++ = ISO_CODE_ESC;
3783 *charbuf++ = '%';
3784 *charbuf++ = 'G';
3785 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3786 }
3787 else
3788 goto invalid_code;
3789 continue;
3790 break;
3791
3792 default:
3793 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3794 goto invalid_code;
3795 {
3796 int reg, chars96;
3797
3798 if (c1 >= 0x28 && c1 <= 0x2B)
3799 { /* designation of DIMENSION1_CHARS94 character set */
3800 reg = c1 - 0x28, chars96 = 0;
3801 ONE_MORE_BYTE (c1);
3802 }
3803 else if (c1 >= 0x2C && c1 <= 0x2F)
3804 { /* designation of DIMENSION1_CHARS96 character set */
3805 reg = c1 - 0x2C, chars96 = 1;
3806 ONE_MORE_BYTE (c1);
3807 }
3808 else
3809 goto invalid_code;
3810 DECODE_DESIGNATION (reg, 1, chars96, c1);
3811 /* We must update these variables now. */
3812 if (reg == 0)
3813 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3814 else if (reg == 1)
3815 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3816 if (chars96 < 0)
3817 goto invalid_code;
3818 }
3819 continue;
3820 }
3821 break;
3822
3823 default:
3824 emacs_abort ();
3825 }
3826
3827 if (cmp_status->state == COMPOSING_NO
3828 && charset->id != charset_ascii
3829 && last_id != charset->id)
3830 {
3831 if (last_id != charset_ascii)
3832 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3833 last_id = charset->id;
3834 last_offset = char_offset;
3835 }
3836
3837 /* Now we know CHARSET and 1st position code C1 of a character.
3838 Produce a decoded character while getting 2nd and 3rd
3839 position codes C2, C3 if necessary. */
3840 if (CHARSET_DIMENSION (charset) > 1)
3841 {
3842 ONE_MORE_BYTE (c2);
3843 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3844 || ((c1 & 0x80) != (c2 & 0x80)))
3845 /* C2 is not in a valid range. */
3846 goto invalid_code;
3847 if (CHARSET_DIMENSION (charset) == 2)
3848 c1 = (c1 << 8) | c2;
3849 else
3850 {
3851 ONE_MORE_BYTE (c3);
3852 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3853 || ((c1 & 0x80) != (c3 & 0x80)))
3854 /* C3 is not in a valid range. */
3855 goto invalid_code;
3856 c1 = (c1 << 16) | (c2 << 8) | c2;
3857 }
3858 }
3859 c1 &= 0x7F7F7F;
3860 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3861 if (c < 0)
3862 {
3863 MAYBE_FINISH_COMPOSITION ();
3864 for (; src_base < src; src_base++, char_offset++)
3865 {
3866 if (ASCII_BYTE_P (*src_base))
3867 *charbuf++ = *src_base;
3868 else
3869 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3870 }
3871 }
3872 else if (cmp_status->state == COMPOSING_NO)
3873 {
3874 *charbuf++ = c;
3875 char_offset++;
3876 }
3877 else if ((cmp_status->state == COMPOSING_CHAR
3878 ? cmp_status->nchars
3879 : cmp_status->ncomps)
3880 >= MAX_COMPOSITION_COMPONENTS)
3881 {
3882 /* Too long composition. */
3883 MAYBE_FINISH_COMPOSITION ();
3884 *charbuf++ = c;
3885 char_offset++;
3886 }
3887 else
3888 STORE_COMPOSITION_CHAR (c);
3889 continue;
3890
3891 invalid_code:
3892 MAYBE_FINISH_COMPOSITION ();
3893 src = src_base;
3894 consumed_chars = consumed_chars_base;
3895 ONE_MORE_BYTE (c);
3896 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3897 char_offset++;
3898 coding->errors++;
3899 continue;
3900
3901 break_loop:
3902 break;
3903 }
3904
3905 no_more_source:
3906 if (cmp_status->state != COMPOSING_NO)
3907 {
3908 if (coding->mode & CODING_MODE_LAST_BLOCK)
3909 MAYBE_FINISH_COMPOSITION ();
3910 else
3911 {
3912 charbuf -= cmp_status->length;
3913 for (i = 0; i < cmp_status->length; i++)
3914 cmp_status->carryover[i] = charbuf[i];
3915 }
3916 }
3917 else if (last_id != charset_ascii)
3918 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3919 coding->consumed_char += consumed_chars_base;
3920 coding->consumed = src_base - coding->source;
3921 coding->charbuf_used = charbuf - coding->charbuf;
3922 }
3923
3924
3925 /* ISO2022 encoding stuff. */
3926
3927 /*
3928 It is not enough to say just "ISO2022" on encoding, we have to
3929 specify more details. In Emacs, each coding system of ISO2022
3930 variant has the following specifications:
3931 1. Initial designation to G0 thru G3.
3932 2. Allows short-form designation?
3933 3. ASCII should be designated to G0 before control characters?
3934 4. ASCII should be designated to G0 at end of line?
3935 5. 7-bit environment or 8-bit environment?
3936 6. Use locking-shift?
3937 7. Use Single-shift?
3938 And the following two are only for Japanese:
3939 8. Use ASCII in place of JIS0201-1976-Roman?
3940 9. Use JISX0208-1983 in place of JISX0208-1978?
3941 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3942 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3943 details.
3944 */
3945
3946 /* Produce codes (escape sequence) for designating CHARSET to graphic
3947 register REG at DST, and increment DST. If <final-char> of CHARSET is
3948 '@', 'A', or 'B' and the coding system CODING allows, produce
3949 designation sequence of short-form. */
3950
3951 #define ENCODE_DESIGNATION(charset, reg, coding) \
3952 do { \
3953 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3954 const char *intermediate_char_94 = "()*+"; \
3955 const char *intermediate_char_96 = ",-./"; \
3956 int revision = -1; \
3957 \
3958 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3959 revision = CHARSET_ISO_REVISION (charset); \
3960 \
3961 if (revision >= 0) \
3962 { \
3963 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3964 EMIT_ONE_BYTE ('@' + revision); \
3965 } \
3966 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3967 if (CHARSET_DIMENSION (charset) == 1) \
3968 { \
3969 int b; \
3970 if (! CHARSET_ISO_CHARS_96 (charset)) \
3971 b = intermediate_char_94[reg]; \
3972 else \
3973 b = intermediate_char_96[reg]; \
3974 EMIT_ONE_ASCII_BYTE (b); \
3975 } \
3976 else \
3977 { \
3978 EMIT_ONE_ASCII_BYTE ('$'); \
3979 if (! CHARSET_ISO_CHARS_96 (charset)) \
3980 { \
3981 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3982 || reg != 0 \
3983 || final_char < '@' || final_char > 'B') \
3984 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3985 } \
3986 else \
3987 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3988 } \
3989 EMIT_ONE_ASCII_BYTE (final_char); \
3990 \
3991 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3992 } while (0)
3993
3994
3995 /* The following two macros produce codes (control character or escape
3996 sequence) for ISO2022 single-shift functions (single-shift-2 and
3997 single-shift-3). */
3998
3999 #define ENCODE_SINGLE_SHIFT_2 \
4000 do { \
4001 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4002 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4003 else \
4004 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4005 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4006 } while (0)
4007
4008
4009 #define ENCODE_SINGLE_SHIFT_3 \
4010 do { \
4011 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4012 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4013 else \
4014 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4015 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4016 } while (0)
4017
4018
4019 /* The following four macros produce codes (control character or
4020 escape sequence) for ISO2022 locking-shift functions (shift-in,
4021 shift-out, locking-shift-2, and locking-shift-3). */
4022
4023 #define ENCODE_SHIFT_IN \
4024 do { \
4025 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4026 CODING_ISO_INVOCATION (coding, 0) = 0; \
4027 } while (0)
4028
4029
4030 #define ENCODE_SHIFT_OUT \
4031 do { \
4032 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4033 CODING_ISO_INVOCATION (coding, 0) = 1; \
4034 } while (0)
4035
4036
4037 #define ENCODE_LOCKING_SHIFT_2 \
4038 do { \
4039 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4040 CODING_ISO_INVOCATION (coding, 0) = 2; \
4041 } while (0)
4042
4043
4044 #define ENCODE_LOCKING_SHIFT_3 \
4045 do { \
4046 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4047 CODING_ISO_INVOCATION (coding, 0) = 3; \
4048 } while (0)
4049
4050
4051 /* Produce codes for a DIMENSION1 character whose character set is
4052 CHARSET and whose position-code is C1. Designation and invocation
4053 sequences are also produced in advance if necessary. */
4054
4055 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4056 do { \
4057 int id = CHARSET_ID (charset); \
4058 \
4059 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4060 && id == charset_ascii) \
4061 { \
4062 id = charset_jisx0201_roman; \
4063 charset = CHARSET_FROM_ID (id); \
4064 } \
4065 \
4066 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4067 { \
4068 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4069 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4070 else \
4071 EMIT_ONE_BYTE (c1 | 0x80); \
4072 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4073 break; \
4074 } \
4075 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4076 { \
4077 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4078 break; \
4079 } \
4080 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4081 { \
4082 EMIT_ONE_BYTE (c1 | 0x80); \
4083 break; \
4084 } \
4085 else \
4086 /* Since CHARSET is not yet invoked to any graphic planes, we \
4087 must invoke it, or, at first, designate it to some graphic \
4088 register. Then repeat the loop to actually produce the \
4089 character. */ \
4090 dst = encode_invocation_designation (charset, coding, dst, \
4091 &produced_chars); \
4092 } while (1)
4093
4094
4095 /* Produce codes for a DIMENSION2 character whose character set is
4096 CHARSET and whose position-codes are C1 and C2. Designation and
4097 invocation codes are also produced in advance if necessary. */
4098
4099 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4100 do { \
4101 int id = CHARSET_ID (charset); \
4102 \
4103 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4104 && id == charset_jisx0208) \
4105 { \
4106 id = charset_jisx0208_1978; \
4107 charset = CHARSET_FROM_ID (id); \
4108 } \
4109 \
4110 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4111 { \
4112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4113 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4114 else \
4115 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4116 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4117 break; \
4118 } \
4119 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4120 { \
4121 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4122 break; \
4123 } \
4124 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4125 { \
4126 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4127 break; \
4128 } \
4129 else \
4130 /* Since CHARSET is not yet invoked to any graphic planes, we \
4131 must invoke it, or, at first, designate it to some graphic \
4132 register. Then repeat the loop to actually produce the \
4133 character. */ \
4134 dst = encode_invocation_designation (charset, coding, dst, \
4135 &produced_chars); \
4136 } while (1)
4137
4138
4139 #define ENCODE_ISO_CHARACTER(charset, c) \
4140 do { \
4141 unsigned code; \
4142 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4143 \
4144 if (CHARSET_DIMENSION (charset) == 1) \
4145 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4146 else \
4147 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4148 } while (0)
4149
4150
4151 /* Produce designation and invocation codes at a place pointed by DST
4152 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4153 Return new DST. */
4154
4155 static unsigned char *
4156 encode_invocation_designation (struct charset *charset,
4157 struct coding_system *coding,
4158 unsigned char *dst, ptrdiff_t *p_nchars)
4159 {
4160 bool multibytep = coding->dst_multibyte;
4161 ptrdiff_t produced_chars = *p_nchars;
4162 int reg; /* graphic register number */
4163 int id = CHARSET_ID (charset);
4164
4165 /* At first, check designations. */
4166 for (reg = 0; reg < 4; reg++)
4167 if (id == CODING_ISO_DESIGNATION (coding, reg))
4168 break;
4169
4170 if (reg >= 4)
4171 {
4172 /* CHARSET is not yet designated to any graphic registers. */
4173 /* At first check the requested designation. */
4174 reg = CODING_ISO_REQUEST (coding, id);
4175 if (reg < 0)
4176 /* Since CHARSET requests no special designation, designate it
4177 to graphic register 0. */
4178 reg = 0;
4179
4180 ENCODE_DESIGNATION (charset, reg, coding);
4181 }
4182
4183 if (CODING_ISO_INVOCATION (coding, 0) != reg
4184 && CODING_ISO_INVOCATION (coding, 1) != reg)
4185 {
4186 /* Since the graphic register REG is not invoked to any graphic
4187 planes, invoke it to graphic plane 0. */
4188 switch (reg)
4189 {
4190 case 0: /* graphic register 0 */
4191 ENCODE_SHIFT_IN;
4192 break;
4193
4194 case 1: /* graphic register 1 */
4195 ENCODE_SHIFT_OUT;
4196 break;
4197
4198 case 2: /* graphic register 2 */
4199 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4200 ENCODE_SINGLE_SHIFT_2;
4201 else
4202 ENCODE_LOCKING_SHIFT_2;
4203 break;
4204
4205 case 3: /* graphic register 3 */
4206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4207 ENCODE_SINGLE_SHIFT_3;
4208 else
4209 ENCODE_LOCKING_SHIFT_3;
4210 break;
4211 }
4212 }
4213
4214 *p_nchars = produced_chars;
4215 return dst;
4216 }
4217
4218
4219 /* Produce codes for designation and invocation to reset the graphic
4220 planes and registers to initial state. */
4221 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4222 do { \
4223 int reg; \
4224 struct charset *charset; \
4225 \
4226 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4227 ENCODE_SHIFT_IN; \
4228 for (reg = 0; reg < 4; reg++) \
4229 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4230 && (CODING_ISO_DESIGNATION (coding, reg) \
4231 != CODING_ISO_INITIAL (coding, reg))) \
4232 { \
4233 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4234 ENCODE_DESIGNATION (charset, reg, coding); \
4235 } \
4236 } while (0)
4237
4238
4239 /* Produce designation sequences of charsets in the line started from
4240 CHARBUF to a place pointed by DST, and return the number of
4241 produced bytes. DST should not directly point a buffer text area
4242 which may be relocated by char_charset call.
4243
4244 If the current block ends before any end-of-line, we may fail to
4245 find all the necessary designations. */
4246
4247 static ptrdiff_t
4248 encode_designation_at_bol (struct coding_system *coding,
4249 int *charbuf, int *charbuf_end,
4250 unsigned char *dst)
4251 {
4252 unsigned char *orig = dst;
4253 struct charset *charset;
4254 /* Table of charsets to be designated to each graphic register. */
4255 int r[4];
4256 int c, found = 0, reg;
4257 ptrdiff_t produced_chars = 0;
4258 bool multibytep = coding->dst_multibyte;
4259 Lisp_Object attrs;
4260 Lisp_Object charset_list;
4261
4262 attrs = CODING_ID_ATTRS (coding->id);
4263 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4264 if (EQ (charset_list, Qiso_2022))
4265 charset_list = Viso_2022_charset_list;
4266
4267 for (reg = 0; reg < 4; reg++)
4268 r[reg] = -1;
4269
4270 while (charbuf < charbuf_end && found < 4)
4271 {
4272 int id;
4273
4274 c = *charbuf++;
4275 if (c == '\n')
4276 break;
4277 charset = char_charset (c, charset_list, NULL);
4278 id = CHARSET_ID (charset);
4279 reg = CODING_ISO_REQUEST (coding, id);
4280 if (reg >= 0 && r[reg] < 0)
4281 {
4282 found++;
4283 r[reg] = id;
4284 }
4285 }
4286
4287 if (found)
4288 {
4289 for (reg = 0; reg < 4; reg++)
4290 if (r[reg] >= 0
4291 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4292 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4293 }
4294
4295 return dst - orig;
4296 }
4297
4298 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4299
4300 static bool
4301 encode_coding_iso_2022 (struct coding_system *coding)
4302 {
4303 bool multibytep = coding->dst_multibyte;
4304 int *charbuf = coding->charbuf;
4305 int *charbuf_end = charbuf + coding->charbuf_used;
4306 unsigned char *dst = coding->destination + coding->produced;
4307 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4308 int safe_room = 16;
4309 bool bol_designation
4310 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4311 && CODING_ISO_BOL (coding));
4312 ptrdiff_t produced_chars = 0;
4313 Lisp_Object attrs, eol_type, charset_list;
4314 bool ascii_compatible;
4315 int c;
4316 int preferred_charset_id = -1;
4317
4318 CODING_GET_INFO (coding, attrs, charset_list);
4319 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4320 if (VECTORP (eol_type))
4321 eol_type = Qunix;
4322
4323 setup_iso_safe_charsets (attrs);
4324 /* Charset list may have been changed. */
4325 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4326 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4327
4328 ascii_compatible
4329 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4330 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4331 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4332
4333 while (charbuf < charbuf_end)
4334 {
4335 ASSURE_DESTINATION (safe_room);
4336
4337 if (bol_designation)
4338 {
4339 /* We have to produce designation sequences if any now. */
4340 unsigned char desig_buf[16];
4341 int nbytes;
4342 ptrdiff_t offset;
4343
4344 charset_map_loaded = 0;
4345 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4346 desig_buf);
4347 if (charset_map_loaded
4348 && (offset = coding_change_destination (coding)))
4349 {
4350 dst += offset;
4351 dst_end += offset;
4352 }
4353 memcpy (dst, desig_buf, nbytes);
4354 dst += nbytes;
4355 /* We are sure that designation sequences are all ASCII bytes. */
4356 produced_chars += nbytes;
4357 bol_designation = 0;
4358 ASSURE_DESTINATION (safe_room);
4359 }
4360
4361 c = *charbuf++;
4362
4363 if (c < 0)
4364 {
4365 /* Handle an annotation. */
4366 switch (*charbuf)
4367 {
4368 case CODING_ANNOTATE_COMPOSITION_MASK:
4369 /* Not yet implemented. */
4370 break;
4371 case CODING_ANNOTATE_CHARSET_MASK:
4372 preferred_charset_id = charbuf[2];
4373 if (preferred_charset_id >= 0
4374 && NILP (Fmemq (make_number (preferred_charset_id),
4375 charset_list)))
4376 preferred_charset_id = -1;
4377 break;
4378 default:
4379 emacs_abort ();
4380 }
4381 charbuf += -c - 1;
4382 continue;
4383 }
4384
4385 /* Now encode the character C. */
4386 if (c < 0x20 || c == 0x7F)
4387 {
4388 if (c == '\n'
4389 || (c == '\r' && EQ (eol_type, Qmac)))
4390 {
4391 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4392 ENCODE_RESET_PLANE_AND_REGISTER ();
4393 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4394 {
4395 int i;
4396
4397 for (i = 0; i < 4; i++)
4398 CODING_ISO_DESIGNATION (coding, i)
4399 = CODING_ISO_INITIAL (coding, i);
4400 }
4401 bol_designation = ((CODING_ISO_FLAGS (coding)
4402 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4403 != 0);
4404 }
4405 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4406 ENCODE_RESET_PLANE_AND_REGISTER ();
4407 EMIT_ONE_ASCII_BYTE (c);
4408 }
4409 else if (ASCII_CHAR_P (c))
4410 {
4411 if (ascii_compatible)
4412 EMIT_ONE_ASCII_BYTE (c);
4413 else
4414 {
4415 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4416 ENCODE_ISO_CHARACTER (charset, c);
4417 }
4418 }
4419 else if (CHAR_BYTE8_P (c))
4420 {
4421 c = CHAR_TO_BYTE8 (c);
4422 EMIT_ONE_BYTE (c);
4423 }
4424 else
4425 {
4426 struct charset *charset;
4427
4428 if (preferred_charset_id >= 0)
4429 {
4430 bool result;
4431
4432 charset = CHARSET_FROM_ID (preferred_charset_id);
4433 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4434 if (! result)
4435 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4436 NULL, charset);
4437 }
4438 else
4439 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4440 NULL, charset);
4441 if (!charset)
4442 {
4443 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4444 {
4445 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4446 charset = CHARSET_FROM_ID (charset_ascii);
4447 }
4448 else
4449 {
4450 c = coding->default_char;
4451 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4452 charset_list, NULL, charset);
4453 }
4454 }
4455 ENCODE_ISO_CHARACTER (charset, c);
4456 }
4457 }
4458
4459 if (coding->mode & CODING_MODE_LAST_BLOCK
4460 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4461 {
4462 ASSURE_DESTINATION (safe_room);
4463 ENCODE_RESET_PLANE_AND_REGISTER ();
4464 }
4465 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4466 CODING_ISO_BOL (coding) = bol_designation;
4467 coding->produced_char += produced_chars;
4468 coding->produced = dst - coding->destination;
4469 return 0;
4470 }
4471
4472 \f
4473 /*** 8,9. SJIS and BIG5 handlers ***/
4474
4475 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4476 quite widely. So, for the moment, Emacs supports them in the bare
4477 C code. But, in the future, they may be supported only by CCL. */
4478
4479 /* SJIS is a coding system encoding three character sets: ASCII, right
4480 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4481 as is. A character of charset katakana-jisx0201 is encoded by
4482 "position-code + 0x80". A character of charset japanese-jisx0208
4483 is encoded in 2-byte but two position-codes are divided and shifted
4484 so that it fit in the range below.
4485
4486 --- CODE RANGE of SJIS ---
4487 (character set) (range)
4488 ASCII 0x00 .. 0x7F
4489 KATAKANA-JISX0201 0xA0 .. 0xDF
4490 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4491 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4492 -------------------------------
4493
4494 */
4495
4496 /* BIG5 is a coding system encoding two character sets: ASCII and
4497 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4498 character set and is encoded in two-byte.
4499
4500 --- CODE RANGE of BIG5 ---
4501 (character set) (range)
4502 ASCII 0x00 .. 0x7F
4503 Big5 (1st byte) 0xA1 .. 0xFE
4504 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4505 --------------------------
4506
4507 */
4508
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510 Return true if a text is encoded in SJIS. */
4511
4512 static bool
4513 detect_coding_sjis (struct coding_system *coding,
4514 struct coding_detection_info *detect_info)
4515 {
4516 const unsigned char *src = coding->source, *src_base;
4517 const unsigned char *src_end = coding->source + coding->src_bytes;
4518 bool multibytep = coding->src_multibyte;
4519 ptrdiff_t consumed_chars = 0;
4520 int found = 0;
4521 int c;
4522 Lisp_Object attrs, charset_list;
4523 int max_first_byte_of_2_byte_code;
4524
4525 CODING_GET_INFO (coding, attrs, charset_list);
4526 max_first_byte_of_2_byte_code
4527 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4528
4529 detect_info->checked |= CATEGORY_MASK_SJIS;
4530 /* A coding system of this category is always ASCII compatible. */
4531 src += coding->head_ascii;
4532
4533 while (1)
4534 {
4535 src_base = src;
4536 ONE_MORE_BYTE (c);
4537 if (c < 0x80)
4538 continue;
4539 if ((c >= 0x81 && c <= 0x9F)
4540 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4541 {
4542 ONE_MORE_BYTE (c);
4543 if (c < 0x40 || c == 0x7F || c > 0xFC)
4544 break;
4545 found = CATEGORY_MASK_SJIS;
4546 }
4547 else if (c >= 0xA0 && c < 0xE0)
4548 found = CATEGORY_MASK_SJIS;
4549 else
4550 break;
4551 }
4552 detect_info->rejected |= CATEGORY_MASK_SJIS;
4553 return 0;
4554
4555 no_more_source:
4556 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4557 {
4558 detect_info->rejected |= CATEGORY_MASK_SJIS;
4559 return 0;
4560 }
4561 detect_info->found |= found;
4562 return 1;
4563 }
4564
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Return true if a text is encoded in BIG5. */
4567
4568 static bool
4569 detect_coding_big5 (struct coding_system *coding,
4570 struct coding_detection_info *detect_info)
4571 {
4572 const unsigned char *src = coding->source, *src_base;
4573 const unsigned char *src_end = coding->source + coding->src_bytes;
4574 bool multibytep = coding->src_multibyte;
4575 ptrdiff_t consumed_chars = 0;
4576 int found = 0;
4577 int c;
4578
4579 detect_info->checked |= CATEGORY_MASK_BIG5;
4580 /* A coding system of this category is always ASCII compatible. */
4581 src += coding->head_ascii;
4582
4583 while (1)
4584 {
4585 src_base = src;
4586 ONE_MORE_BYTE (c);
4587 if (c < 0x80)
4588 continue;
4589 if (c >= 0xA1)
4590 {
4591 ONE_MORE_BYTE (c);
4592 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4593 return 0;
4594 found = CATEGORY_MASK_BIG5;
4595 }
4596 else
4597 break;
4598 }
4599 detect_info->rejected |= CATEGORY_MASK_BIG5;
4600 return 0;
4601
4602 no_more_source:
4603 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4604 {
4605 detect_info->rejected |= CATEGORY_MASK_BIG5;
4606 return 0;
4607 }
4608 detect_info->found |= found;
4609 return 1;
4610 }
4611
4612 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4613
4614 static void
4615 decode_coding_sjis (struct coding_system *coding)
4616 {
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
4620 int *charbuf = coding->charbuf + coding->charbuf_used;
4621 /* We may produce one charset annotation in one loop and one more at
4622 the end. */
4623 int *charbuf_end
4624 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4625 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4626 bool multibytep = coding->src_multibyte;
4627 struct charset *charset_roman, *charset_kanji, *charset_kana;
4628 struct charset *charset_kanji2;
4629 Lisp_Object attrs, charset_list, val;
4630 ptrdiff_t char_offset = coding->produced_char;
4631 ptrdiff_t last_offset = char_offset;
4632 int last_id = charset_ascii;
4633 bool eol_dos
4634 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4635 int byte_after_cr = -1;
4636
4637 CODING_GET_INFO (coding, attrs, charset_list);
4638
4639 val = charset_list;
4640 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4643 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4644
4645 while (1)
4646 {
4647 int c, c1;
4648 struct charset *charset;
4649
4650 src_base = src;
4651 consumed_chars_base = consumed_chars;
4652
4653 if (charbuf >= charbuf_end)
4654 {
4655 if (byte_after_cr >= 0)
4656 src_base--;
4657 break;
4658 }
4659
4660 if (byte_after_cr >= 0)
4661 c = byte_after_cr, byte_after_cr = -1;
4662 else
4663 ONE_MORE_BYTE (c);
4664 if (c < 0)
4665 goto invalid_code;
4666 if (c < 0x80)
4667 {
4668 if (eol_dos && c == '\r')
4669 ONE_MORE_BYTE (byte_after_cr);
4670 charset = charset_roman;
4671 }
4672 else if (c == 0x80 || c == 0xA0)
4673 goto invalid_code;
4674 else if (c >= 0xA1 && c <= 0xDF)
4675 {
4676 /* SJIS -> JISX0201-Kana */
4677 c &= 0x7F;
4678 charset = charset_kana;
4679 }
4680 else if (c <= 0xEF)
4681 {
4682 /* SJIS -> JISX0208 */
4683 ONE_MORE_BYTE (c1);
4684 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4685 goto invalid_code;
4686 c = (c << 8) | c1;
4687 SJIS_TO_JIS (c);
4688 charset = charset_kanji;
4689 }
4690 else if (c <= 0xFC && charset_kanji2)
4691 {
4692 /* SJIS -> JISX0213-2 */
4693 ONE_MORE_BYTE (c1);
4694 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4695 goto invalid_code;
4696 c = (c << 8) | c1;
4697 SJIS_TO_JIS2 (c);
4698 charset = charset_kanji2;
4699 }
4700 else
4701 goto invalid_code;
4702 if (charset->id != charset_ascii
4703 && last_id != charset->id)
4704 {
4705 if (last_id != charset_ascii)
4706 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4707 last_id = charset->id;
4708 last_offset = char_offset;
4709 }
4710 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4711 *charbuf++ = c;
4712 char_offset++;
4713 continue;
4714
4715 invalid_code:
4716 src = src_base;
4717 consumed_chars = consumed_chars_base;
4718 ONE_MORE_BYTE (c);
4719 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4720 char_offset++;
4721 coding->errors++;
4722 }
4723
4724 no_more_source:
4725 if (last_id != charset_ascii)
4726 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4727 coding->consumed_char += consumed_chars_base;
4728 coding->consumed = src_base - coding->source;
4729 coding->charbuf_used = charbuf - coding->charbuf;
4730 }
4731
4732 static void
4733 decode_coding_big5 (struct coding_system *coding)
4734 {
4735 const unsigned char *src = coding->source + coding->consumed;
4736 const unsigned char *src_end = coding->source + coding->src_bytes;
4737 const unsigned char *src_base;
4738 int *charbuf = coding->charbuf + coding->charbuf_used;
4739 /* We may produce one charset annotation in one loop and one more at
4740 the end. */
4741 int *charbuf_end
4742 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4743 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4744 bool multibytep = coding->src_multibyte;
4745 struct charset *charset_roman, *charset_big5;
4746 Lisp_Object attrs, charset_list, val;
4747 ptrdiff_t char_offset = coding->produced_char;
4748 ptrdiff_t last_offset = char_offset;
4749 int last_id = charset_ascii;
4750 bool eol_dos
4751 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4752 int byte_after_cr = -1;
4753
4754 CODING_GET_INFO (coding, attrs, charset_list);
4755 val = charset_list;
4756 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4757 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4758
4759 while (1)
4760 {
4761 int c, c1;
4762 struct charset *charset;
4763
4764 src_base = src;
4765 consumed_chars_base = consumed_chars;
4766
4767 if (charbuf >= charbuf_end)
4768 {
4769 if (byte_after_cr >= 0)
4770 src_base--;
4771 break;
4772 }
4773
4774 if (byte_after_cr >= 0)
4775 c = byte_after_cr, byte_after_cr = -1;
4776 else
4777 ONE_MORE_BYTE (c);
4778
4779 if (c < 0)
4780 goto invalid_code;
4781 if (c < 0x80)
4782 {
4783 if (eol_dos && c == '\r')
4784 ONE_MORE_BYTE (byte_after_cr);
4785 charset = charset_roman;
4786 }
4787 else
4788 {
4789 /* BIG5 -> Big5 */
4790 if (c < 0xA1 || c > 0xFE)
4791 goto invalid_code;
4792 ONE_MORE_BYTE (c1);
4793 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4794 goto invalid_code;
4795 c = c << 8 | c1;
4796 charset = charset_big5;
4797 }
4798 if (charset->id != charset_ascii
4799 && last_id != charset->id)
4800 {
4801 if (last_id != charset_ascii)
4802 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4803 last_id = charset->id;
4804 last_offset = char_offset;
4805 }
4806 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4807 *charbuf++ = c;
4808 char_offset++;
4809 continue;
4810
4811 invalid_code:
4812 src = src_base;
4813 consumed_chars = consumed_chars_base;
4814 ONE_MORE_BYTE (c);
4815 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4816 char_offset++;
4817 coding->errors++;
4818 }
4819
4820 no_more_source:
4821 if (last_id != charset_ascii)
4822 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4823 coding->consumed_char += consumed_chars_base;
4824 coding->consumed = src_base - coding->source;
4825 coding->charbuf_used = charbuf - coding->charbuf;
4826 }
4827
4828 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4829 This function can encode charsets `ascii', `katakana-jisx0201',
4830 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4831 are sure that all these charsets are registered as official charset
4832 (i.e. do not have extended leading-codes). Characters of other
4833 charsets are produced without any encoding. */
4834
4835 static bool
4836 encode_coding_sjis (struct coding_system *coding)
4837 {
4838 bool multibytep = coding->dst_multibyte;
4839 int *charbuf = coding->charbuf;
4840 int *charbuf_end = charbuf + coding->charbuf_used;
4841 unsigned char *dst = coding->destination + coding->produced;
4842 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4843 int safe_room = 4;
4844 ptrdiff_t produced_chars = 0;
4845 Lisp_Object attrs, charset_list, val;
4846 bool ascii_compatible;
4847 struct charset *charset_kanji, *charset_kana;
4848 struct charset *charset_kanji2;
4849 int c;
4850
4851 CODING_GET_INFO (coding, attrs, charset_list);
4852 val = XCDR (charset_list);
4853 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4855 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4856
4857 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4858
4859 while (charbuf < charbuf_end)
4860 {
4861 ASSURE_DESTINATION (safe_room);
4862 c = *charbuf++;
4863 /* Now encode the character C. */
4864 if (ASCII_CHAR_P (c) && ascii_compatible)
4865 EMIT_ONE_ASCII_BYTE (c);
4866 else if (CHAR_BYTE8_P (c))
4867 {
4868 c = CHAR_TO_BYTE8 (c);
4869 EMIT_ONE_BYTE (c);
4870 }
4871 else
4872 {
4873 unsigned code;
4874 struct charset *charset;
4875 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4876 &code, charset);
4877
4878 if (!charset)
4879 {
4880 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4881 {
4882 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4883 charset = CHARSET_FROM_ID (charset_ascii);
4884 }
4885 else
4886 {
4887 c = coding->default_char;
4888 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4889 charset_list, &code, charset);
4890 }
4891 }
4892 if (code == CHARSET_INVALID_CODE (charset))
4893 emacs_abort ();
4894 if (charset == charset_kanji)
4895 {
4896 int c1, c2;
4897 JIS_TO_SJIS (code);
4898 c1 = code >> 8, c2 = code & 0xFF;
4899 EMIT_TWO_BYTES (c1, c2);
4900 }
4901 else if (charset == charset_kana)
4902 EMIT_ONE_BYTE (code | 0x80);
4903 else if (charset_kanji2 && charset == charset_kanji2)
4904 {
4905 int c1, c2;
4906
4907 c1 = code >> 8;
4908 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4909 || c1 == 0x28
4910 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4911 {
4912 JIS_TO_SJIS2 (code);
4913 c1 = code >> 8, c2 = code & 0xFF;
4914 EMIT_TWO_BYTES (c1, c2);
4915 }
4916 else
4917 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4918 }
4919 else
4920 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4921 }
4922 }
4923 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4924 coding->produced_char += produced_chars;
4925 coding->produced = dst - coding->destination;
4926 return 0;
4927 }
4928
4929 static bool
4930 encode_coding_big5 (struct coding_system *coding)
4931 {
4932 bool multibytep = coding->dst_multibyte;
4933 int *charbuf = coding->charbuf;
4934 int *charbuf_end = charbuf + coding->charbuf_used;
4935 unsigned char *dst = coding->destination + coding->produced;
4936 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4937 int safe_room = 4;
4938 ptrdiff_t produced_chars = 0;
4939 Lisp_Object attrs, charset_list, val;
4940 bool ascii_compatible;
4941 struct charset *charset_big5;
4942 int c;
4943
4944 CODING_GET_INFO (coding, attrs, charset_list);
4945 val = XCDR (charset_list);
4946 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4947 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4948
4949 while (charbuf < charbuf_end)
4950 {
4951 ASSURE_DESTINATION (safe_room);
4952 c = *charbuf++;
4953 /* Now encode the character C. */
4954 if (ASCII_CHAR_P (c) && ascii_compatible)
4955 EMIT_ONE_ASCII_BYTE (c);
4956 else if (CHAR_BYTE8_P (c))
4957 {
4958 c = CHAR_TO_BYTE8 (c);
4959 EMIT_ONE_BYTE (c);
4960 }
4961 else
4962 {
4963 unsigned code;
4964 struct charset *charset;
4965 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4966 &code, charset);
4967
4968 if (! charset)
4969 {
4970 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4971 {
4972 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4973 charset = CHARSET_FROM_ID (charset_ascii);
4974 }
4975 else
4976 {
4977 c = coding->default_char;
4978 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4979 charset_list, &code, charset);
4980 }
4981 }
4982 if (code == CHARSET_INVALID_CODE (charset))
4983 emacs_abort ();
4984 if (charset == charset_big5)
4985 {
4986 int c1, c2;
4987
4988 c1 = code >> 8, c2 = code & 0xFF;
4989 EMIT_TWO_BYTES (c1, c2);
4990 }
4991 else
4992 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4993 }
4994 }
4995 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4996 coding->produced_char += produced_chars;
4997 coding->produced = dst - coding->destination;
4998 return 0;
4999 }
5000
5001 \f
5002 /*** 10. CCL handlers ***/
5003
5004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5005 Return true if a text is encoded in a coding system of which
5006 encoder/decoder are written in CCL program. */
5007
5008 static bool
5009 detect_coding_ccl (struct coding_system *coding,
5010 struct coding_detection_info *detect_info)
5011 {
5012 const unsigned char *src = coding->source, *src_base;
5013 const unsigned char *src_end = coding->source + coding->src_bytes;
5014 bool multibytep = coding->src_multibyte;
5015 ptrdiff_t consumed_chars = 0;
5016 int found = 0;
5017 unsigned char *valids;
5018 ptrdiff_t head_ascii = coding->head_ascii;
5019 Lisp_Object attrs;
5020
5021 detect_info->checked |= CATEGORY_MASK_CCL;
5022
5023 coding = &coding_categories[coding_category_ccl];
5024 valids = CODING_CCL_VALIDS (coding);
5025 attrs = CODING_ID_ATTRS (coding->id);
5026 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5027 src += head_ascii;
5028
5029 while (1)
5030 {
5031 int c;
5032
5033 src_base = src;
5034 ONE_MORE_BYTE (c);
5035 if (c < 0 || ! valids[c])
5036 break;
5037 if ((valids[c] > 1))
5038 found = CATEGORY_MASK_CCL;
5039 }
5040 detect_info->rejected |= CATEGORY_MASK_CCL;
5041 return 0;
5042
5043 no_more_source:
5044 detect_info->found |= found;
5045 return 1;
5046 }
5047
5048 static void
5049 decode_coding_ccl (struct coding_system *coding)
5050 {
5051 const unsigned char *src = coding->source + coding->consumed;
5052 const unsigned char *src_end = coding->source + coding->src_bytes;
5053 int *charbuf = coding->charbuf + coding->charbuf_used;
5054 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5055 ptrdiff_t consumed_chars = 0;
5056 bool multibytep = coding->src_multibyte;
5057 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5058 int source_charbuf[1024];
5059 int source_byteidx[1025];
5060 Lisp_Object attrs, charset_list;
5061
5062 CODING_GET_INFO (coding, attrs, charset_list);
5063
5064 while (1)
5065 {
5066 const unsigned char *p = src;
5067 ptrdiff_t offset;
5068 int i = 0;
5069
5070 if (multibytep)
5071 {
5072 while (i < 1024 && p < src_end)
5073 {
5074 source_byteidx[i] = p - src;
5075 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5076 }
5077 source_byteidx[i] = p - src;
5078 }
5079 else
5080 while (i < 1024 && p < src_end)
5081 source_charbuf[i++] = *p++;
5082
5083 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5084 ccl->last_block = 1;
5085 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5086 charset_map_loaded = 0;
5087 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5088 charset_list);
5089 if (charset_map_loaded
5090 && (offset = coding_change_source (coding)))
5091 {
5092 p += offset;
5093 src += offset;
5094 src_end += offset;
5095 }
5096 charbuf += ccl->produced;
5097 if (multibytep)
5098 src += source_byteidx[ccl->consumed];
5099 else
5100 src += ccl->consumed;
5101 consumed_chars += ccl->consumed;
5102 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5103 break;
5104 }
5105
5106 switch (ccl->status)
5107 {
5108 case CCL_STAT_SUSPEND_BY_SRC:
5109 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5110 break;
5111 case CCL_STAT_SUSPEND_BY_DST:
5112 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5113 break;
5114 case CCL_STAT_QUIT:
5115 case CCL_STAT_INVALID_CMD:
5116 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5117 break;
5118 default:
5119 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5120 break;
5121 }
5122 coding->consumed_char += consumed_chars;
5123 coding->consumed = src - coding->source;
5124 coding->charbuf_used = charbuf - coding->charbuf;
5125 }
5126
5127 static bool
5128 encode_coding_ccl (struct coding_system *coding)
5129 {
5130 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5131 bool multibytep = coding->dst_multibyte;
5132 int *charbuf = coding->charbuf;
5133 int *charbuf_end = charbuf + coding->charbuf_used;
5134 unsigned char *dst = coding->destination + coding->produced;
5135 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5136 int destination_charbuf[1024];
5137 ptrdiff_t produced_chars = 0;
5138 int i;
5139 Lisp_Object attrs, charset_list;
5140
5141 CODING_GET_INFO (coding, attrs, charset_list);
5142 if (coding->consumed_char == coding->src_chars
5143 && coding->mode & CODING_MODE_LAST_BLOCK)
5144 ccl->last_block = 1;
5145
5146 do
5147 {
5148 ptrdiff_t offset;
5149
5150 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5151 charset_map_loaded = 0;
5152 ccl_driver (ccl, charbuf, destination_charbuf,
5153 charbuf_end - charbuf, 1024, charset_list);
5154 if (charset_map_loaded
5155 && (offset = coding_change_destination (coding)))
5156 dst += offset;
5157 if (multibytep)
5158 {
5159 ASSURE_DESTINATION (ccl->produced * 2);
5160 for (i = 0; i < ccl->produced; i++)
5161 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5162 }
5163 else
5164 {
5165 ASSURE_DESTINATION (ccl->produced);
5166 for (i = 0; i < ccl->produced; i++)
5167 *dst++ = destination_charbuf[i] & 0xFF;
5168 produced_chars += ccl->produced;
5169 }
5170 charbuf += ccl->consumed;
5171 if (ccl->status == CCL_STAT_QUIT
5172 || ccl->status == CCL_STAT_INVALID_CMD)
5173 break;
5174 }
5175 while (charbuf < charbuf_end);
5176
5177 switch (ccl->status)
5178 {
5179 case CCL_STAT_SUSPEND_BY_SRC:
5180 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5181 break;
5182 case CCL_STAT_SUSPEND_BY_DST:
5183 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5184 break;
5185 case CCL_STAT_QUIT:
5186 case CCL_STAT_INVALID_CMD:
5187 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5188 break;
5189 default:
5190 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5191 break;
5192 }
5193
5194 coding->produced_char += produced_chars;
5195 coding->produced = dst - coding->destination;
5196 return 0;
5197 }
5198
5199 \f
5200 /*** 10, 11. no-conversion handlers ***/
5201
5202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5203
5204 static void
5205 decode_coding_raw_text (struct coding_system *coding)
5206 {
5207 bool eol_dos
5208 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5209
5210 coding->chars_at_source = 1;
5211 coding->consumed_char = coding->src_chars;
5212 coding->consumed = coding->src_bytes;
5213 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5214 {
5215 coding->consumed_char--;
5216 coding->consumed--;
5217 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5218 }
5219 else
5220 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5221 }
5222
5223 static bool
5224 encode_coding_raw_text (struct coding_system *coding)
5225 {
5226 bool multibytep = coding->dst_multibyte;
5227 int *charbuf = coding->charbuf;
5228 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5229 unsigned char *dst = coding->destination + coding->produced;
5230 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5231 ptrdiff_t produced_chars = 0;
5232 int c;
5233
5234 if (multibytep)
5235 {
5236 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5237
5238 if (coding->src_multibyte)
5239 while (charbuf < charbuf_end)
5240 {
5241 ASSURE_DESTINATION (safe_room);
5242 c = *charbuf++;
5243 if (ASCII_CHAR_P (c))
5244 EMIT_ONE_ASCII_BYTE (c);
5245 else if (CHAR_BYTE8_P (c))
5246 {
5247 c = CHAR_TO_BYTE8 (c);
5248 EMIT_ONE_BYTE (c);
5249 }
5250 else
5251 {
5252 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5253
5254 CHAR_STRING_ADVANCE (c, p1);
5255 do
5256 {
5257 EMIT_ONE_BYTE (*p0);
5258 p0++;
5259 }
5260 while (p0 < p1);
5261 }
5262 }
5263 else
5264 while (charbuf < charbuf_end)
5265 {
5266 ASSURE_DESTINATION (safe_room);
5267 c = *charbuf++;
5268 EMIT_ONE_BYTE (c);
5269 }
5270 }
5271 else
5272 {
5273 if (coding->src_multibyte)
5274 {
5275 int safe_room = MAX_MULTIBYTE_LENGTH;
5276
5277 while (charbuf < charbuf_end)
5278 {
5279 ASSURE_DESTINATION (safe_room);
5280 c = *charbuf++;
5281 if (ASCII_CHAR_P (c))
5282 *dst++ = c;
5283 else if (CHAR_BYTE8_P (c))
5284 *dst++ = CHAR_TO_BYTE8 (c);
5285 else
5286 CHAR_STRING_ADVANCE (c, dst);
5287 }
5288 }
5289 else
5290 {
5291 ASSURE_DESTINATION (charbuf_end - charbuf);
5292 while (charbuf < charbuf_end && dst < dst_end)
5293 *dst++ = *charbuf++;
5294 }
5295 produced_chars = dst - (coding->destination + coding->produced);
5296 }
5297 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5298 coding->produced_char += produced_chars;
5299 coding->produced = dst - coding->destination;
5300 return 0;
5301 }
5302
5303 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5304 Return true if a text is encoded in a charset-based coding system. */
5305
5306 static bool
5307 detect_coding_charset (struct coding_system *coding,
5308 struct coding_detection_info *detect_info)
5309 {
5310 const unsigned char *src = coding->source, *src_base;
5311 const unsigned char *src_end = coding->source + coding->src_bytes;
5312 bool multibytep = coding->src_multibyte;
5313 ptrdiff_t consumed_chars = 0;
5314 Lisp_Object attrs, valids, name;
5315 int found = 0;
5316 ptrdiff_t head_ascii = coding->head_ascii;
5317 bool check_latin_extra = 0;
5318
5319 detect_info->checked |= CATEGORY_MASK_CHARSET;
5320
5321 coding = &coding_categories[coding_category_charset];
5322 attrs = CODING_ID_ATTRS (coding->id);
5323 valids = AREF (attrs, coding_attr_charset_valids);
5324 name = CODING_ID_NAME (coding->id);
5325 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5326 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5327 || strncmp (SSDATA (SYMBOL_NAME (name)),
5328 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5329 check_latin_extra = 1;
5330
5331 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5332 src += head_ascii;
5333
5334 while (1)
5335 {
5336 int c;
5337 Lisp_Object val;
5338 struct charset *charset;
5339 int dim, idx;
5340
5341 src_base = src;
5342 ONE_MORE_BYTE (c);
5343 if (c < 0)
5344 continue;
5345 val = AREF (valids, c);
5346 if (NILP (val))
5347 break;
5348 if (c >= 0x80)
5349 {
5350 if (c < 0xA0
5351 && check_latin_extra
5352 && (!VECTORP (Vlatin_extra_code_table)
5353 || NILP (AREF (Vlatin_extra_code_table, c))))
5354 break;
5355 found = CATEGORY_MASK_CHARSET;
5356 }
5357 if (INTEGERP (val))
5358 {
5359 charset = CHARSET_FROM_ID (XFASTINT (val));
5360 dim = CHARSET_DIMENSION (charset);
5361 for (idx = 1; idx < dim; idx++)
5362 {
5363 if (src == src_end)
5364 goto too_short;
5365 ONE_MORE_BYTE (c);
5366 if (c < charset->code_space[(dim - 1 - idx) * 4]
5367 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5368 break;
5369 }
5370 if (idx < dim)
5371 break;
5372 }
5373 else
5374 {
5375 idx = 1;
5376 for (; CONSP (val); val = XCDR (val))
5377 {
5378 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5379 dim = CHARSET_DIMENSION (charset);
5380 while (idx < dim)
5381 {
5382 if (src == src_end)
5383 goto too_short;
5384 ONE_MORE_BYTE (c);
5385 if (c < charset->code_space[(dim - 1 - idx) * 4]
5386 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5387 break;
5388 idx++;
5389 }
5390 if (idx == dim)
5391 {
5392 val = Qnil;
5393 break;
5394 }
5395 }
5396 if (CONSP (val))
5397 break;
5398 }
5399 }
5400 too_short:
5401 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5402 return 0;
5403
5404 no_more_source:
5405 detect_info->found |= found;
5406 return 1;
5407 }
5408
5409 static void
5410 decode_coding_charset (struct coding_system *coding)
5411 {
5412 const unsigned char *src = coding->source + coding->consumed;
5413 const unsigned char *src_end = coding->source + coding->src_bytes;
5414 const unsigned char *src_base;
5415 int *charbuf = coding->charbuf + coding->charbuf_used;
5416 /* We may produce one charset annotation in one loop and one more at
5417 the end. */
5418 int *charbuf_end
5419 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5420 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5421 bool multibytep = coding->src_multibyte;
5422 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5423 Lisp_Object valids;
5424 ptrdiff_t char_offset = coding->produced_char;
5425 ptrdiff_t last_offset = char_offset;
5426 int last_id = charset_ascii;
5427 bool eol_dos
5428 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5429 int byte_after_cr = -1;
5430
5431 valids = AREF (attrs, coding_attr_charset_valids);
5432
5433 while (1)
5434 {
5435 int c;
5436 Lisp_Object val;
5437 struct charset *charset;
5438 int dim;
5439 int len = 1;
5440 unsigned code;
5441
5442 src_base = src;
5443 consumed_chars_base = consumed_chars;
5444
5445 if (charbuf >= charbuf_end)
5446 {
5447 if (byte_after_cr >= 0)
5448 src_base--;
5449 break;
5450 }
5451
5452 if (byte_after_cr >= 0)
5453 {
5454 c = byte_after_cr;
5455 byte_after_cr = -1;
5456 }
5457 else
5458 {
5459 ONE_MORE_BYTE (c);
5460 if (eol_dos && c == '\r')
5461 ONE_MORE_BYTE (byte_after_cr);
5462 }
5463 if (c < 0)
5464 goto invalid_code;
5465 code = c;
5466
5467 val = AREF (valids, c);
5468 if (! INTEGERP (val) && ! CONSP (val))
5469 goto invalid_code;
5470 if (INTEGERP (val))
5471 {
5472 charset = CHARSET_FROM_ID (XFASTINT (val));
5473 dim = CHARSET_DIMENSION (charset);
5474 while (len < dim)
5475 {
5476 ONE_MORE_BYTE (c);
5477 code = (code << 8) | c;
5478 len++;
5479 }
5480 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5481 charset, code, c);
5482 }
5483 else
5484 {
5485 /* VAL is a list of charset IDs. It is assured that the
5486 list is sorted by charset dimensions (smaller one
5487 comes first). */
5488 while (CONSP (val))
5489 {
5490 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5491 dim = CHARSET_DIMENSION (charset);
5492 while (len < dim)
5493 {
5494 ONE_MORE_BYTE (c);
5495 code = (code << 8) | c;
5496 len++;
5497 }
5498 CODING_DECODE_CHAR (coding, src, src_base,
5499 src_end, charset, code, c);
5500 if (c >= 0)
5501 break;
5502 val = XCDR (val);
5503 }
5504 }
5505 if (c < 0)
5506 goto invalid_code;
5507 if (charset->id != charset_ascii
5508 && last_id != charset->id)
5509 {
5510 if (last_id != charset_ascii)
5511 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5512 last_id = charset->id;
5513 last_offset = char_offset;
5514 }
5515
5516 *charbuf++ = c;
5517 char_offset++;
5518 continue;
5519
5520 invalid_code:
5521 src = src_base;
5522 consumed_chars = consumed_chars_base;
5523 ONE_MORE_BYTE (c);
5524 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5525 char_offset++;
5526 coding->errors++;
5527 }
5528
5529 no_more_source:
5530 if (last_id != charset_ascii)
5531 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5532 coding->consumed_char += consumed_chars_base;
5533 coding->consumed = src_base - coding->source;
5534 coding->charbuf_used = charbuf - coding->charbuf;
5535 }
5536
5537 static bool
5538 encode_coding_charset (struct coding_system *coding)
5539 {
5540 bool multibytep = coding->dst_multibyte;
5541 int *charbuf = coding->charbuf;
5542 int *charbuf_end = charbuf + coding->charbuf_used;
5543 unsigned char *dst = coding->destination + coding->produced;
5544 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5545 int safe_room = MAX_MULTIBYTE_LENGTH;
5546 ptrdiff_t produced_chars = 0;
5547 Lisp_Object attrs, charset_list;
5548 bool ascii_compatible;
5549 int c;
5550
5551 CODING_GET_INFO (coding, attrs, charset_list);
5552 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5553
5554 while (charbuf < charbuf_end)
5555 {
5556 struct charset *charset;
5557 unsigned code;
5558
5559 ASSURE_DESTINATION (safe_room);
5560 c = *charbuf++;
5561 if (ascii_compatible && ASCII_CHAR_P (c))
5562 EMIT_ONE_ASCII_BYTE (c);
5563 else if (CHAR_BYTE8_P (c))
5564 {
5565 c = CHAR_TO_BYTE8 (c);
5566 EMIT_ONE_BYTE (c);
5567 }
5568 else
5569 {
5570 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5571 &code, charset);
5572
5573 if (charset)
5574 {
5575 if (CHARSET_DIMENSION (charset) == 1)
5576 EMIT_ONE_BYTE (code);
5577 else if (CHARSET_DIMENSION (charset) == 2)
5578 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5579 else if (CHARSET_DIMENSION (charset) == 3)
5580 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5581 else
5582 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5583 (code >> 8) & 0xFF, code & 0xFF);
5584 }
5585 else
5586 {
5587 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5588 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5589 else
5590 c = coding->default_char;
5591 EMIT_ONE_BYTE (c);
5592 }
5593 }
5594 }
5595
5596 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5597 coding->produced_char += produced_chars;
5598 coding->produced = dst - coding->destination;
5599 return 0;
5600 }
5601
5602 \f
5603 /*** 7. C library functions ***/
5604
5605 /* Setup coding context CODING from information about CODING_SYSTEM.
5606 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5607 CODING_SYSTEM is invalid, signal an error. */
5608
5609 void
5610 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5611 {
5612 Lisp_Object attrs;
5613 Lisp_Object eol_type;
5614 Lisp_Object coding_type;
5615 Lisp_Object val;
5616
5617 if (NILP (coding_system))
5618 coding_system = Qundecided;
5619
5620 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5621
5622 attrs = CODING_ID_ATTRS (coding->id);
5623 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5624
5625 coding->mode = 0;
5626 coding->head_ascii = -1;
5627 if (VECTORP (eol_type))
5628 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5629 | CODING_REQUIRE_DETECTION_MASK);
5630 else if (! EQ (eol_type, Qunix))
5631 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5632 | CODING_REQUIRE_ENCODING_MASK);
5633 else
5634 coding->common_flags = 0;
5635 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5636 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5637 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5638 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5639 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5640 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5641
5642 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5643 coding->max_charset_id = SCHARS (val) - 1;
5644 coding->safe_charsets = SDATA (val);
5645 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5646 coding->carryover_bytes = 0;
5647
5648 coding_type = CODING_ATTR_TYPE (attrs);
5649 if (EQ (coding_type, Qundecided))
5650 {
5651 coding->detector = NULL;
5652 coding->decoder = decode_coding_raw_text;
5653 coding->encoder = encode_coding_raw_text;
5654 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5655 }
5656 else if (EQ (coding_type, Qiso_2022))
5657 {
5658 int i;
5659 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5660
5661 /* Invoke graphic register 0 to plane 0. */
5662 CODING_ISO_INVOCATION (coding, 0) = 0;
5663 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5664 CODING_ISO_INVOCATION (coding, 1)
5665 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5666 /* Setup the initial status of designation. */
5667 for (i = 0; i < 4; i++)
5668 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5669 /* Not single shifting initially. */
5670 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5671 /* Beginning of buffer should also be regarded as bol. */
5672 CODING_ISO_BOL (coding) = 1;
5673 coding->detector = detect_coding_iso_2022;
5674 coding->decoder = decode_coding_iso_2022;
5675 coding->encoder = encode_coding_iso_2022;
5676 if (flags & CODING_ISO_FLAG_SAFE)
5677 coding->mode |= CODING_MODE_SAFE_ENCODING;
5678 coding->common_flags
5679 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5680 | CODING_REQUIRE_FLUSHING_MASK);
5681 if (flags & CODING_ISO_FLAG_COMPOSITION)
5682 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5683 if (flags & CODING_ISO_FLAG_DESIGNATION)
5684 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5685 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5686 {
5687 setup_iso_safe_charsets (attrs);
5688 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5689 coding->max_charset_id = SCHARS (val) - 1;
5690 coding->safe_charsets = SDATA (val);
5691 }
5692 CODING_ISO_FLAGS (coding) = flags;
5693 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5694 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5695 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5696 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5697 }
5698 else if (EQ (coding_type, Qcharset))
5699 {
5700 coding->detector = detect_coding_charset;
5701 coding->decoder = decode_coding_charset;
5702 coding->encoder = encode_coding_charset;
5703 coding->common_flags
5704 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5705 }
5706 else if (EQ (coding_type, Qutf_8))
5707 {
5708 val = AREF (attrs, coding_attr_utf_bom);
5709 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5710 : EQ (val, Qt) ? utf_with_bom
5711 : utf_without_bom);
5712 coding->detector = detect_coding_utf_8;
5713 coding->decoder = decode_coding_utf_8;
5714 coding->encoder = encode_coding_utf_8;
5715 coding->common_flags
5716 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5717 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5718 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5719 }
5720 else if (EQ (coding_type, Qutf_16))
5721 {
5722 val = AREF (attrs, coding_attr_utf_bom);
5723 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5724 : EQ (val, Qt) ? utf_with_bom
5725 : utf_without_bom);
5726 val = AREF (attrs, coding_attr_utf_16_endian);
5727 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5728 : utf_16_little_endian);
5729 CODING_UTF_16_SURROGATE (coding) = 0;
5730 coding->detector = detect_coding_utf_16;
5731 coding->decoder = decode_coding_utf_16;
5732 coding->encoder = encode_coding_utf_16;
5733 coding->common_flags
5734 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5735 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5736 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5737 }
5738 else if (EQ (coding_type, Qccl))
5739 {
5740 coding->detector = detect_coding_ccl;
5741 coding->decoder = decode_coding_ccl;
5742 coding->encoder = encode_coding_ccl;
5743 coding->common_flags
5744 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5745 | CODING_REQUIRE_FLUSHING_MASK);
5746 }
5747 else if (EQ (coding_type, Qemacs_mule))
5748 {
5749 coding->detector = detect_coding_emacs_mule;
5750 coding->decoder = decode_coding_emacs_mule;
5751 coding->encoder = encode_coding_emacs_mule;
5752 coding->common_flags
5753 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5754 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5755 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5756 {
5757 Lisp_Object tail, safe_charsets;
5758 int max_charset_id = 0;
5759
5760 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5761 tail = XCDR (tail))
5762 if (max_charset_id < XFASTINT (XCAR (tail)))
5763 max_charset_id = XFASTINT (XCAR (tail));
5764 safe_charsets = make_uninit_string (max_charset_id + 1);
5765 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5766 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5767 tail = XCDR (tail))
5768 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5769 coding->max_charset_id = max_charset_id;
5770 coding->safe_charsets = SDATA (safe_charsets);
5771 }
5772 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5773 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5774 }
5775 else if (EQ (coding_type, Qshift_jis))
5776 {
5777 coding->detector = detect_coding_sjis;
5778 coding->decoder = decode_coding_sjis;
5779 coding->encoder = encode_coding_sjis;
5780 coding->common_flags
5781 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5782 }
5783 else if (EQ (coding_type, Qbig5))
5784 {
5785 coding->detector = detect_coding_big5;
5786 coding->decoder = decode_coding_big5;
5787 coding->encoder = encode_coding_big5;
5788 coding->common_flags
5789 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5790 }
5791 else /* EQ (coding_type, Qraw_text) */
5792 {
5793 coding->detector = NULL;
5794 coding->decoder = decode_coding_raw_text;
5795 coding->encoder = encode_coding_raw_text;
5796 if (! EQ (eol_type, Qunix))
5797 {
5798 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5799 if (! VECTORP (eol_type))
5800 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5801 }
5802
5803 }
5804
5805 return;
5806 }
5807
5808 /* Return a list of charsets supported by CODING. */
5809
5810 Lisp_Object
5811 coding_charset_list (struct coding_system *coding)
5812 {
5813 Lisp_Object attrs, charset_list;
5814
5815 CODING_GET_INFO (coding, attrs, charset_list);
5816 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5817 {
5818 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5819
5820 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5821 charset_list = Viso_2022_charset_list;
5822 }
5823 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5824 {
5825 charset_list = Vemacs_mule_charset_list;
5826 }
5827 return charset_list;
5828 }
5829
5830
5831 /* Return a list of charsets supported by CODING-SYSTEM. */
5832
5833 Lisp_Object
5834 coding_system_charset_list (Lisp_Object coding_system)
5835 {
5836 ptrdiff_t id;
5837 Lisp_Object attrs, charset_list;
5838
5839 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5840 attrs = CODING_ID_ATTRS (id);
5841
5842 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5843 {
5844 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5845
5846 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847 charset_list = Viso_2022_charset_list;
5848 else
5849 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5850 }
5851 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5852 {
5853 charset_list = Vemacs_mule_charset_list;
5854 }
5855 else
5856 {
5857 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5858 }
5859 return charset_list;
5860 }
5861
5862
5863 /* Return raw-text or one of its subsidiaries that has the same
5864 eol_type as CODING-SYSTEM. */
5865
5866 Lisp_Object
5867 raw_text_coding_system (Lisp_Object coding_system)
5868 {
5869 Lisp_Object spec, attrs;
5870 Lisp_Object eol_type, raw_text_eol_type;
5871
5872 if (NILP (coding_system))
5873 return Qraw_text;
5874 spec = CODING_SYSTEM_SPEC (coding_system);
5875 attrs = AREF (spec, 0);
5876
5877 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5878 return coding_system;
5879
5880 eol_type = AREF (spec, 2);
5881 if (VECTORP (eol_type))
5882 return Qraw_text;
5883 spec = CODING_SYSTEM_SPEC (Qraw_text);
5884 raw_text_eol_type = AREF (spec, 2);
5885 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5886 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5887 : AREF (raw_text_eol_type, 2));
5888 }
5889
5890
5891 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5892 the subsidiary that has the same eol-spec as PARENT (if it is not
5893 nil and specifies end-of-line format) or the system's setting
5894 (system_eol_type). */
5895
5896 Lisp_Object
5897 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5898 {
5899 Lisp_Object spec, eol_type;
5900
5901 if (NILP (coding_system))
5902 coding_system = Qraw_text;
5903 spec = CODING_SYSTEM_SPEC (coding_system);
5904 eol_type = AREF (spec, 2);
5905 if (VECTORP (eol_type))
5906 {
5907 Lisp_Object parent_eol_type;
5908
5909 if (! NILP (parent))
5910 {
5911 Lisp_Object parent_spec;
5912
5913 parent_spec = CODING_SYSTEM_SPEC (parent);
5914 parent_eol_type = AREF (parent_spec, 2);
5915 if (VECTORP (parent_eol_type))
5916 parent_eol_type = system_eol_type;
5917 }
5918 else
5919 parent_eol_type = system_eol_type;
5920 if (EQ (parent_eol_type, Qunix))
5921 coding_system = AREF (eol_type, 0);
5922 else if (EQ (parent_eol_type, Qdos))
5923 coding_system = AREF (eol_type, 1);
5924 else if (EQ (parent_eol_type, Qmac))
5925 coding_system = AREF (eol_type, 2);
5926 }
5927 return coding_system;
5928 }
5929
5930
5931 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5932 decided for writing to a process. If not, complement them, and
5933 return a new coding system. */
5934
5935 Lisp_Object
5936 complement_process_encoding_system (Lisp_Object coding_system)
5937 {
5938 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5939 Lisp_Object spec, attrs;
5940 int i;
5941
5942 for (i = 0; i < 3; i++)
5943 {
5944 if (i == 1)
5945 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5946 else if (i == 2)
5947 coding_system = preferred_coding_system ();
5948 spec = CODING_SYSTEM_SPEC (coding_system);
5949 if (NILP (spec))
5950 continue;
5951 attrs = AREF (spec, 0);
5952 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5953 coding_base = CODING_ATTR_BASE_NAME (attrs);
5954 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5955 eol_base = coding_system;
5956 if (! NILP (coding_base) && ! NILP (eol_base))
5957 break;
5958 }
5959
5960 if (i > 0)
5961 /* The original CODING_SYSTEM didn't specify text-conversion or
5962 eol-conversion. Be sure that we return a fully complemented
5963 coding system. */
5964 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5965 return coding_system;
5966 }
5967
5968
5969 /* Emacs has a mechanism to automatically detect a coding system if it
5970 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5971 it's impossible to distinguish some coding systems accurately
5972 because they use the same range of codes. So, at first, coding
5973 systems are categorized into 7, those are:
5974
5975 o coding-category-emacs-mule
5976
5977 The category for a coding system which has the same code range
5978 as Emacs' internal format. Assigned the coding-system (Lisp
5979 symbol) `emacs-mule' by default.
5980
5981 o coding-category-sjis
5982
5983 The category for a coding system which has the same code range
5984 as SJIS. Assigned the coding-system (Lisp
5985 symbol) `japanese-shift-jis' by default.
5986
5987 o coding-category-iso-7
5988
5989 The category for a coding system which has the same code range
5990 as ISO2022 of 7-bit environment. This doesn't use any locking
5991 shift and single shift functions. This can encode/decode all
5992 charsets. Assigned the coding-system (Lisp symbol)
5993 `iso-2022-7bit' by default.
5994
5995 o coding-category-iso-7-tight
5996
5997 Same as coding-category-iso-7 except that this can
5998 encode/decode only the specified charsets.
5999
6000 o coding-category-iso-8-1
6001
6002 The category for a coding system which has the same code range
6003 as ISO2022 of 8-bit environment and graphic plane 1 used only
6004 for DIMENSION1 charset. This doesn't use any locking shift
6005 and single shift functions. Assigned the coding-system (Lisp
6006 symbol) `iso-latin-1' by default.
6007
6008 o coding-category-iso-8-2
6009
6010 The category for a coding system which has the same code range
6011 as ISO2022 of 8-bit environment and graphic plane 1 used only
6012 for DIMENSION2 charset. This doesn't use any locking shift
6013 and single shift functions. Assigned the coding-system (Lisp
6014 symbol) `japanese-iso-8bit' by default.
6015
6016 o coding-category-iso-7-else
6017
6018 The category for a coding system which has the same code range
6019 as ISO2022 of 7-bit environment but uses locking shift or
6020 single shift functions. Assigned the coding-system (Lisp
6021 symbol) `iso-2022-7bit-lock' by default.
6022
6023 o coding-category-iso-8-else
6024
6025 The category for a coding system which has the same code range
6026 as ISO2022 of 8-bit environment but uses locking shift or
6027 single shift functions. Assigned the coding-system (Lisp
6028 symbol) `iso-2022-8bit-ss2' by default.
6029
6030 o coding-category-big5
6031
6032 The category for a coding system which has the same code range
6033 as BIG5. Assigned the coding-system (Lisp symbol)
6034 `cn-big5' by default.
6035
6036 o coding-category-utf-8
6037
6038 The category for a coding system which has the same code range
6039 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6040 symbol) `utf-8' by default.
6041
6042 o coding-category-utf-16-be
6043
6044 The category for a coding system in which a text has an
6045 Unicode signature (cf. Unicode Standard) in the order of BIG
6046 endian at the head. Assigned the coding-system (Lisp symbol)
6047 `utf-16-be' by default.
6048
6049 o coding-category-utf-16-le
6050
6051 The category for a coding system in which a text has an
6052 Unicode signature (cf. Unicode Standard) in the order of
6053 LITTLE endian at the head. Assigned the coding-system (Lisp
6054 symbol) `utf-16-le' by default.
6055
6056 o coding-category-ccl
6057
6058 The category for a coding system of which encoder/decoder is
6059 written in CCL programs. The default value is nil, i.e., no
6060 coding system is assigned.
6061
6062 o coding-category-binary
6063
6064 The category for a coding system not categorized in any of the
6065 above. Assigned the coding-system (Lisp symbol)
6066 `no-conversion' by default.
6067
6068 Each of them is a Lisp symbol and the value is an actual
6069 `coding-system's (this is also a Lisp symbol) assigned by a user.
6070 What Emacs does actually is to detect a category of coding system.
6071 Then, it uses a `coding-system' assigned to it. If Emacs can't
6072 decide only one possible category, it selects a category of the
6073 highest priority. Priorities of categories are also specified by a
6074 user in a Lisp variable `coding-category-list'.
6075
6076 */
6077
6078 #define EOL_SEEN_NONE 0
6079 #define EOL_SEEN_LF 1
6080 #define EOL_SEEN_CR 2
6081 #define EOL_SEEN_CRLF 4
6082
6083 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6084 SOURCE is encoded. If CATEGORY is one of
6085 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6086 two-byte, else they are encoded by one-byte.
6087
6088 Return one of EOL_SEEN_XXX. */
6089
6090 #define MAX_EOL_CHECK_COUNT 3
6091
6092 static int
6093 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6094 enum coding_category category)
6095 {
6096 const unsigned char *src = source, *src_end = src + src_bytes;
6097 unsigned char c;
6098 int total = 0;
6099 int eol_seen = EOL_SEEN_NONE;
6100
6101 if ((1 << category) & CATEGORY_MASK_UTF_16)
6102 {
6103 bool msb = category == (coding_category_utf_16_le
6104 | coding_category_utf_16_le_nosig);
6105 bool lsb = !msb;
6106
6107 while (src + 1 < src_end)
6108 {
6109 c = src[lsb];
6110 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6111 {
6112 int this_eol;
6113
6114 if (c == '\n')
6115 this_eol = EOL_SEEN_LF;
6116 else if (src + 3 >= src_end
6117 || src[msb + 2] != 0
6118 || src[lsb + 2] != '\n')
6119 this_eol = EOL_SEEN_CR;
6120 else
6121 {
6122 this_eol = EOL_SEEN_CRLF;
6123 src += 2;
6124 }
6125
6126 if (eol_seen == EOL_SEEN_NONE)
6127 /* This is the first end-of-line. */
6128 eol_seen = this_eol;
6129 else if (eol_seen != this_eol)
6130 {
6131 /* The found type is different from what found before.
6132 Allow for stray ^M characters in DOS EOL files. */
6133 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6134 || (eol_seen == EOL_SEEN_CRLF
6135 && this_eol == EOL_SEEN_CR))
6136 eol_seen = EOL_SEEN_CRLF;
6137 else
6138 {
6139 eol_seen = EOL_SEEN_LF;
6140 break;
6141 }
6142 }
6143 if (++total == MAX_EOL_CHECK_COUNT)
6144 break;
6145 }
6146 src += 2;
6147 }
6148 }
6149 else
6150 while (src < src_end)
6151 {
6152 c = *src++;
6153 if (c == '\n' || c == '\r')
6154 {
6155 int this_eol;
6156
6157 if (c == '\n')
6158 this_eol = EOL_SEEN_LF;
6159 else if (src >= src_end || *src != '\n')
6160 this_eol = EOL_SEEN_CR;
6161 else
6162 this_eol = EOL_SEEN_CRLF, src++;
6163
6164 if (eol_seen == EOL_SEEN_NONE)
6165 /* This is the first end-of-line. */
6166 eol_seen = this_eol;
6167 else if (eol_seen != this_eol)
6168 {
6169 /* The found type is different from what found before.
6170 Allow for stray ^M characters in DOS EOL files. */
6171 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6172 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6173 eol_seen = EOL_SEEN_CRLF;
6174 else
6175 {
6176 eol_seen = EOL_SEEN_LF;
6177 break;
6178 }
6179 }
6180 if (++total == MAX_EOL_CHECK_COUNT)
6181 break;
6182 }
6183 }
6184 return eol_seen;
6185 }
6186
6187
6188 static Lisp_Object
6189 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6190 {
6191 Lisp_Object eol_type;
6192
6193 eol_type = CODING_ID_EOL_TYPE (coding->id);
6194 if (eol_seen & EOL_SEEN_LF)
6195 {
6196 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6197 eol_type = Qunix;
6198 }
6199 else if (eol_seen & EOL_SEEN_CRLF)
6200 {
6201 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6202 eol_type = Qdos;
6203 }
6204 else if (eol_seen & EOL_SEEN_CR)
6205 {
6206 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6207 eol_type = Qmac;
6208 }
6209 return eol_type;
6210 }
6211
6212 /* Detect how a text specified in CODING is encoded. If a coding
6213 system is detected, update fields of CODING by the detected coding
6214 system. */
6215
6216 static void
6217 detect_coding (struct coding_system *coding)
6218 {
6219 const unsigned char *src, *src_end;
6220 unsigned int saved_mode = coding->mode;
6221
6222 coding->consumed = coding->consumed_char = 0;
6223 coding->produced = coding->produced_char = 0;
6224 coding_set_source (coding);
6225
6226 src_end = coding->source + coding->src_bytes;
6227 coding->head_ascii = 0;
6228
6229 /* If we have not yet decided the text encoding type, detect it
6230 now. */
6231 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6232 {
6233 int c, i;
6234 struct coding_detection_info detect_info;
6235 bool null_byte_found = 0, eight_bit_found = 0;
6236
6237 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6238 for (src = coding->source; src < src_end; src++)
6239 {
6240 c = *src;
6241 if (c & 0x80)
6242 {
6243 eight_bit_found = 1;
6244 if (null_byte_found)
6245 break;
6246 }
6247 else if (c < 0x20)
6248 {
6249 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6250 && ! inhibit_iso_escape_detection
6251 && ! detect_info.checked)
6252 {
6253 if (detect_coding_iso_2022 (coding, &detect_info))
6254 {
6255 /* We have scanned the whole data. */
6256 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6257 {
6258 /* We didn't find an 8-bit code. We may
6259 have found a null-byte, but it's very
6260 rare that a binary file conforms to
6261 ISO-2022. */
6262 src = src_end;
6263 coding->head_ascii = src - coding->source;
6264 }
6265 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6266 break;
6267 }
6268 }
6269 else if (! c && !inhibit_null_byte_detection)
6270 {
6271 null_byte_found = 1;
6272 if (eight_bit_found)
6273 break;
6274 }
6275 if (! eight_bit_found)
6276 coding->head_ascii++;
6277 }
6278 else if (! eight_bit_found)
6279 coding->head_ascii++;
6280 }
6281
6282 if (null_byte_found || eight_bit_found
6283 || coding->head_ascii < coding->src_bytes
6284 || detect_info.found)
6285 {
6286 enum coding_category category;
6287 struct coding_system *this;
6288
6289 if (coding->head_ascii == coding->src_bytes)
6290 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6291 for (i = 0; i < coding_category_raw_text; i++)
6292 {
6293 category = coding_priorities[i];
6294 this = coding_categories + category;
6295 if (detect_info.found & (1 << category))
6296 break;
6297 }
6298 else
6299 {
6300 if (null_byte_found)
6301 {
6302 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6303 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6304 }
6305 for (i = 0; i < coding_category_raw_text; i++)
6306 {
6307 category = coding_priorities[i];
6308 this = coding_categories + category;
6309 /* Some of this->detector (e.g. detect_coding_sjis)
6310 require this information. */
6311 coding->id = this->id;
6312 if (this->id < 0)
6313 {
6314 /* No coding system of this category is defined. */
6315 detect_info.rejected |= (1 << category);
6316 }
6317 else if (category >= coding_category_raw_text)
6318 continue;
6319 else if (detect_info.checked & (1 << category))
6320 {
6321 if (detect_info.found & (1 << category))
6322 break;
6323 }
6324 else if ((*(this->detector)) (coding, &detect_info)
6325 && detect_info.found & (1 << category))
6326 {
6327 if (category == coding_category_utf_16_auto)
6328 {
6329 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6330 category = coding_category_utf_16_le;
6331 else
6332 category = coding_category_utf_16_be;
6333 }
6334 break;
6335 }
6336 }
6337 }
6338
6339 if (i < coding_category_raw_text)
6340 setup_coding_system (CODING_ID_NAME (this->id), coding);
6341 else if (null_byte_found)
6342 setup_coding_system (Qno_conversion, coding);
6343 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6344 == CATEGORY_MASK_ANY)
6345 setup_coding_system (Qraw_text, coding);
6346 else if (detect_info.rejected)
6347 for (i = 0; i < coding_category_raw_text; i++)
6348 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6349 {
6350 this = coding_categories + coding_priorities[i];
6351 setup_coding_system (CODING_ID_NAME (this->id), coding);
6352 break;
6353 }
6354 }
6355 }
6356 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6357 == coding_category_utf_8_auto)
6358 {
6359 Lisp_Object coding_systems;
6360 struct coding_detection_info detect_info;
6361
6362 coding_systems
6363 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6364 detect_info.found = detect_info.rejected = 0;
6365 coding->head_ascii = 0;
6366 if (CONSP (coding_systems)
6367 && detect_coding_utf_8 (coding, &detect_info))
6368 {
6369 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6370 setup_coding_system (XCAR (coding_systems), coding);
6371 else
6372 setup_coding_system (XCDR (coding_systems), coding);
6373 }
6374 }
6375 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6376 == coding_category_utf_16_auto)
6377 {
6378 Lisp_Object coding_systems;
6379 struct coding_detection_info detect_info;
6380
6381 coding_systems
6382 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6383 detect_info.found = detect_info.rejected = 0;
6384 coding->head_ascii = 0;
6385 if (CONSP (coding_systems)
6386 && detect_coding_utf_16 (coding, &detect_info))
6387 {
6388 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6389 setup_coding_system (XCAR (coding_systems), coding);
6390 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6391 setup_coding_system (XCDR (coding_systems), coding);
6392 }
6393 }
6394 coding->mode = saved_mode;
6395 }
6396
6397
6398 static void
6399 decode_eol (struct coding_system *coding)
6400 {
6401 Lisp_Object eol_type;
6402 unsigned char *p, *pbeg, *pend;
6403
6404 eol_type = CODING_ID_EOL_TYPE (coding->id);
6405 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6406 return;
6407
6408 if (NILP (coding->dst_object))
6409 pbeg = coding->destination;
6410 else
6411 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6412 pend = pbeg + coding->produced;
6413
6414 if (VECTORP (eol_type))
6415 {
6416 int eol_seen = EOL_SEEN_NONE;
6417
6418 for (p = pbeg; p < pend; p++)
6419 {
6420 if (*p == '\n')
6421 eol_seen |= EOL_SEEN_LF;
6422 else if (*p == '\r')
6423 {
6424 if (p + 1 < pend && *(p + 1) == '\n')
6425 {
6426 eol_seen |= EOL_SEEN_CRLF;
6427 p++;
6428 }
6429 else
6430 eol_seen |= EOL_SEEN_CR;
6431 }
6432 }
6433 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6434 if ((eol_seen & EOL_SEEN_CRLF) != 0
6435 && (eol_seen & EOL_SEEN_CR) != 0
6436 && (eol_seen & EOL_SEEN_LF) == 0)
6437 eol_seen = EOL_SEEN_CRLF;
6438 else if (eol_seen != EOL_SEEN_NONE
6439 && eol_seen != EOL_SEEN_LF
6440 && eol_seen != EOL_SEEN_CRLF
6441 && eol_seen != EOL_SEEN_CR)
6442 eol_seen = EOL_SEEN_LF;
6443 if (eol_seen != EOL_SEEN_NONE)
6444 eol_type = adjust_coding_eol_type (coding, eol_seen);
6445 }
6446
6447 if (EQ (eol_type, Qmac))
6448 {
6449 for (p = pbeg; p < pend; p++)
6450 if (*p == '\r')
6451 *p = '\n';
6452 }
6453 else if (EQ (eol_type, Qdos))
6454 {
6455 ptrdiff_t n = 0;
6456
6457 if (NILP (coding->dst_object))
6458 {
6459 /* Start deleting '\r' from the tail to minimize the memory
6460 movement. */
6461 for (p = pend - 2; p >= pbeg; p--)
6462 if (*p == '\r')
6463 {
6464 memmove (p, p + 1, pend-- - p - 1);
6465 n++;
6466 }
6467 }
6468 else
6469 {
6470 ptrdiff_t pos_byte = coding->dst_pos_byte;
6471 ptrdiff_t pos = coding->dst_pos;
6472 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6473
6474 while (pos < pos_end)
6475 {
6476 p = BYTE_POS_ADDR (pos_byte);
6477 if (*p == '\r' && p[1] == '\n')
6478 {
6479 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6480 n++;
6481 pos_end--;
6482 }
6483 pos++;
6484 if (coding->dst_multibyte)
6485 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6486 else
6487 pos_byte++;
6488 }
6489 }
6490 coding->produced -= n;
6491 coding->produced_char -= n;
6492 }
6493 }
6494
6495
6496 /* Return a translation table (or list of them) from coding system
6497 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6498 not ENCODEP). */
6499
6500 static Lisp_Object
6501 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6502 {
6503 Lisp_Object standard, translation_table;
6504 Lisp_Object val;
6505
6506 if (NILP (Venable_character_translation))
6507 {
6508 if (max_lookup)
6509 *max_lookup = 0;
6510 return Qnil;
6511 }
6512 if (encodep)
6513 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6514 standard = Vstandard_translation_table_for_encode;
6515 else
6516 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6517 standard = Vstandard_translation_table_for_decode;
6518 if (NILP (translation_table))
6519 translation_table = standard;
6520 else
6521 {
6522 if (SYMBOLP (translation_table))
6523 translation_table = Fget (translation_table, Qtranslation_table);
6524 else if (CONSP (translation_table))
6525 {
6526 translation_table = Fcopy_sequence (translation_table);
6527 for (val = translation_table; CONSP (val); val = XCDR (val))
6528 if (SYMBOLP (XCAR (val)))
6529 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6530 }
6531 if (CHAR_TABLE_P (standard))
6532 {
6533 if (CONSP (translation_table))
6534 translation_table = nconc2 (translation_table,
6535 Fcons (standard, Qnil));
6536 else
6537 translation_table = Fcons (translation_table,
6538 Fcons (standard, Qnil));
6539 }
6540 }
6541
6542 if (max_lookup)
6543 {
6544 *max_lookup = 1;
6545 if (CHAR_TABLE_P (translation_table)
6546 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6547 {
6548 val = XCHAR_TABLE (translation_table)->extras[1];
6549 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6550 *max_lookup = XFASTINT (val);
6551 }
6552 else if (CONSP (translation_table))
6553 {
6554 Lisp_Object tail;
6555
6556 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6557 if (CHAR_TABLE_P (XCAR (tail))
6558 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6559 {
6560 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6561 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6562 *max_lookup = XFASTINT (tailval);
6563 }
6564 }
6565 }
6566 return translation_table;
6567 }
6568
6569 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6570 do { \
6571 trans = Qnil; \
6572 if (CHAR_TABLE_P (table)) \
6573 { \
6574 trans = CHAR_TABLE_REF (table, c); \
6575 if (CHARACTERP (trans)) \
6576 c = XFASTINT (trans), trans = Qnil; \
6577 } \
6578 else if (CONSP (table)) \
6579 { \
6580 Lisp_Object tail; \
6581 \
6582 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6583 if (CHAR_TABLE_P (XCAR (tail))) \
6584 { \
6585 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6586 if (CHARACTERP (trans)) \
6587 c = XFASTINT (trans), trans = Qnil; \
6588 else if (! NILP (trans)) \
6589 break; \
6590 } \
6591 } \
6592 } while (0)
6593
6594
6595 /* Return a translation of character(s) at BUF according to TRANS.
6596 TRANS is TO-CHAR or ((FROM . TO) ...) where
6597 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6598 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6599 translation is found, and Qnil if not found..
6600 If BUF is too short to lookup characters in FROM, return Qt. */
6601
6602 static Lisp_Object
6603 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6604 {
6605
6606 if (INTEGERP (trans))
6607 return trans;
6608 for (; CONSP (trans); trans = XCDR (trans))
6609 {
6610 Lisp_Object val = XCAR (trans);
6611 Lisp_Object from = XCAR (val);
6612 ptrdiff_t len = ASIZE (from);
6613 ptrdiff_t i;
6614
6615 for (i = 0; i < len; i++)
6616 {
6617 if (buf + i == buf_end)
6618 return Qt;
6619 if (XINT (AREF (from, i)) != buf[i])
6620 break;
6621 }
6622 if (i == len)
6623 return val;
6624 }
6625 return Qnil;
6626 }
6627
6628
6629 static int
6630 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6631 bool last_block)
6632 {
6633 unsigned char *dst = coding->destination + coding->produced;
6634 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6635 ptrdiff_t produced;
6636 ptrdiff_t produced_chars = 0;
6637 int carryover = 0;
6638
6639 if (! coding->chars_at_source)
6640 {
6641 /* Source characters are in coding->charbuf. */
6642 int *buf = coding->charbuf;
6643 int *buf_end = buf + coding->charbuf_used;
6644
6645 if (EQ (coding->src_object, coding->dst_object))
6646 {
6647 coding_set_source (coding);
6648 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6649 }
6650
6651 while (buf < buf_end)
6652 {
6653 int c = *buf;
6654 ptrdiff_t i;
6655
6656 if (c >= 0)
6657 {
6658 ptrdiff_t from_nchars = 1, to_nchars = 1;
6659 Lisp_Object trans = Qnil;
6660
6661 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6662 if (! NILP (trans))
6663 {
6664 trans = get_translation (trans, buf, buf_end);
6665 if (INTEGERP (trans))
6666 c = XINT (trans);
6667 else if (CONSP (trans))
6668 {
6669 from_nchars = ASIZE (XCAR (trans));
6670 trans = XCDR (trans);
6671 if (INTEGERP (trans))
6672 c = XINT (trans);
6673 else
6674 {
6675 to_nchars = ASIZE (trans);
6676 c = XINT (AREF (trans, 0));
6677 }
6678 }
6679 else if (EQ (trans, Qt) && ! last_block)
6680 break;
6681 }
6682
6683 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6684 {
6685 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6686 / MAX_MULTIBYTE_LENGTH)
6687 < to_nchars)
6688 memory_full (SIZE_MAX);
6689 dst = alloc_destination (coding,
6690 buf_end - buf
6691 + MAX_MULTIBYTE_LENGTH * to_nchars,
6692 dst);
6693 if (EQ (coding->src_object, coding->dst_object))
6694 {
6695 coding_set_source (coding);
6696 dst_end = (((unsigned char *) coding->source)
6697 + coding->consumed);
6698 }
6699 else
6700 dst_end = coding->destination + coding->dst_bytes;
6701 }
6702
6703 for (i = 0; i < to_nchars; i++)
6704 {
6705 if (i > 0)
6706 c = XINT (AREF (trans, i));
6707 if (coding->dst_multibyte
6708 || ! CHAR_BYTE8_P (c))
6709 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6710 else
6711 *dst++ = CHAR_TO_BYTE8 (c);
6712 }
6713 produced_chars += to_nchars;
6714 buf += from_nchars;
6715 }
6716 else
6717 /* This is an annotation datum. (-C) is the length. */
6718 buf += -c;
6719 }
6720 carryover = buf_end - buf;
6721 }
6722 else
6723 {
6724 /* Source characters are at coding->source. */
6725 const unsigned char *src = coding->source;
6726 const unsigned char *src_end = src + coding->consumed;
6727
6728 if (EQ (coding->dst_object, coding->src_object))
6729 dst_end = (unsigned char *) src;
6730 if (coding->src_multibyte != coding->dst_multibyte)
6731 {
6732 if (coding->src_multibyte)
6733 {
6734 bool multibytep = 1;
6735 ptrdiff_t consumed_chars = 0;
6736
6737 while (1)
6738 {
6739 const unsigned char *src_base = src;
6740 int c;
6741
6742 ONE_MORE_BYTE (c);
6743 if (dst == dst_end)
6744 {
6745 if (EQ (coding->src_object, coding->dst_object))
6746 dst_end = (unsigned char *) src;
6747 if (dst == dst_end)
6748 {
6749 ptrdiff_t offset = src - coding->source;
6750
6751 dst = alloc_destination (coding, src_end - src + 1,
6752 dst);
6753 dst_end = coding->destination + coding->dst_bytes;
6754 coding_set_source (coding);
6755 src = coding->source + offset;
6756 src_end = coding->source + coding->consumed;
6757 if (EQ (coding->src_object, coding->dst_object))
6758 dst_end = (unsigned char *) src;
6759 }
6760 }
6761 *dst++ = c;
6762 produced_chars++;
6763 }
6764 no_more_source:
6765 ;
6766 }
6767 else
6768 while (src < src_end)
6769 {
6770 bool multibytep = 1;
6771 int c = *src++;
6772
6773 if (dst >= dst_end - 1)
6774 {
6775 if (EQ (coding->src_object, coding->dst_object))
6776 dst_end = (unsigned char *) src;
6777 if (dst >= dst_end - 1)
6778 {
6779 ptrdiff_t offset = src - coding->source;
6780 ptrdiff_t more_bytes;
6781
6782 if (EQ (coding->src_object, coding->dst_object))
6783 more_bytes = ((src_end - src) / 2) + 2;
6784 else
6785 more_bytes = src_end - src + 2;
6786 dst = alloc_destination (coding, more_bytes, dst);
6787 dst_end = coding->destination + coding->dst_bytes;
6788 coding_set_source (coding);
6789 src = coding->source + offset;
6790 src_end = coding->source + coding->consumed;
6791 if (EQ (coding->src_object, coding->dst_object))
6792 dst_end = (unsigned char *) src;
6793 }
6794 }
6795 EMIT_ONE_BYTE (c);
6796 }
6797 }
6798 else
6799 {
6800 if (!EQ (coding->src_object, coding->dst_object))
6801 {
6802 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6803
6804 if (require > 0)
6805 {
6806 ptrdiff_t offset = src - coding->source;
6807
6808 dst = alloc_destination (coding, require, dst);
6809 coding_set_source (coding);
6810 src = coding->source + offset;
6811 src_end = coding->source + coding->consumed;
6812 }
6813 }
6814 produced_chars = coding->consumed_char;
6815 while (src < src_end)
6816 *dst++ = *src++;
6817 }
6818 }
6819
6820 produced = dst - (coding->destination + coding->produced);
6821 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6822 insert_from_gap (produced_chars, produced);
6823 coding->produced += produced;
6824 coding->produced_char += produced_chars;
6825 return carryover;
6826 }
6827
6828 /* Compose text in CODING->object according to the annotation data at
6829 CHARBUF. CHARBUF is an array:
6830 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6831 */
6832
6833 static void
6834 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6835 {
6836 int len;
6837 ptrdiff_t to;
6838 enum composition_method method;
6839 Lisp_Object components;
6840
6841 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6842 to = pos + charbuf[2];
6843 method = (enum composition_method) (charbuf[4]);
6844
6845 if (method == COMPOSITION_RELATIVE)
6846 components = Qnil;
6847 else
6848 {
6849 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6850 int i, j;
6851
6852 if (method == COMPOSITION_WITH_RULE)
6853 len = charbuf[2] * 3 - 2;
6854 charbuf += MAX_ANNOTATION_LENGTH;
6855 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6856 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6857 {
6858 if (charbuf[i] >= 0)
6859 args[j] = make_number (charbuf[i]);
6860 else
6861 {
6862 i++;
6863 args[j] = make_number (charbuf[i] % 0x100);
6864 }
6865 }
6866 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6867 }
6868 compose_text (pos, to, components, Qnil, coding->dst_object);
6869 }
6870
6871
6872 /* Put `charset' property on text in CODING->object according to
6873 the annotation data at CHARBUF. CHARBUF is an array:
6874 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6875 */
6876
6877 static void
6878 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6879 {
6880 ptrdiff_t from = pos - charbuf[2];
6881 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6882
6883 Fput_text_property (make_number (from), make_number (pos),
6884 Qcharset, CHARSET_NAME (charset),
6885 coding->dst_object);
6886 }
6887
6888
6889 #define CHARBUF_SIZE 0x4000
6890
6891 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6892 do { \
6893 int size = CHARBUF_SIZE; \
6894 \
6895 coding->charbuf = NULL; \
6896 while (size > 1024) \
6897 { \
6898 coding->charbuf = alloca (sizeof (int) * size); \
6899 if (coding->charbuf) \
6900 break; \
6901 size >>= 1; \
6902 } \
6903 if (! coding->charbuf) \
6904 { \
6905 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6906 return; \
6907 } \
6908 coding->charbuf_size = size; \
6909 } while (0)
6910
6911
6912 static void
6913 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6914 {
6915 int *charbuf = coding->charbuf;
6916 int *charbuf_end = charbuf + coding->charbuf_used;
6917
6918 if (NILP (coding->dst_object))
6919 return;
6920
6921 while (charbuf < charbuf_end)
6922 {
6923 if (*charbuf >= 0)
6924 pos++, charbuf++;
6925 else
6926 {
6927 int len = -*charbuf;
6928
6929 if (len > 2)
6930 switch (charbuf[1])
6931 {
6932 case CODING_ANNOTATE_COMPOSITION_MASK:
6933 produce_composition (coding, charbuf, pos);
6934 break;
6935 case CODING_ANNOTATE_CHARSET_MASK:
6936 produce_charset (coding, charbuf, pos);
6937 break;
6938 }
6939 charbuf += len;
6940 }
6941 }
6942 }
6943
6944 /* Decode the data at CODING->src_object into CODING->dst_object.
6945 CODING->src_object is a buffer, a string, or nil.
6946 CODING->dst_object is a buffer.
6947
6948 If CODING->src_object is a buffer, it must be the current buffer.
6949 In this case, if CODING->src_pos is positive, it is a position of
6950 the source text in the buffer, otherwise, the source text is in the
6951 gap area of the buffer, and CODING->src_pos specifies the offset of
6952 the text from GPT (which must be the same as PT). If this is the
6953 same buffer as CODING->dst_object, CODING->src_pos must be
6954 negative.
6955
6956 If CODING->src_object is a string, CODING->src_pos is an index to
6957 that string.
6958
6959 If CODING->src_object is nil, CODING->source must already point to
6960 the non-relocatable memory area. In this case, CODING->src_pos is
6961 an offset from CODING->source.
6962
6963 The decoded data is inserted at the current point of the buffer
6964 CODING->dst_object.
6965 */
6966
6967 static void
6968 decode_coding (struct coding_system *coding)
6969 {
6970 Lisp_Object attrs;
6971 Lisp_Object undo_list;
6972 Lisp_Object translation_table;
6973 struct ccl_spec cclspec;
6974 int carryover;
6975 int i;
6976
6977 if (BUFFERP (coding->src_object)
6978 && coding->src_pos > 0
6979 && coding->src_pos < GPT
6980 && coding->src_pos + coding->src_chars > GPT)
6981 move_gap_both (coding->src_pos, coding->src_pos_byte);
6982
6983 undo_list = Qt;
6984 if (BUFFERP (coding->dst_object))
6985 {
6986 set_buffer_internal (XBUFFER (coding->dst_object));
6987 if (GPT != PT)
6988 move_gap_both (PT, PT_BYTE);
6989
6990 /* We must disable undo_list in order to record the whole insert
6991 transaction via record_insert at the end. But doing so also
6992 disables the recording of the first change to the undo_list.
6993 Therefore we check for first change here and record it via
6994 record_first_change if needed. */
6995 if (MODIFF <= SAVE_MODIFF)
6996 record_first_change ();
6997
6998 undo_list = BVAR (current_buffer, undo_list);
6999 bset_undo_list (current_buffer, Qt);
7000 }
7001
7002 coding->consumed = coding->consumed_char = 0;
7003 coding->produced = coding->produced_char = 0;
7004 coding->chars_at_source = 0;
7005 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7006 coding->errors = 0;
7007
7008 ALLOC_CONVERSION_WORK_AREA (coding);
7009
7010 attrs = CODING_ID_ATTRS (coding->id);
7011 translation_table = get_translation_table (attrs, 0, NULL);
7012
7013 carryover = 0;
7014 if (coding->decoder == decode_coding_ccl)
7015 {
7016 coding->spec.ccl = &cclspec;
7017 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7018 }
7019 do
7020 {
7021 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7022
7023 coding_set_source (coding);
7024 coding->annotated = 0;
7025 coding->charbuf_used = carryover;
7026 (*(coding->decoder)) (coding);
7027 coding_set_destination (coding);
7028 carryover = produce_chars (coding, translation_table, 0);
7029 if (coding->annotated)
7030 produce_annotation (coding, pos);
7031 for (i = 0; i < carryover; i++)
7032 coding->charbuf[i]
7033 = coding->charbuf[coding->charbuf_used - carryover + i];
7034 }
7035 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7036 || (coding->consumed < coding->src_bytes
7037 && (coding->result == CODING_RESULT_SUCCESS
7038 || coding->result == CODING_RESULT_INVALID_SRC)));
7039
7040 if (carryover > 0)
7041 {
7042 coding_set_destination (coding);
7043 coding->charbuf_used = carryover;
7044 produce_chars (coding, translation_table, 1);
7045 }
7046
7047 coding->carryover_bytes = 0;
7048 if (coding->consumed < coding->src_bytes)
7049 {
7050 int nbytes = coding->src_bytes - coding->consumed;
7051 const unsigned char *src;
7052
7053 coding_set_source (coding);
7054 coding_set_destination (coding);
7055 src = coding->source + coding->consumed;
7056
7057 if (coding->mode & CODING_MODE_LAST_BLOCK)
7058 {
7059 /* Flush out unprocessed data as binary chars. We are sure
7060 that the number of data is less than the size of
7061 coding->charbuf. */
7062 coding->charbuf_used = 0;
7063 coding->chars_at_source = 0;
7064
7065 while (nbytes-- > 0)
7066 {
7067 int c = *src++;
7068
7069 if (c & 0x80)
7070 c = BYTE8_TO_CHAR (c);
7071 coding->charbuf[coding->charbuf_used++] = c;
7072 }
7073 produce_chars (coding, Qnil, 1);
7074 }
7075 else
7076 {
7077 /* Record unprocessed bytes in coding->carryover. We are
7078 sure that the number of data is less than the size of
7079 coding->carryover. */
7080 unsigned char *p = coding->carryover;
7081
7082 if (nbytes > sizeof coding->carryover)
7083 nbytes = sizeof coding->carryover;
7084 coding->carryover_bytes = nbytes;
7085 while (nbytes-- > 0)
7086 *p++ = *src++;
7087 }
7088 coding->consumed = coding->src_bytes;
7089 }
7090
7091 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7092 && !inhibit_eol_conversion)
7093 decode_eol (coding);
7094 if (BUFFERP (coding->dst_object))
7095 {
7096 bset_undo_list (current_buffer, undo_list);
7097 record_insert (coding->dst_pos, coding->produced_char);
7098 }
7099 }
7100
7101
7102 /* Extract an annotation datum from a composition starting at POS and
7103 ending before LIMIT of CODING->src_object (buffer or string), store
7104 the data in BUF, set *STOP to a starting position of the next
7105 composition (if any) or to LIMIT, and return the address of the
7106 next element of BUF.
7107
7108 If such an annotation is not found, set *STOP to a starting
7109 position of a composition after POS (if any) or to LIMIT, and
7110 return BUF. */
7111
7112 static int *
7113 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7114 struct coding_system *coding, int *buf,
7115 ptrdiff_t *stop)
7116 {
7117 ptrdiff_t start, end;
7118 Lisp_Object prop;
7119
7120 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7121 || end > limit)
7122 *stop = limit;
7123 else if (start > pos)
7124 *stop = start;
7125 else
7126 {
7127 if (start == pos)
7128 {
7129 /* We found a composition. Store the corresponding
7130 annotation data in BUF. */
7131 int *head = buf;
7132 enum composition_method method = COMPOSITION_METHOD (prop);
7133 int nchars = COMPOSITION_LENGTH (prop);
7134
7135 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7136 if (method != COMPOSITION_RELATIVE)
7137 {
7138 Lisp_Object components;
7139 ptrdiff_t i, len, i_byte;
7140
7141 components = COMPOSITION_COMPONENTS (prop);
7142 if (VECTORP (components))
7143 {
7144 len = ASIZE (components);
7145 for (i = 0; i < len; i++)
7146 *buf++ = XINT (AREF (components, i));
7147 }
7148 else if (STRINGP (components))
7149 {
7150 len = SCHARS (components);
7151 i = i_byte = 0;
7152 while (i < len)
7153 {
7154 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7155 buf++;
7156 }
7157 }
7158 else if (INTEGERP (components))
7159 {
7160 len = 1;
7161 *buf++ = XINT (components);
7162 }
7163 else if (CONSP (components))
7164 {
7165 for (len = 0; CONSP (components);
7166 len++, components = XCDR (components))
7167 *buf++ = XINT (XCAR (components));
7168 }
7169 else
7170 emacs_abort ();
7171 *head -= len;
7172 }
7173 }
7174
7175 if (find_composition (end, limit, &start, &end, &prop,
7176 coding->src_object)
7177 && end <= limit)
7178 *stop = start;
7179 else
7180 *stop = limit;
7181 }
7182 return buf;
7183 }
7184
7185
7186 /* Extract an annotation datum from a text property `charset' at POS of
7187 CODING->src_object (buffer of string), store the data in BUF, set
7188 *STOP to the position where the value of `charset' property changes
7189 (limiting by LIMIT), and return the address of the next element of
7190 BUF.
7191
7192 If the property value is nil, set *STOP to the position where the
7193 property value is non-nil (limiting by LIMIT), and return BUF. */
7194
7195 static int *
7196 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7197 struct coding_system *coding, int *buf,
7198 ptrdiff_t *stop)
7199 {
7200 Lisp_Object val, next;
7201 int id;
7202
7203 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7204 if (! NILP (val) && CHARSETP (val))
7205 id = XINT (CHARSET_SYMBOL_ID (val));
7206 else
7207 id = -1;
7208 ADD_CHARSET_DATA (buf, 0, id);
7209 next = Fnext_single_property_change (make_number (pos), Qcharset,
7210 coding->src_object,
7211 make_number (limit));
7212 *stop = XINT (next);
7213 return buf;
7214 }
7215
7216
7217 static void
7218 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7219 int max_lookup)
7220 {
7221 int *buf = coding->charbuf;
7222 int *buf_end = coding->charbuf + coding->charbuf_size;
7223 const unsigned char *src = coding->source + coding->consumed;
7224 const unsigned char *src_end = coding->source + coding->src_bytes;
7225 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7226 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7227 bool multibytep = coding->src_multibyte;
7228 Lisp_Object eol_type;
7229 int c;
7230 ptrdiff_t stop, stop_composition, stop_charset;
7231 int *lookup_buf = NULL;
7232
7233 if (! NILP (translation_table))
7234 lookup_buf = alloca (sizeof (int) * max_lookup);
7235
7236 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7237 if (VECTORP (eol_type))
7238 eol_type = Qunix;
7239
7240 /* Note: composition handling is not yet implemented. */
7241 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7242
7243 if (NILP (coding->src_object))
7244 stop = stop_composition = stop_charset = end_pos;
7245 else
7246 {
7247 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7248 stop = stop_composition = pos;
7249 else
7250 stop = stop_composition = end_pos;
7251 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7252 stop = stop_charset = pos;
7253 else
7254 stop_charset = end_pos;
7255 }
7256
7257 /* Compensate for CRLF and conversion. */
7258 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7259 while (buf < buf_end)
7260 {
7261 Lisp_Object trans;
7262
7263 if (pos == stop)
7264 {
7265 if (pos == end_pos)
7266 break;
7267 if (pos == stop_composition)
7268 buf = handle_composition_annotation (pos, end_pos, coding,
7269 buf, &stop_composition);
7270 if (pos == stop_charset)
7271 buf = handle_charset_annotation (pos, end_pos, coding,
7272 buf, &stop_charset);
7273 stop = (stop_composition < stop_charset
7274 ? stop_composition : stop_charset);
7275 }
7276
7277 if (! multibytep)
7278 {
7279 int bytes;
7280
7281 if (coding->encoder == encode_coding_raw_text
7282 || coding->encoder == encode_coding_ccl)
7283 c = *src++, pos++;
7284 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7285 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7286 else
7287 c = BYTE8_TO_CHAR (*src), src++, pos++;
7288 }
7289 else
7290 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7291 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7292 c = '\n';
7293 if (! EQ (eol_type, Qunix))
7294 {
7295 if (c == '\n')
7296 {
7297 if (EQ (eol_type, Qdos))
7298 *buf++ = '\r';
7299 else
7300 c = '\r';
7301 }
7302 }
7303
7304 trans = Qnil;
7305 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7306 if (NILP (trans))
7307 *buf++ = c;
7308 else
7309 {
7310 ptrdiff_t from_nchars = 1, to_nchars = 1;
7311 int *lookup_buf_end;
7312 const unsigned char *p = src;
7313 int i;
7314
7315 lookup_buf[0] = c;
7316 for (i = 1; i < max_lookup && p < src_end; i++)
7317 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7318 lookup_buf_end = lookup_buf + i;
7319 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7320 if (INTEGERP (trans))
7321 c = XINT (trans);
7322 else if (CONSP (trans))
7323 {
7324 from_nchars = ASIZE (XCAR (trans));
7325 trans = XCDR (trans);
7326 if (INTEGERP (trans))
7327 c = XINT (trans);
7328 else
7329 {
7330 to_nchars = ASIZE (trans);
7331 if (buf_end - buf < to_nchars)
7332 break;
7333 c = XINT (AREF (trans, 0));
7334 }
7335 }
7336 else
7337 break;
7338 *buf++ = c;
7339 for (i = 1; i < to_nchars; i++)
7340 *buf++ = XINT (AREF (trans, i));
7341 for (i = 1; i < from_nchars; i++, pos++)
7342 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7343 }
7344 }
7345
7346 coding->consumed = src - coding->source;
7347 coding->consumed_char = pos - coding->src_pos;
7348 coding->charbuf_used = buf - coding->charbuf;
7349 coding->chars_at_source = 0;
7350 }
7351
7352
7353 /* Encode the text at CODING->src_object into CODING->dst_object.
7354 CODING->src_object is a buffer or a string.
7355 CODING->dst_object is a buffer or nil.
7356
7357 If CODING->src_object is a buffer, it must be the current buffer.
7358 In this case, if CODING->src_pos is positive, it is a position of
7359 the source text in the buffer, otherwise. the source text is in the
7360 gap area of the buffer, and coding->src_pos specifies the offset of
7361 the text from GPT (which must be the same as PT). If this is the
7362 same buffer as CODING->dst_object, CODING->src_pos must be
7363 negative and CODING should not have `pre-write-conversion'.
7364
7365 If CODING->src_object is a string, CODING should not have
7366 `pre-write-conversion'.
7367
7368 If CODING->dst_object is a buffer, the encoded data is inserted at
7369 the current point of that buffer.
7370
7371 If CODING->dst_object is nil, the encoded data is placed at the
7372 memory area specified by CODING->destination. */
7373
7374 static void
7375 encode_coding (struct coding_system *coding)
7376 {
7377 Lisp_Object attrs;
7378 Lisp_Object translation_table;
7379 int max_lookup;
7380 struct ccl_spec cclspec;
7381
7382 attrs = CODING_ID_ATTRS (coding->id);
7383 if (coding->encoder == encode_coding_raw_text)
7384 translation_table = Qnil, max_lookup = 0;
7385 else
7386 translation_table = get_translation_table (attrs, 1, &max_lookup);
7387
7388 if (BUFFERP (coding->dst_object))
7389 {
7390 set_buffer_internal (XBUFFER (coding->dst_object));
7391 coding->dst_multibyte
7392 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7393 }
7394
7395 coding->consumed = coding->consumed_char = 0;
7396 coding->produced = coding->produced_char = 0;
7397 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7398 coding->errors = 0;
7399
7400 ALLOC_CONVERSION_WORK_AREA (coding);
7401
7402 if (coding->encoder == encode_coding_ccl)
7403 {
7404 coding->spec.ccl = &cclspec;
7405 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7406 }
7407 do {
7408 coding_set_source (coding);
7409 consume_chars (coding, translation_table, max_lookup);
7410 coding_set_destination (coding);
7411 (*(coding->encoder)) (coding);
7412 } while (coding->consumed_char < coding->src_chars);
7413
7414 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7415 insert_from_gap (coding->produced_char, coding->produced);
7416 }
7417
7418
7419 /* Name (or base name) of work buffer for code conversion. */
7420 static Lisp_Object Vcode_conversion_workbuf_name;
7421
7422 /* A working buffer used by the top level conversion. Once it is
7423 created, it is never destroyed. It has the name
7424 Vcode_conversion_workbuf_name. The other working buffers are
7425 destroyed after the use is finished, and their names are modified
7426 versions of Vcode_conversion_workbuf_name. */
7427 static Lisp_Object Vcode_conversion_reused_workbuf;
7428
7429 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7430 static bool reused_workbuf_in_use;
7431
7432
7433 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7434 multibyteness of returning buffer. */
7435
7436 static Lisp_Object
7437 make_conversion_work_buffer (bool multibyte)
7438 {
7439 Lisp_Object name, workbuf;
7440 struct buffer *current;
7441
7442 if (reused_workbuf_in_use)
7443 {
7444 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7445 workbuf = Fget_buffer_create (name);
7446 }
7447 else
7448 {
7449 reused_workbuf_in_use = 1;
7450 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7451 Vcode_conversion_reused_workbuf
7452 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7453 workbuf = Vcode_conversion_reused_workbuf;
7454 }
7455 current = current_buffer;
7456 set_buffer_internal (XBUFFER (workbuf));
7457 /* We can't allow modification hooks to run in the work buffer. For
7458 instance, directory_files_internal assumes that file decoding
7459 doesn't compile new regexps. */
7460 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7461 Ferase_buffer ();
7462 bset_undo_list (current_buffer, Qt);
7463 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7464 set_buffer_internal (current);
7465 return workbuf;
7466 }
7467
7468
7469 static Lisp_Object
7470 code_conversion_restore (Lisp_Object arg)
7471 {
7472 Lisp_Object current, workbuf;
7473 struct gcpro gcpro1;
7474
7475 GCPRO1 (arg);
7476 current = XCAR (arg);
7477 workbuf = XCDR (arg);
7478 if (! NILP (workbuf))
7479 {
7480 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7481 reused_workbuf_in_use = 0;
7482 else
7483 Fkill_buffer (workbuf);
7484 }
7485 set_buffer_internal (XBUFFER (current));
7486 UNGCPRO;
7487 return Qnil;
7488 }
7489
7490 Lisp_Object
7491 code_conversion_save (bool with_work_buf, bool multibyte)
7492 {
7493 Lisp_Object workbuf = Qnil;
7494
7495 if (with_work_buf)
7496 workbuf = make_conversion_work_buffer (multibyte);
7497 record_unwind_protect (code_conversion_restore,
7498 Fcons (Fcurrent_buffer (), workbuf));
7499 return workbuf;
7500 }
7501
7502 void
7503 decode_coding_gap (struct coding_system *coding,
7504 ptrdiff_t chars, ptrdiff_t bytes)
7505 {
7506 ptrdiff_t count = SPECPDL_INDEX ();
7507 Lisp_Object attrs;
7508
7509 code_conversion_save (0, 0);
7510
7511 coding->src_object = Fcurrent_buffer ();
7512 coding->src_chars = chars;
7513 coding->src_bytes = bytes;
7514 coding->src_pos = -chars;
7515 coding->src_pos_byte = -bytes;
7516 coding->src_multibyte = chars < bytes;
7517 coding->dst_object = coding->src_object;
7518 coding->dst_pos = PT;
7519 coding->dst_pos_byte = PT_BYTE;
7520 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7521
7522 if (CODING_REQUIRE_DETECTION (coding))
7523 detect_coding (coding);
7524
7525 coding->mode |= CODING_MODE_LAST_BLOCK;
7526 current_buffer->text->inhibit_shrinking = 1;
7527 decode_coding (coding);
7528 current_buffer->text->inhibit_shrinking = 0;
7529
7530 attrs = CODING_ID_ATTRS (coding->id);
7531 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7532 {
7533 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7534 Lisp_Object val;
7535
7536 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7537 val = call1 (CODING_ATTR_POST_READ (attrs),
7538 make_number (coding->produced_char));
7539 CHECK_NATNUM (val);
7540 coding->produced_char += Z - prev_Z;
7541 coding->produced += Z_BYTE - prev_Z_BYTE;
7542 }
7543
7544 unbind_to (count, Qnil);
7545 }
7546
7547
7548 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7549 SRC_OBJECT into DST_OBJECT by coding context CODING.
7550
7551 SRC_OBJECT is a buffer, a string, or Qnil.
7552
7553 If it is a buffer, the text is at point of the buffer. FROM and TO
7554 are positions in the buffer.
7555
7556 If it is a string, the text is at the beginning of the string.
7557 FROM and TO are indices to the string.
7558
7559 If it is nil, the text is at coding->source. FROM and TO are
7560 indices to coding->source.
7561
7562 DST_OBJECT is a buffer, Qt, or Qnil.
7563
7564 If it is a buffer, the decoded text is inserted at point of the
7565 buffer. If the buffer is the same as SRC_OBJECT, the source text
7566 is deleted.
7567
7568 If it is Qt, a string is made from the decoded text, and
7569 set in CODING->dst_object.
7570
7571 If it is Qnil, the decoded text is stored at CODING->destination.
7572 The caller must allocate CODING->dst_bytes bytes at
7573 CODING->destination by xmalloc. If the decoded text is longer than
7574 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7575 */
7576
7577 void
7578 decode_coding_object (struct coding_system *coding,
7579 Lisp_Object src_object,
7580 ptrdiff_t from, ptrdiff_t from_byte,
7581 ptrdiff_t to, ptrdiff_t to_byte,
7582 Lisp_Object dst_object)
7583 {
7584 ptrdiff_t count = SPECPDL_INDEX ();
7585 unsigned char *destination IF_LINT (= NULL);
7586 ptrdiff_t dst_bytes IF_LINT (= 0);
7587 ptrdiff_t chars = to - from;
7588 ptrdiff_t bytes = to_byte - from_byte;
7589 Lisp_Object attrs;
7590 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7591 bool need_marker_adjustment = 0;
7592 Lisp_Object old_deactivate_mark;
7593
7594 old_deactivate_mark = Vdeactivate_mark;
7595
7596 if (NILP (dst_object))
7597 {
7598 destination = coding->destination;
7599 dst_bytes = coding->dst_bytes;
7600 }
7601
7602 coding->src_object = src_object;
7603 coding->src_chars = chars;
7604 coding->src_bytes = bytes;
7605 coding->src_multibyte = chars < bytes;
7606
7607 if (STRINGP (src_object))
7608 {
7609 coding->src_pos = from;
7610 coding->src_pos_byte = from_byte;
7611 }
7612 else if (BUFFERP (src_object))
7613 {
7614 set_buffer_internal (XBUFFER (src_object));
7615 if (from != GPT)
7616 move_gap_both (from, from_byte);
7617 if (EQ (src_object, dst_object))
7618 {
7619 struct Lisp_Marker *tail;
7620
7621 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7622 {
7623 tail->need_adjustment
7624 = tail->charpos == (tail->insertion_type ? from : to);
7625 need_marker_adjustment |= tail->need_adjustment;
7626 }
7627 saved_pt = PT, saved_pt_byte = PT_BYTE;
7628 TEMP_SET_PT_BOTH (from, from_byte);
7629 current_buffer->text->inhibit_shrinking = 1;
7630 del_range_both (from, from_byte, to, to_byte, 1);
7631 coding->src_pos = -chars;
7632 coding->src_pos_byte = -bytes;
7633 }
7634 else
7635 {
7636 coding->src_pos = from;
7637 coding->src_pos_byte = from_byte;
7638 }
7639 }
7640
7641 if (CODING_REQUIRE_DETECTION (coding))
7642 detect_coding (coding);
7643 attrs = CODING_ID_ATTRS (coding->id);
7644
7645 if (EQ (dst_object, Qt)
7646 || (! NILP (CODING_ATTR_POST_READ (attrs))
7647 && NILP (dst_object)))
7648 {
7649 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7650 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7651 coding->dst_pos = BEG;
7652 coding->dst_pos_byte = BEG_BYTE;
7653 }
7654 else if (BUFFERP (dst_object))
7655 {
7656 code_conversion_save (0, 0);
7657 coding->dst_object = dst_object;
7658 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7659 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7660 coding->dst_multibyte
7661 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7662 }
7663 else
7664 {
7665 code_conversion_save (0, 0);
7666 coding->dst_object = Qnil;
7667 /* Most callers presume this will return a multibyte result, and they
7668 won't use `binary' or `raw-text' anyway, so let's not worry about
7669 CODING_FOR_UNIBYTE. */
7670 coding->dst_multibyte = 1;
7671 }
7672
7673 decode_coding (coding);
7674
7675 if (BUFFERP (coding->dst_object))
7676 set_buffer_internal (XBUFFER (coding->dst_object));
7677
7678 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7679 {
7680 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7681 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7682 Lisp_Object val;
7683
7684 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7685 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7686 old_deactivate_mark);
7687 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7688 make_number (coding->produced_char));
7689 UNGCPRO;
7690 CHECK_NATNUM (val);
7691 coding->produced_char += Z - prev_Z;
7692 coding->produced += Z_BYTE - prev_Z_BYTE;
7693 }
7694
7695 if (EQ (dst_object, Qt))
7696 {
7697 coding->dst_object = Fbuffer_string ();
7698 }
7699 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7700 {
7701 set_buffer_internal (XBUFFER (coding->dst_object));
7702 if (dst_bytes < coding->produced)
7703 {
7704 destination = xrealloc (destination, coding->produced);
7705 if (! destination)
7706 {
7707 record_conversion_result (coding,
7708 CODING_RESULT_INSUFFICIENT_MEM);
7709 unbind_to (count, Qnil);
7710 return;
7711 }
7712 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7713 move_gap_both (BEGV, BEGV_BYTE);
7714 memcpy (destination, BEGV_ADDR, coding->produced);
7715 coding->destination = destination;
7716 }
7717 }
7718
7719 if (saved_pt >= 0)
7720 {
7721 /* This is the case of:
7722 (BUFFERP (src_object) && EQ (src_object, dst_object))
7723 As we have moved PT while replacing the original buffer
7724 contents, we must recover it now. */
7725 set_buffer_internal (XBUFFER (src_object));
7726 current_buffer->text->inhibit_shrinking = 0;
7727 if (saved_pt < from)
7728 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7729 else if (saved_pt < from + chars)
7730 TEMP_SET_PT_BOTH (from, from_byte);
7731 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7732 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7733 saved_pt_byte + (coding->produced - bytes));
7734 else
7735 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7736 saved_pt_byte + (coding->produced - bytes));
7737
7738 if (need_marker_adjustment)
7739 {
7740 struct Lisp_Marker *tail;
7741
7742 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7743 if (tail->need_adjustment)
7744 {
7745 tail->need_adjustment = 0;
7746 if (tail->insertion_type)
7747 {
7748 tail->bytepos = from_byte;
7749 tail->charpos = from;
7750 }
7751 else
7752 {
7753 tail->bytepos = from_byte + coding->produced;
7754 tail->charpos
7755 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7756 ? tail->bytepos : from + coding->produced_char);
7757 }
7758 }
7759 }
7760 }
7761
7762 Vdeactivate_mark = old_deactivate_mark;
7763 unbind_to (count, coding->dst_object);
7764 }
7765
7766
7767 void
7768 encode_coding_object (struct coding_system *coding,
7769 Lisp_Object src_object,
7770 ptrdiff_t from, ptrdiff_t from_byte,
7771 ptrdiff_t to, ptrdiff_t to_byte,
7772 Lisp_Object dst_object)
7773 {
7774 ptrdiff_t count = SPECPDL_INDEX ();
7775 ptrdiff_t chars = to - from;
7776 ptrdiff_t bytes = to_byte - from_byte;
7777 Lisp_Object attrs;
7778 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7779 bool need_marker_adjustment = 0;
7780 bool kill_src_buffer = 0;
7781 Lisp_Object old_deactivate_mark;
7782
7783 old_deactivate_mark = Vdeactivate_mark;
7784
7785 coding->src_object = src_object;
7786 coding->src_chars = chars;
7787 coding->src_bytes = bytes;
7788 coding->src_multibyte = chars < bytes;
7789
7790 attrs = CODING_ID_ATTRS (coding->id);
7791
7792 if (EQ (src_object, dst_object))
7793 {
7794 struct Lisp_Marker *tail;
7795
7796 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7797 {
7798 tail->need_adjustment
7799 = tail->charpos == (tail->insertion_type ? from : to);
7800 need_marker_adjustment |= tail->need_adjustment;
7801 }
7802 }
7803
7804 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7805 {
7806 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7807 set_buffer_internal (XBUFFER (coding->src_object));
7808 if (STRINGP (src_object))
7809 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7810 else if (BUFFERP (src_object))
7811 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7812 else
7813 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7814
7815 if (EQ (src_object, dst_object))
7816 {
7817 set_buffer_internal (XBUFFER (src_object));
7818 saved_pt = PT, saved_pt_byte = PT_BYTE;
7819 del_range_both (from, from_byte, to, to_byte, 1);
7820 set_buffer_internal (XBUFFER (coding->src_object));
7821 }
7822
7823 {
7824 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7825
7826 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7827 old_deactivate_mark);
7828 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7829 make_number (BEG), make_number (Z));
7830 UNGCPRO;
7831 }
7832 if (XBUFFER (coding->src_object) != current_buffer)
7833 kill_src_buffer = 1;
7834 coding->src_object = Fcurrent_buffer ();
7835 if (BEG != GPT)
7836 move_gap_both (BEG, BEG_BYTE);
7837 coding->src_chars = Z - BEG;
7838 coding->src_bytes = Z_BYTE - BEG_BYTE;
7839 coding->src_pos = BEG;
7840 coding->src_pos_byte = BEG_BYTE;
7841 coding->src_multibyte = Z < Z_BYTE;
7842 }
7843 else if (STRINGP (src_object))
7844 {
7845 code_conversion_save (0, 0);
7846 coding->src_pos = from;
7847 coding->src_pos_byte = from_byte;
7848 }
7849 else if (BUFFERP (src_object))
7850 {
7851 code_conversion_save (0, 0);
7852 set_buffer_internal (XBUFFER (src_object));
7853 if (EQ (src_object, dst_object))
7854 {
7855 saved_pt = PT, saved_pt_byte = PT_BYTE;
7856 coding->src_object = del_range_1 (from, to, 1, 1);
7857 coding->src_pos = 0;
7858 coding->src_pos_byte = 0;
7859 }
7860 else
7861 {
7862 if (from < GPT && to >= GPT)
7863 move_gap_both (from, from_byte);
7864 coding->src_pos = from;
7865 coding->src_pos_byte = from_byte;
7866 }
7867 }
7868 else
7869 code_conversion_save (0, 0);
7870
7871 if (BUFFERP (dst_object))
7872 {
7873 coding->dst_object = dst_object;
7874 if (EQ (src_object, dst_object))
7875 {
7876 coding->dst_pos = from;
7877 coding->dst_pos_byte = from_byte;
7878 }
7879 else
7880 {
7881 struct buffer *current = current_buffer;
7882
7883 set_buffer_temp (XBUFFER (dst_object));
7884 coding->dst_pos = PT;
7885 coding->dst_pos_byte = PT_BYTE;
7886 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7887 set_buffer_temp (current);
7888 }
7889 coding->dst_multibyte
7890 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7891 }
7892 else if (EQ (dst_object, Qt))
7893 {
7894 ptrdiff_t dst_bytes = max (1, coding->src_chars);
7895 coding->dst_object = Qnil;
7896 coding->destination = xmalloc (dst_bytes);
7897 coding->dst_bytes = dst_bytes;
7898 coding->dst_multibyte = 0;
7899 }
7900 else
7901 {
7902 coding->dst_object = Qnil;
7903 coding->dst_multibyte = 0;
7904 }
7905
7906 encode_coding (coding);
7907
7908 if (EQ (dst_object, Qt))
7909 {
7910 if (BUFFERP (coding->dst_object))
7911 coding->dst_object = Fbuffer_string ();
7912 else
7913 {
7914 coding->dst_object
7915 = make_unibyte_string ((char *) coding->destination,
7916 coding->produced);
7917 xfree (coding->destination);
7918 }
7919 }
7920
7921 if (saved_pt >= 0)
7922 {
7923 /* This is the case of:
7924 (BUFFERP (src_object) && EQ (src_object, dst_object))
7925 As we have moved PT while replacing the original buffer
7926 contents, we must recover it now. */
7927 set_buffer_internal (XBUFFER (src_object));
7928 if (saved_pt < from)
7929 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7930 else if (saved_pt < from + chars)
7931 TEMP_SET_PT_BOTH (from, from_byte);
7932 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7933 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7934 saved_pt_byte + (coding->produced - bytes));
7935 else
7936 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7937 saved_pt_byte + (coding->produced - bytes));
7938
7939 if (need_marker_adjustment)
7940 {
7941 struct Lisp_Marker *tail;
7942
7943 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7944 if (tail->need_adjustment)
7945 {
7946 tail->need_adjustment = 0;
7947 if (tail->insertion_type)
7948 {
7949 tail->bytepos = from_byte;
7950 tail->charpos = from;
7951 }
7952 else
7953 {
7954 tail->bytepos = from_byte + coding->produced;
7955 tail->charpos
7956 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7957 ? tail->bytepos : from + coding->produced_char);
7958 }
7959 }
7960 }
7961 }
7962
7963 if (kill_src_buffer)
7964 Fkill_buffer (coding->src_object);
7965
7966 Vdeactivate_mark = old_deactivate_mark;
7967 unbind_to (count, Qnil);
7968 }
7969
7970
7971 Lisp_Object
7972 preferred_coding_system (void)
7973 {
7974 int id = coding_categories[coding_priorities[0]].id;
7975
7976 return CODING_ID_NAME (id);
7977 }
7978
7979 #if defined (WINDOWSNT) || defined (CYGWIN)
7980
7981 Lisp_Object
7982 from_unicode (Lisp_Object str)
7983 {
7984 CHECK_STRING (str);
7985 if (!STRING_MULTIBYTE (str) &&
7986 SBYTES (str) & 1)
7987 {
7988 str = Fsubstring (str, make_number (0), make_number (-1));
7989 }
7990
7991 return code_convert_string_norecord (str, Qutf_16le, 0);
7992 }
7993
7994 wchar_t *
7995 to_unicode (Lisp_Object str, Lisp_Object *buf)
7996 {
7997 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7998 /* We need to make a another copy (in addition to the one made by
7999 code_convert_string_norecord) to ensure that the final string is
8000 _doubly_ zero terminated --- that is, that the string is
8001 terminated by two zero bytes and one utf-16le null character.
8002 Because strings are already terminated with a single zero byte,
8003 we just add one additional zero. */
8004 str = make_uninit_string (SBYTES (*buf) + 1);
8005 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8006 SDATA (str) [SBYTES (*buf)] = '\0';
8007 *buf = str;
8008 return WCSDATA (*buf);
8009 }
8010
8011 #endif /* WINDOWSNT || CYGWIN */
8012
8013 \f
8014 #ifdef emacs
8015 /*** 8. Emacs Lisp library functions ***/
8016
8017 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8018 doc: /* Return t if OBJECT is nil or a coding-system.
8019 See the documentation of `define-coding-system' for information
8020 about coding-system objects. */)
8021 (Lisp_Object object)
8022 {
8023 if (NILP (object)
8024 || CODING_SYSTEM_ID (object) >= 0)
8025 return Qt;
8026 if (! SYMBOLP (object)
8027 || NILP (Fget (object, Qcoding_system_define_form)))
8028 return Qnil;
8029 return Qt;
8030 }
8031
8032 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8033 Sread_non_nil_coding_system, 1, 1, 0,
8034 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8035 (Lisp_Object prompt)
8036 {
8037 Lisp_Object val;
8038 do
8039 {
8040 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8041 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8042 }
8043 while (SCHARS (val) == 0);
8044 return (Fintern (val, Qnil));
8045 }
8046
8047 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8048 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8049 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8050 Ignores case when completing coding systems (all Emacs coding systems
8051 are lower-case). */)
8052 (Lisp_Object prompt, Lisp_Object default_coding_system)
8053 {
8054 Lisp_Object val;
8055 ptrdiff_t count = SPECPDL_INDEX ();
8056
8057 if (SYMBOLP (default_coding_system))
8058 default_coding_system = SYMBOL_NAME (default_coding_system);
8059 specbind (Qcompletion_ignore_case, Qt);
8060 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8061 Qt, Qnil, Qcoding_system_history,
8062 default_coding_system, Qnil);
8063 unbind_to (count, Qnil);
8064 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8065 }
8066
8067 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8068 1, 1, 0,
8069 doc: /* Check validity of CODING-SYSTEM.
8070 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8071 It is valid if it is nil or a symbol defined as a coding system by the
8072 function `define-coding-system'. */)
8073 (Lisp_Object coding_system)
8074 {
8075 Lisp_Object define_form;
8076
8077 define_form = Fget (coding_system, Qcoding_system_define_form);
8078 if (! NILP (define_form))
8079 {
8080 Fput (coding_system, Qcoding_system_define_form, Qnil);
8081 safe_eval (define_form);
8082 }
8083 if (!NILP (Fcoding_system_p (coding_system)))
8084 return coding_system;
8085 xsignal1 (Qcoding_system_error, coding_system);
8086 }
8087
8088 \f
8089 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8090 HIGHEST, return the coding system of the highest
8091 priority among the detected coding systems. Otherwise return a
8092 list of detected coding systems sorted by their priorities. If
8093 MULTIBYTEP, it is assumed that the bytes are in correct
8094 multibyte form but contains only ASCII and eight-bit chars.
8095 Otherwise, the bytes are raw bytes.
8096
8097 CODING-SYSTEM controls the detection as below:
8098
8099 If it is nil, detect both text-format and eol-format. If the
8100 text-format part of CODING-SYSTEM is already specified
8101 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8102 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8103 detect only text-format. */
8104
8105 Lisp_Object
8106 detect_coding_system (const unsigned char *src,
8107 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8108 bool highest, bool multibytep,
8109 Lisp_Object coding_system)
8110 {
8111 const unsigned char *src_end = src + src_bytes;
8112 Lisp_Object attrs, eol_type;
8113 Lisp_Object val = Qnil;
8114 struct coding_system coding;
8115 ptrdiff_t id;
8116 struct coding_detection_info detect_info;
8117 enum coding_category base_category;
8118 bool null_byte_found = 0, eight_bit_found = 0;
8119
8120 if (NILP (coding_system))
8121 coding_system = Qundecided;
8122 setup_coding_system (coding_system, &coding);
8123 attrs = CODING_ID_ATTRS (coding.id);
8124 eol_type = CODING_ID_EOL_TYPE (coding.id);
8125 coding_system = CODING_ATTR_BASE_NAME (attrs);
8126
8127 coding.source = src;
8128 coding.src_chars = src_chars;
8129 coding.src_bytes = src_bytes;
8130 coding.src_multibyte = multibytep;
8131 coding.consumed = 0;
8132 coding.mode |= CODING_MODE_LAST_BLOCK;
8133 coding.head_ascii = 0;
8134
8135 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8136
8137 /* At first, detect text-format if necessary. */
8138 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8139 if (base_category == coding_category_undecided)
8140 {
8141 enum coding_category category IF_LINT (= 0);
8142 struct coding_system *this IF_LINT (= NULL);
8143 int c, i;
8144
8145 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8146 for (; src < src_end; src++)
8147 {
8148 c = *src;
8149 if (c & 0x80)
8150 {
8151 eight_bit_found = 1;
8152 if (null_byte_found)
8153 break;
8154 }
8155 else if (c < 0x20)
8156 {
8157 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8158 && ! inhibit_iso_escape_detection
8159 && ! detect_info.checked)
8160 {
8161 if (detect_coding_iso_2022 (&coding, &detect_info))
8162 {
8163 /* We have scanned the whole data. */
8164 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8165 {
8166 /* We didn't find an 8-bit code. We may
8167 have found a null-byte, but it's very
8168 rare that a binary file confirm to
8169 ISO-2022. */
8170 src = src_end;
8171 coding.head_ascii = src - coding.source;
8172 }
8173 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8174 break;
8175 }
8176 }
8177 else if (! c && !inhibit_null_byte_detection)
8178 {
8179 null_byte_found = 1;
8180 if (eight_bit_found)
8181 break;
8182 }
8183 if (! eight_bit_found)
8184 coding.head_ascii++;
8185 }
8186 else if (! eight_bit_found)
8187 coding.head_ascii++;
8188 }
8189
8190 if (null_byte_found || eight_bit_found
8191 || coding.head_ascii < coding.src_bytes
8192 || detect_info.found)
8193 {
8194 if (coding.head_ascii == coding.src_bytes)
8195 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8196 for (i = 0; i < coding_category_raw_text; i++)
8197 {
8198 category = coding_priorities[i];
8199 this = coding_categories + category;
8200 if (detect_info.found & (1 << category))
8201 break;
8202 }
8203 else
8204 {
8205 if (null_byte_found)
8206 {
8207 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8208 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8209 }
8210 for (i = 0; i < coding_category_raw_text; i++)
8211 {
8212 category = coding_priorities[i];
8213 this = coding_categories + category;
8214
8215 if (this->id < 0)
8216 {
8217 /* No coding system of this category is defined. */
8218 detect_info.rejected |= (1 << category);
8219 }
8220 else if (category >= coding_category_raw_text)
8221 continue;
8222 else if (detect_info.checked & (1 << category))
8223 {
8224 if (highest
8225 && (detect_info.found & (1 << category)))
8226 break;
8227 }
8228 else if ((*(this->detector)) (&coding, &detect_info)
8229 && highest
8230 && (detect_info.found & (1 << category)))
8231 {
8232 if (category == coding_category_utf_16_auto)
8233 {
8234 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8235 category = coding_category_utf_16_le;
8236 else
8237 category = coding_category_utf_16_be;
8238 }
8239 break;
8240 }
8241 }
8242 }
8243 }
8244
8245 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8246 || null_byte_found)
8247 {
8248 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8249 id = CODING_SYSTEM_ID (Qno_conversion);
8250 val = Fcons (make_number (id), Qnil);
8251 }
8252 else if (! detect_info.rejected && ! detect_info.found)
8253 {
8254 detect_info.found = CATEGORY_MASK_ANY;
8255 id = coding_categories[coding_category_undecided].id;
8256 val = Fcons (make_number (id), Qnil);
8257 }
8258 else if (highest)
8259 {
8260 if (detect_info.found)
8261 {
8262 detect_info.found = 1 << category;
8263 val = Fcons (make_number (this->id), Qnil);
8264 }
8265 else
8266 for (i = 0; i < coding_category_raw_text; i++)
8267 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8268 {
8269 detect_info.found = 1 << coding_priorities[i];
8270 id = coding_categories[coding_priorities[i]].id;
8271 val = Fcons (make_number (id), Qnil);
8272 break;
8273 }
8274 }
8275 else
8276 {
8277 int mask = detect_info.rejected | detect_info.found;
8278 int found = 0;
8279
8280 for (i = coding_category_raw_text - 1; i >= 0; i--)
8281 {
8282 category = coding_priorities[i];
8283 if (! (mask & (1 << category)))
8284 {
8285 found |= 1 << category;
8286 id = coding_categories[category].id;
8287 if (id >= 0)
8288 val = Fcons (make_number (id), val);
8289 }
8290 }
8291 for (i = coding_category_raw_text - 1; i >= 0; i--)
8292 {
8293 category = coding_priorities[i];
8294 if (detect_info.found & (1 << category))
8295 {
8296 id = coding_categories[category].id;
8297 val = Fcons (make_number (id), val);
8298 }
8299 }
8300 detect_info.found |= found;
8301 }
8302 }
8303 else if (base_category == coding_category_utf_8_auto)
8304 {
8305 if (detect_coding_utf_8 (&coding, &detect_info))
8306 {
8307 struct coding_system *this;
8308
8309 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8310 this = coding_categories + coding_category_utf_8_sig;
8311 else
8312 this = coding_categories + coding_category_utf_8_nosig;
8313 val = Fcons (make_number (this->id), Qnil);
8314 }
8315 }
8316 else if (base_category == coding_category_utf_16_auto)
8317 {
8318 if (detect_coding_utf_16 (&coding, &detect_info))
8319 {
8320 struct coding_system *this;
8321
8322 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8323 this = coding_categories + coding_category_utf_16_le;
8324 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8325 this = coding_categories + coding_category_utf_16_be;
8326 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8327 this = coding_categories + coding_category_utf_16_be_nosig;
8328 else
8329 this = coding_categories + coding_category_utf_16_le_nosig;
8330 val = Fcons (make_number (this->id), Qnil);
8331 }
8332 }
8333 else
8334 {
8335 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8336 val = Fcons (make_number (coding.id), Qnil);
8337 }
8338
8339 /* Then, detect eol-format if necessary. */
8340 {
8341 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8342 Lisp_Object tail;
8343
8344 if (VECTORP (eol_type))
8345 {
8346 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8347 {
8348 if (null_byte_found)
8349 normal_eol = EOL_SEEN_LF;
8350 else
8351 normal_eol = detect_eol (coding.source, src_bytes,
8352 coding_category_raw_text);
8353 }
8354 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8355 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8356 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8357 coding_category_utf_16_be);
8358 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8359 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8360 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8361 coding_category_utf_16_le);
8362 }
8363 else
8364 {
8365 if (EQ (eol_type, Qunix))
8366 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8367 else if (EQ (eol_type, Qdos))
8368 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8369 else
8370 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8371 }
8372
8373 for (tail = val; CONSP (tail); tail = XCDR (tail))
8374 {
8375 enum coding_category category;
8376 int this_eol;
8377
8378 id = XINT (XCAR (tail));
8379 attrs = CODING_ID_ATTRS (id);
8380 category = XINT (CODING_ATTR_CATEGORY (attrs));
8381 eol_type = CODING_ID_EOL_TYPE (id);
8382 if (VECTORP (eol_type))
8383 {
8384 if (category == coding_category_utf_16_be
8385 || category == coding_category_utf_16_be_nosig)
8386 this_eol = utf_16_be_eol;
8387 else if (category == coding_category_utf_16_le
8388 || category == coding_category_utf_16_le_nosig)
8389 this_eol = utf_16_le_eol;
8390 else
8391 this_eol = normal_eol;
8392
8393 if (this_eol == EOL_SEEN_LF)
8394 XSETCAR (tail, AREF (eol_type, 0));
8395 else if (this_eol == EOL_SEEN_CRLF)
8396 XSETCAR (tail, AREF (eol_type, 1));
8397 else if (this_eol == EOL_SEEN_CR)
8398 XSETCAR (tail, AREF (eol_type, 2));
8399 else
8400 XSETCAR (tail, CODING_ID_NAME (id));
8401 }
8402 else
8403 XSETCAR (tail, CODING_ID_NAME (id));
8404 }
8405 }
8406
8407 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8408 }
8409
8410
8411 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8412 2, 3, 0,
8413 doc: /* Detect coding system of the text in the region between START and END.
8414 Return a list of possible coding systems ordered by priority.
8415 The coding systems to try and their priorities follows what
8416 the function `coding-system-priority-list' (which see) returns.
8417
8418 If only ASCII characters are found (except for such ISO-2022 control
8419 characters as ESC), it returns a list of single element `undecided'
8420 or its subsidiary coding system according to a detected end-of-line
8421 format.
8422
8423 If optional argument HIGHEST is non-nil, return the coding system of
8424 highest priority. */)
8425 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8426 {
8427 ptrdiff_t from, to;
8428 ptrdiff_t from_byte, to_byte;
8429
8430 validate_region (&start, &end);
8431 from = XINT (start), to = XINT (end);
8432 from_byte = CHAR_TO_BYTE (from);
8433 to_byte = CHAR_TO_BYTE (to);
8434
8435 if (from < GPT && to >= GPT)
8436 move_gap_both (to, to_byte);
8437
8438 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8439 to - from, to_byte - from_byte,
8440 !NILP (highest),
8441 !NILP (BVAR (current_buffer
8442 , enable_multibyte_characters)),
8443 Qnil);
8444 }
8445
8446 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8447 1, 2, 0,
8448 doc: /* Detect coding system of the text in STRING.
8449 Return a list of possible coding systems ordered by priority.
8450 The coding systems to try and their priorities follows what
8451 the function `coding-system-priority-list' (which see) returns.
8452
8453 If only ASCII characters are found (except for such ISO-2022 control
8454 characters as ESC), it returns a list of single element `undecided'
8455 or its subsidiary coding system according to a detected end-of-line
8456 format.
8457
8458 If optional argument HIGHEST is non-nil, return the coding system of
8459 highest priority. */)
8460 (Lisp_Object string, Lisp_Object highest)
8461 {
8462 CHECK_STRING (string);
8463
8464 return detect_coding_system (SDATA (string),
8465 SCHARS (string), SBYTES (string),
8466 !NILP (highest), STRING_MULTIBYTE (string),
8467 Qnil);
8468 }
8469
8470
8471 static bool
8472 char_encodable_p (int c, Lisp_Object attrs)
8473 {
8474 Lisp_Object tail;
8475 struct charset *charset;
8476 Lisp_Object translation_table;
8477
8478 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8479 if (! NILP (translation_table))
8480 c = translate_char (translation_table, c);
8481 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8482 CONSP (tail); tail = XCDR (tail))
8483 {
8484 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8485 if (CHAR_CHARSET_P (c, charset))
8486 break;
8487 }
8488 return (! NILP (tail));
8489 }
8490
8491
8492 /* Return a list of coding systems that safely encode the text between
8493 START and END. If EXCLUDE is non-nil, it is a list of coding
8494 systems not to check. The returned list doesn't contain any such
8495 coding systems. In any case, if the text contains only ASCII or is
8496 unibyte, return t. */
8497
8498 DEFUN ("find-coding-systems-region-internal",
8499 Ffind_coding_systems_region_internal,
8500 Sfind_coding_systems_region_internal, 2, 3, 0,
8501 doc: /* Internal use only. */)
8502 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8503 {
8504 Lisp_Object coding_attrs_list, safe_codings;
8505 ptrdiff_t start_byte, end_byte;
8506 const unsigned char *p, *pbeg, *pend;
8507 int c;
8508 Lisp_Object tail, elt, work_table;
8509
8510 if (STRINGP (start))
8511 {
8512 if (!STRING_MULTIBYTE (start)
8513 || SCHARS (start) == SBYTES (start))
8514 return Qt;
8515 start_byte = 0;
8516 end_byte = SBYTES (start);
8517 }
8518 else
8519 {
8520 CHECK_NUMBER_COERCE_MARKER (start);
8521 CHECK_NUMBER_COERCE_MARKER (end);
8522 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8523 args_out_of_range (start, end);
8524 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8525 return Qt;
8526 start_byte = CHAR_TO_BYTE (XINT (start));
8527 end_byte = CHAR_TO_BYTE (XINT (end));
8528 if (XINT (end) - XINT (start) == end_byte - start_byte)
8529 return Qt;
8530
8531 if (XINT (start) < GPT && XINT (end) > GPT)
8532 {
8533 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8534 move_gap_both (XINT (start), start_byte);
8535 else
8536 move_gap_both (XINT (end), end_byte);
8537 }
8538 }
8539
8540 coding_attrs_list = Qnil;
8541 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8542 if (NILP (exclude)
8543 || NILP (Fmemq (XCAR (tail), exclude)))
8544 {
8545 Lisp_Object attrs;
8546
8547 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8548 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8549 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8550 {
8551 ASET (attrs, coding_attr_trans_tbl,
8552 get_translation_table (attrs, 1, NULL));
8553 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8554 }
8555 }
8556
8557 if (STRINGP (start))
8558 p = pbeg = SDATA (start);
8559 else
8560 p = pbeg = BYTE_POS_ADDR (start_byte);
8561 pend = p + (end_byte - start_byte);
8562
8563 while (p < pend && ASCII_BYTE_P (*p)) p++;
8564 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8565
8566 work_table = Fmake_char_table (Qnil, Qnil);
8567 while (p < pend)
8568 {
8569 if (ASCII_BYTE_P (*p))
8570 p++;
8571 else
8572 {
8573 c = STRING_CHAR_ADVANCE (p);
8574 if (!NILP (char_table_ref (work_table, c)))
8575 /* This character was already checked. Ignore it. */
8576 continue;
8577
8578 charset_map_loaded = 0;
8579 for (tail = coding_attrs_list; CONSP (tail);)
8580 {
8581 elt = XCAR (tail);
8582 if (NILP (elt))
8583 tail = XCDR (tail);
8584 else if (char_encodable_p (c, elt))
8585 tail = XCDR (tail);
8586 else if (CONSP (XCDR (tail)))
8587 {
8588 XSETCAR (tail, XCAR (XCDR (tail)));
8589 XSETCDR (tail, XCDR (XCDR (tail)));
8590 }
8591 else
8592 {
8593 XSETCAR (tail, Qnil);
8594 tail = XCDR (tail);
8595 }
8596 }
8597 if (charset_map_loaded)
8598 {
8599 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8600
8601 if (STRINGP (start))
8602 pbeg = SDATA (start);
8603 else
8604 pbeg = BYTE_POS_ADDR (start_byte);
8605 p = pbeg + p_offset;
8606 pend = pbeg + pend_offset;
8607 }
8608 char_table_set (work_table, c, Qt);
8609 }
8610 }
8611
8612 safe_codings = list2 (Qraw_text, Qno_conversion);
8613 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8614 if (! NILP (XCAR (tail)))
8615 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8616
8617 return safe_codings;
8618 }
8619
8620
8621 DEFUN ("unencodable-char-position", Funencodable_char_position,
8622 Sunencodable_char_position, 3, 5, 0,
8623 doc: /*
8624 Return position of first un-encodable character in a region.
8625 START and END specify the region and CODING-SYSTEM specifies the
8626 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8627
8628 If optional 4th argument COUNT is non-nil, it specifies at most how
8629 many un-encodable characters to search. In this case, the value is a
8630 list of positions.
8631
8632 If optional 5th argument STRING is non-nil, it is a string to search
8633 for un-encodable characters. In that case, START and END are indexes
8634 to the string. */)
8635 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8636 {
8637 EMACS_INT n;
8638 struct coding_system coding;
8639 Lisp_Object attrs, charset_list, translation_table;
8640 Lisp_Object positions;
8641 ptrdiff_t from, to;
8642 const unsigned char *p, *stop, *pend;
8643 bool ascii_compatible;
8644
8645 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8646 attrs = CODING_ID_ATTRS (coding.id);
8647 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8648 return Qnil;
8649 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8650 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8651 translation_table = get_translation_table (attrs, 1, NULL);
8652
8653 if (NILP (string))
8654 {
8655 validate_region (&start, &end);
8656 from = XINT (start);
8657 to = XINT (end);
8658 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8659 || (ascii_compatible
8660 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8661 return Qnil;
8662 p = CHAR_POS_ADDR (from);
8663 pend = CHAR_POS_ADDR (to);
8664 if (from < GPT && to >= GPT)
8665 stop = GPT_ADDR;
8666 else
8667 stop = pend;
8668 }
8669 else
8670 {
8671 CHECK_STRING (string);
8672 CHECK_NATNUM (start);
8673 CHECK_NATNUM (end);
8674 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8675 args_out_of_range_3 (string, start, end);
8676 from = XINT (start);
8677 to = XINT (end);
8678 if (! STRING_MULTIBYTE (string))
8679 return Qnil;
8680 p = SDATA (string) + string_char_to_byte (string, from);
8681 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8682 if (ascii_compatible && (to - from) == (pend - p))
8683 return Qnil;
8684 }
8685
8686 if (NILP (count))
8687 n = 1;
8688 else
8689 {
8690 CHECK_NATNUM (count);
8691 n = XINT (count);
8692 }
8693
8694 positions = Qnil;
8695 charset_map_loaded = 0;
8696 while (1)
8697 {
8698 int c;
8699
8700 if (ascii_compatible)
8701 while (p < stop && ASCII_BYTE_P (*p))
8702 p++, from++;
8703 if (p >= stop)
8704 {
8705 if (p >= pend)
8706 break;
8707 stop = pend;
8708 p = GAP_END_ADDR;
8709 }
8710
8711 c = STRING_CHAR_ADVANCE (p);
8712 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8713 && ! char_charset (translate_char (translation_table, c),
8714 charset_list, NULL))
8715 {
8716 positions = Fcons (make_number (from), positions);
8717 n--;
8718 if (n == 0)
8719 break;
8720 }
8721
8722 from++;
8723 if (charset_map_loaded && NILP (string))
8724 {
8725 p = CHAR_POS_ADDR (from);
8726 pend = CHAR_POS_ADDR (to);
8727 if (from < GPT && to >= GPT)
8728 stop = GPT_ADDR;
8729 else
8730 stop = pend;
8731 charset_map_loaded = 0;
8732 }
8733 }
8734
8735 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8736 }
8737
8738
8739 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8740 Scheck_coding_systems_region, 3, 3, 0,
8741 doc: /* Check if the region is encodable by coding systems.
8742
8743 START and END are buffer positions specifying the region.
8744 CODING-SYSTEM-LIST is a list of coding systems to check.
8745
8746 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8747 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8748 whole region, POS0, POS1, ... are buffer positions where non-encodable
8749 characters are found.
8750
8751 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8752 value is nil.
8753
8754 START may be a string. In that case, check if the string is
8755 encodable, and the value contains indices to the string instead of
8756 buffer positions. END is ignored.
8757
8758 If the current buffer (or START if it is a string) is unibyte, the value
8759 is nil. */)
8760 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8761 {
8762 Lisp_Object list;
8763 ptrdiff_t start_byte, end_byte;
8764 ptrdiff_t pos;
8765 const unsigned char *p, *pbeg, *pend;
8766 int c;
8767 Lisp_Object tail, elt, attrs;
8768
8769 if (STRINGP (start))
8770 {
8771 if (!STRING_MULTIBYTE (start)
8772 || SCHARS (start) == SBYTES (start))
8773 return Qnil;
8774 start_byte = 0;
8775 end_byte = SBYTES (start);
8776 pos = 0;
8777 }
8778 else
8779 {
8780 CHECK_NUMBER_COERCE_MARKER (start);
8781 CHECK_NUMBER_COERCE_MARKER (end);
8782 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8783 args_out_of_range (start, end);
8784 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8785 return Qnil;
8786 start_byte = CHAR_TO_BYTE (XINT (start));
8787 end_byte = CHAR_TO_BYTE (XINT (end));
8788 if (XINT (end) - XINT (start) == end_byte - start_byte)
8789 return Qnil;
8790
8791 if (XINT (start) < GPT && XINT (end) > GPT)
8792 {
8793 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8794 move_gap_both (XINT (start), start_byte);
8795 else
8796 move_gap_both (XINT (end), end_byte);
8797 }
8798 pos = XINT (start);
8799 }
8800
8801 list = Qnil;
8802 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8803 {
8804 elt = XCAR (tail);
8805 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8806 ASET (attrs, coding_attr_trans_tbl,
8807 get_translation_table (attrs, 1, NULL));
8808 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8809 }
8810
8811 if (STRINGP (start))
8812 p = pbeg = SDATA (start);
8813 else
8814 p = pbeg = BYTE_POS_ADDR (start_byte);
8815 pend = p + (end_byte - start_byte);
8816
8817 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8818 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8819
8820 while (p < pend)
8821 {
8822 if (ASCII_BYTE_P (*p))
8823 p++;
8824 else
8825 {
8826 c = STRING_CHAR_ADVANCE (p);
8827
8828 charset_map_loaded = 0;
8829 for (tail = list; CONSP (tail); tail = XCDR (tail))
8830 {
8831 elt = XCDR (XCAR (tail));
8832 if (! char_encodable_p (c, XCAR (elt)))
8833 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8834 }
8835 if (charset_map_loaded)
8836 {
8837 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8838
8839 if (STRINGP (start))
8840 pbeg = SDATA (start);
8841 else
8842 pbeg = BYTE_POS_ADDR (start_byte);
8843 p = pbeg + p_offset;
8844 pend = pbeg + pend_offset;
8845 }
8846 }
8847 pos++;
8848 }
8849
8850 tail = list;
8851 list = Qnil;
8852 for (; CONSP (tail); tail = XCDR (tail))
8853 {
8854 elt = XCAR (tail);
8855 if (CONSP (XCDR (XCDR (elt))))
8856 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8857 list);
8858 }
8859
8860 return list;
8861 }
8862
8863
8864 static Lisp_Object
8865 code_convert_region (Lisp_Object start, Lisp_Object end,
8866 Lisp_Object coding_system, Lisp_Object dst_object,
8867 bool encodep, bool norecord)
8868 {
8869 struct coding_system coding;
8870 ptrdiff_t from, from_byte, to, to_byte;
8871 Lisp_Object src_object;
8872
8873 if (NILP (coding_system))
8874 coding_system = Qno_conversion;
8875 else
8876 CHECK_CODING_SYSTEM (coding_system);
8877 src_object = Fcurrent_buffer ();
8878 if (NILP (dst_object))
8879 dst_object = src_object;
8880 else if (! EQ (dst_object, Qt))
8881 CHECK_BUFFER (dst_object);
8882
8883 validate_region (&start, &end);
8884 from = XFASTINT (start);
8885 from_byte = CHAR_TO_BYTE (from);
8886 to = XFASTINT (end);
8887 to_byte = CHAR_TO_BYTE (to);
8888
8889 setup_coding_system (coding_system, &coding);
8890 coding.mode |= CODING_MODE_LAST_BLOCK;
8891
8892 if (encodep)
8893 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8894 dst_object);
8895 else
8896 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8897 dst_object);
8898 if (! norecord)
8899 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8900
8901 return (BUFFERP (dst_object)
8902 ? make_number (coding.produced_char)
8903 : coding.dst_object);
8904 }
8905
8906
8907 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8908 3, 4, "r\nzCoding system: ",
8909 doc: /* Decode the current region from the specified coding system.
8910 When called from a program, takes four arguments:
8911 START, END, CODING-SYSTEM, and DESTINATION.
8912 START and END are buffer positions.
8913
8914 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8915 If nil, the region between START and END is replaced by the decoded text.
8916 If buffer, the decoded text is inserted in that buffer after point (point
8917 does not move).
8918 In those cases, the length of the decoded text is returned.
8919 If DESTINATION is t, the decoded text is returned.
8920
8921 This function sets `last-coding-system-used' to the precise coding system
8922 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8923 not fully specified.) */)
8924 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8925 {
8926 return code_convert_region (start, end, coding_system, destination, 0, 0);
8927 }
8928
8929 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8930 3, 4, "r\nzCoding system: ",
8931 doc: /* Encode the current region by specified coding system.
8932 When called from a program, takes four arguments:
8933 START, END, CODING-SYSTEM and DESTINATION.
8934 START and END are buffer positions.
8935
8936 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8937 If nil, the region between START and END is replace by the encoded text.
8938 If buffer, the encoded text is inserted in that buffer after point (point
8939 does not move).
8940 In those cases, the length of the encoded text is returned.
8941 If DESTINATION is t, the encoded text is returned.
8942
8943 This function sets `last-coding-system-used' to the precise coding system
8944 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8945 not fully specified.) */)
8946 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8947 {
8948 return code_convert_region (start, end, coding_system, destination, 1, 0);
8949 }
8950
8951 Lisp_Object
8952 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8953 Lisp_Object dst_object, bool encodep, bool nocopy,
8954 bool norecord)
8955 {
8956 struct coding_system coding;
8957 ptrdiff_t chars, bytes;
8958
8959 CHECK_STRING (string);
8960 if (NILP (coding_system))
8961 {
8962 if (! norecord)
8963 Vlast_coding_system_used = Qno_conversion;
8964 if (NILP (dst_object))
8965 return (nocopy ? Fcopy_sequence (string) : string);
8966 }
8967
8968 if (NILP (coding_system))
8969 coding_system = Qno_conversion;
8970 else
8971 CHECK_CODING_SYSTEM (coding_system);
8972 if (NILP (dst_object))
8973 dst_object = Qt;
8974 else if (! EQ (dst_object, Qt))
8975 CHECK_BUFFER (dst_object);
8976
8977 setup_coding_system (coding_system, &coding);
8978 coding.mode |= CODING_MODE_LAST_BLOCK;
8979 chars = SCHARS (string);
8980 bytes = SBYTES (string);
8981 if (encodep)
8982 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8983 else
8984 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8985 if (! norecord)
8986 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8987
8988 return (BUFFERP (dst_object)
8989 ? make_number (coding.produced_char)
8990 : coding.dst_object);
8991 }
8992
8993
8994 /* Encode or decode STRING according to CODING_SYSTEM.
8995 Do not set Vlast_coding_system_used.
8996
8997 This function is called only from macros DECODE_FILE and
8998 ENCODE_FILE, thus we ignore character composition. */
8999
9000 Lisp_Object
9001 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9002 bool encodep)
9003 {
9004 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9005 }
9006
9007
9008 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9009 2, 4, 0,
9010 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9011
9012 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9013 if the decoding operation is trivial.
9014
9015 Optional fourth arg BUFFER non-nil means that the decoded text is
9016 inserted in that buffer after point (point does not move). In this
9017 case, the return value is the length of the decoded text.
9018
9019 This function sets `last-coding-system-used' to the precise coding system
9020 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9021 not fully specified.) */)
9022 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9023 {
9024 return code_convert_string (string, coding_system, buffer,
9025 0, ! NILP (nocopy), 0);
9026 }
9027
9028 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9029 2, 4, 0,
9030 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9031
9032 Optional third arg NOCOPY non-nil means it is OK to return STRING
9033 itself if the encoding operation is trivial.
9034
9035 Optional fourth arg BUFFER non-nil means that the encoded text is
9036 inserted in that buffer after point (point does not move). In this
9037 case, the return value is the length of the encoded text.
9038
9039 This function sets `last-coding-system-used' to the precise coding system
9040 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9041 not fully specified.) */)
9042 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9043 {
9044 return code_convert_string (string, coding_system, buffer,
9045 1, ! NILP (nocopy), 0);
9046 }
9047
9048 \f
9049 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9050 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9051 Return the corresponding character. */)
9052 (Lisp_Object code)
9053 {
9054 Lisp_Object spec, attrs, val;
9055 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9056 EMACS_INT ch;
9057 int c;
9058
9059 CHECK_NATNUM (code);
9060 ch = XFASTINT (code);
9061 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9062 attrs = AREF (spec, 0);
9063
9064 if (ASCII_BYTE_P (ch)
9065 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9066 return code;
9067
9068 val = CODING_ATTR_CHARSET_LIST (attrs);
9069 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9070 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9071 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9072
9073 if (ch <= 0x7F)
9074 {
9075 c = ch;
9076 charset = charset_roman;
9077 }
9078 else if (ch >= 0xA0 && ch < 0xDF)
9079 {
9080 c = ch - 0x80;
9081 charset = charset_kana;
9082 }
9083 else
9084 {
9085 EMACS_INT c1 = ch >> 8;
9086 int c2 = ch & 0xFF;
9087
9088 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9089 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9090 error ("Invalid code: %"pI"d", ch);
9091 c = ch;
9092 SJIS_TO_JIS (c);
9093 charset = charset_kanji;
9094 }
9095 c = DECODE_CHAR (charset, c);
9096 if (c < 0)
9097 error ("Invalid code: %"pI"d", ch);
9098 return make_number (c);
9099 }
9100
9101
9102 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9103 doc: /* Encode a Japanese character CH to shift_jis encoding.
9104 Return the corresponding code in SJIS. */)
9105 (Lisp_Object ch)
9106 {
9107 Lisp_Object spec, attrs, charset_list;
9108 int c;
9109 struct charset *charset;
9110 unsigned code;
9111
9112 CHECK_CHARACTER (ch);
9113 c = XFASTINT (ch);
9114 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9115 attrs = AREF (spec, 0);
9116
9117 if (ASCII_CHAR_P (c)
9118 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9119 return ch;
9120
9121 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9122 charset = char_charset (c, charset_list, &code);
9123 if (code == CHARSET_INVALID_CODE (charset))
9124 error ("Can't encode by shift_jis encoding: %c", c);
9125 JIS_TO_SJIS (code);
9126
9127 return make_number (code);
9128 }
9129
9130 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9131 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9132 Return the corresponding character. */)
9133 (Lisp_Object code)
9134 {
9135 Lisp_Object spec, attrs, val;
9136 struct charset *charset_roman, *charset_big5, *charset;
9137 EMACS_INT ch;
9138 int c;
9139
9140 CHECK_NATNUM (code);
9141 ch = XFASTINT (code);
9142 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9143 attrs = AREF (spec, 0);
9144
9145 if (ASCII_BYTE_P (ch)
9146 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9147 return code;
9148
9149 val = CODING_ATTR_CHARSET_LIST (attrs);
9150 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9151 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9152
9153 if (ch <= 0x7F)
9154 {
9155 c = ch;
9156 charset = charset_roman;
9157 }
9158 else
9159 {
9160 EMACS_INT b1 = ch >> 8;
9161 int b2 = ch & 0x7F;
9162 if (b1 < 0xA1 || b1 > 0xFE
9163 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9164 error ("Invalid code: %"pI"d", ch);
9165 c = ch;
9166 charset = charset_big5;
9167 }
9168 c = DECODE_CHAR (charset, c);
9169 if (c < 0)
9170 error ("Invalid code: %"pI"d", ch);
9171 return make_number (c);
9172 }
9173
9174 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9175 doc: /* Encode the Big5 character CH to BIG5 coding system.
9176 Return the corresponding character code in Big5. */)
9177 (Lisp_Object ch)
9178 {
9179 Lisp_Object spec, attrs, charset_list;
9180 struct charset *charset;
9181 int c;
9182 unsigned code;
9183
9184 CHECK_CHARACTER (ch);
9185 c = XFASTINT (ch);
9186 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9187 attrs = AREF (spec, 0);
9188 if (ASCII_CHAR_P (c)
9189 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9190 return ch;
9191
9192 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9193 charset = char_charset (c, charset_list, &code);
9194 if (code == CHARSET_INVALID_CODE (charset))
9195 error ("Can't encode by Big5 encoding: %c", c);
9196
9197 return make_number (code);
9198 }
9199
9200 \f
9201 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9202 Sset_terminal_coding_system_internal, 1, 2, 0,
9203 doc: /* Internal use only. */)
9204 (Lisp_Object coding_system, Lisp_Object terminal)
9205 {
9206 struct terminal *term = get_terminal (terminal, 1);
9207 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9208 CHECK_SYMBOL (coding_system);
9209 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9210 /* We had better not send unsafe characters to terminal. */
9211 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9212 /* Character composition should be disabled. */
9213 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9214 terminal_coding->src_multibyte = 1;
9215 terminal_coding->dst_multibyte = 0;
9216 tset_charset_list
9217 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9218 ? coding_charset_list (terminal_coding)
9219 : Fcons (make_number (charset_ascii), Qnil)));
9220 return Qnil;
9221 }
9222
9223 DEFUN ("set-safe-terminal-coding-system-internal",
9224 Fset_safe_terminal_coding_system_internal,
9225 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9226 doc: /* Internal use only. */)
9227 (Lisp_Object coding_system)
9228 {
9229 CHECK_SYMBOL (coding_system);
9230 setup_coding_system (Fcheck_coding_system (coding_system),
9231 &safe_terminal_coding);
9232 /* Character composition should be disabled. */
9233 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9234 safe_terminal_coding.src_multibyte = 1;
9235 safe_terminal_coding.dst_multibyte = 0;
9236 return Qnil;
9237 }
9238
9239 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9240 Sterminal_coding_system, 0, 1, 0,
9241 doc: /* Return coding system specified for terminal output on the given terminal.
9242 TERMINAL may be a terminal object, a frame, or nil for the selected
9243 frame's terminal device. */)
9244 (Lisp_Object terminal)
9245 {
9246 struct coding_system *terminal_coding
9247 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9248 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9249
9250 /* For backward compatibility, return nil if it is `undecided'. */
9251 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9252 }
9253
9254 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9255 Sset_keyboard_coding_system_internal, 1, 2, 0,
9256 doc: /* Internal use only. */)
9257 (Lisp_Object coding_system, Lisp_Object terminal)
9258 {
9259 struct terminal *t = get_terminal (terminal, 1);
9260 CHECK_SYMBOL (coding_system);
9261 if (NILP (coding_system))
9262 coding_system = Qno_conversion;
9263 else
9264 Fcheck_coding_system (coding_system);
9265 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9266 /* Character composition should be disabled. */
9267 TERMINAL_KEYBOARD_CODING (t)->common_flags
9268 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9269 return Qnil;
9270 }
9271
9272 DEFUN ("keyboard-coding-system",
9273 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9274 doc: /* Return coding system specified for decoding keyboard input. */)
9275 (Lisp_Object terminal)
9276 {
9277 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9278 (get_terminal (terminal, 1))->id);
9279 }
9280
9281 \f
9282 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9283 Sfind_operation_coding_system, 1, MANY, 0,
9284 doc: /* Choose a coding system for an operation based on the target name.
9285 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9286 DECODING-SYSTEM is the coding system to use for decoding
9287 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9288 for encoding (in case OPERATION does encoding).
9289
9290 The first argument OPERATION specifies an I/O primitive:
9291 For file I/O, `insert-file-contents' or `write-region'.
9292 For process I/O, `call-process', `call-process-region', or `start-process'.
9293 For network I/O, `open-network-stream'.
9294
9295 The remaining arguments should be the same arguments that were passed
9296 to the primitive. Depending on which primitive, one of those arguments
9297 is selected as the TARGET. For example, if OPERATION does file I/O,
9298 whichever argument specifies the file name is TARGET.
9299
9300 TARGET has a meaning which depends on OPERATION:
9301 For file I/O, TARGET is a file name (except for the special case below).
9302 For process I/O, TARGET is a process name.
9303 For network I/O, TARGET is a service name or a port number.
9304
9305 This function looks up what is specified for TARGET in
9306 `file-coding-system-alist', `process-coding-system-alist',
9307 or `network-coding-system-alist' depending on OPERATION.
9308 They may specify a coding system, a cons of coding systems,
9309 or a function symbol to call.
9310 In the last case, we call the function with one argument,
9311 which is a list of all the arguments given to this function.
9312 If the function can't decide a coding system, it can return
9313 `undecided' so that the normal code-detection is performed.
9314
9315 If OPERATION is `insert-file-contents', the argument corresponding to
9316 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9317 file name to look up, and BUFFER is a buffer that contains the file's
9318 contents (not yet decoded). If `file-coding-system-alist' specifies a
9319 function to call for FILENAME, that function should examine the
9320 contents of BUFFER instead of reading the file.
9321
9322 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9323 (ptrdiff_t nargs, Lisp_Object *args)
9324 {
9325 Lisp_Object operation, target_idx, target, val;
9326 register Lisp_Object chain;
9327
9328 if (nargs < 2)
9329 error ("Too few arguments");
9330 operation = args[0];
9331 if (!SYMBOLP (operation)
9332 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9333 error ("Invalid first argument");
9334 if (nargs <= 1 + XFASTINT (target_idx))
9335 error ("Too few arguments for operation `%s'",
9336 SDATA (SYMBOL_NAME (operation)));
9337 target = args[XFASTINT (target_idx) + 1];
9338 if (!(STRINGP (target)
9339 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9340 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9341 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9342 error ("Invalid argument %"pI"d of operation `%s'",
9343 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9344 if (CONSP (target))
9345 target = XCAR (target);
9346
9347 chain = ((EQ (operation, Qinsert_file_contents)
9348 || EQ (operation, Qwrite_region))
9349 ? Vfile_coding_system_alist
9350 : (EQ (operation, Qopen_network_stream)
9351 ? Vnetwork_coding_system_alist
9352 : Vprocess_coding_system_alist));
9353 if (NILP (chain))
9354 return Qnil;
9355
9356 for (; CONSP (chain); chain = XCDR (chain))
9357 {
9358 Lisp_Object elt;
9359
9360 elt = XCAR (chain);
9361 if (CONSP (elt)
9362 && ((STRINGP (target)
9363 && STRINGP (XCAR (elt))
9364 && fast_string_match (XCAR (elt), target) >= 0)
9365 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9366 {
9367 val = XCDR (elt);
9368 /* Here, if VAL is both a valid coding system and a valid
9369 function symbol, we return VAL as a coding system. */
9370 if (CONSP (val))
9371 return val;
9372 if (! SYMBOLP (val))
9373 return Qnil;
9374 if (! NILP (Fcoding_system_p (val)))
9375 return Fcons (val, val);
9376 if (! NILP (Ffboundp (val)))
9377 {
9378 /* We use call1 rather than safe_call1
9379 so as to get bug reports about functions called here
9380 which don't handle the current interface. */
9381 val = call1 (val, Flist (nargs, args));
9382 if (CONSP (val))
9383 return val;
9384 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9385 return Fcons (val, val);
9386 }
9387 return Qnil;
9388 }
9389 }
9390 return Qnil;
9391 }
9392
9393 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9394 Sset_coding_system_priority, 0, MANY, 0,
9395 doc: /* Assign higher priority to the coding systems given as arguments.
9396 If multiple coding systems belong to the same category,
9397 all but the first one are ignored.
9398
9399 usage: (set-coding-system-priority &rest coding-systems) */)
9400 (ptrdiff_t nargs, Lisp_Object *args)
9401 {
9402 ptrdiff_t i, j;
9403 bool changed[coding_category_max];
9404 enum coding_category priorities[coding_category_max];
9405
9406 memset (changed, 0, sizeof changed);
9407
9408 for (i = j = 0; i < nargs; i++)
9409 {
9410 enum coding_category category;
9411 Lisp_Object spec, attrs;
9412
9413 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9414 attrs = AREF (spec, 0);
9415 category = XINT (CODING_ATTR_CATEGORY (attrs));
9416 if (changed[category])
9417 /* Ignore this coding system because a coding system of the
9418 same category already had a higher priority. */
9419 continue;
9420 changed[category] = 1;
9421 priorities[j++] = category;
9422 if (coding_categories[category].id >= 0
9423 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9424 setup_coding_system (args[i], &coding_categories[category]);
9425 Fset (AREF (Vcoding_category_table, category), args[i]);
9426 }
9427
9428 /* Now we have decided top J priorities. Reflect the order of the
9429 original priorities to the remaining priorities. */
9430
9431 for (i = j, j = 0; i < coding_category_max; i++, j++)
9432 {
9433 while (j < coding_category_max
9434 && changed[coding_priorities[j]])
9435 j++;
9436 if (j == coding_category_max)
9437 emacs_abort ();
9438 priorities[i] = coding_priorities[j];
9439 }
9440
9441 memcpy (coding_priorities, priorities, sizeof priorities);
9442
9443 /* Update `coding-category-list'. */
9444 Vcoding_category_list = Qnil;
9445 for (i = coding_category_max; i-- > 0; )
9446 Vcoding_category_list
9447 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9448 Vcoding_category_list);
9449
9450 return Qnil;
9451 }
9452
9453 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9454 Scoding_system_priority_list, 0, 1, 0,
9455 doc: /* Return a list of coding systems ordered by their priorities.
9456 The list contains a subset of coding systems; i.e. coding systems
9457 assigned to each coding category (see `coding-category-list').
9458
9459 HIGHESTP non-nil means just return the highest priority one. */)
9460 (Lisp_Object highestp)
9461 {
9462 int i;
9463 Lisp_Object val;
9464
9465 for (i = 0, val = Qnil; i < coding_category_max; i++)
9466 {
9467 enum coding_category category = coding_priorities[i];
9468 int id = coding_categories[category].id;
9469 Lisp_Object attrs;
9470
9471 if (id < 0)
9472 continue;
9473 attrs = CODING_ID_ATTRS (id);
9474 if (! NILP (highestp))
9475 return CODING_ATTR_BASE_NAME (attrs);
9476 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9477 }
9478 return Fnreverse (val);
9479 }
9480
9481 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9482
9483 static Lisp_Object
9484 make_subsidiaries (Lisp_Object base)
9485 {
9486 Lisp_Object subsidiaries;
9487 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9488 char *buf = alloca (base_name_len + 6);
9489 int i;
9490
9491 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9492 subsidiaries = Fmake_vector (make_number (3), Qnil);
9493 for (i = 0; i < 3; i++)
9494 {
9495 strcpy (buf + base_name_len, suffixes[i]);
9496 ASET (subsidiaries, i, intern (buf));
9497 }
9498 return subsidiaries;
9499 }
9500
9501
9502 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9503 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9504 doc: /* For internal use only.
9505 usage: (define-coding-system-internal ...) */)
9506 (ptrdiff_t nargs, Lisp_Object *args)
9507 {
9508 Lisp_Object name;
9509 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9510 Lisp_Object attrs; /* Vector of attributes. */
9511 Lisp_Object eol_type;
9512 Lisp_Object aliases;
9513 Lisp_Object coding_type, charset_list, safe_charsets;
9514 enum coding_category category;
9515 Lisp_Object tail, val;
9516 int max_charset_id = 0;
9517 int i;
9518
9519 if (nargs < coding_arg_max)
9520 goto short_args;
9521
9522 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9523
9524 name = args[coding_arg_name];
9525 CHECK_SYMBOL (name);
9526 ASET (attrs, coding_attr_base_name, name);
9527
9528 val = args[coding_arg_mnemonic];
9529 if (! STRINGP (val))
9530 CHECK_CHARACTER (val);
9531 ASET (attrs, coding_attr_mnemonic, val);
9532
9533 coding_type = args[coding_arg_coding_type];
9534 CHECK_SYMBOL (coding_type);
9535 ASET (attrs, coding_attr_type, coding_type);
9536
9537 charset_list = args[coding_arg_charset_list];
9538 if (SYMBOLP (charset_list))
9539 {
9540 if (EQ (charset_list, Qiso_2022))
9541 {
9542 if (! EQ (coding_type, Qiso_2022))
9543 error ("Invalid charset-list");
9544 charset_list = Viso_2022_charset_list;
9545 }
9546 else if (EQ (charset_list, Qemacs_mule))
9547 {
9548 if (! EQ (coding_type, Qemacs_mule))
9549 error ("Invalid charset-list");
9550 charset_list = Vemacs_mule_charset_list;
9551 }
9552 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9553 {
9554 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9555 error ("Invalid charset-list");
9556 if (max_charset_id < XFASTINT (XCAR (tail)))
9557 max_charset_id = XFASTINT (XCAR (tail));
9558 }
9559 }
9560 else
9561 {
9562 charset_list = Fcopy_sequence (charset_list);
9563 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9564 {
9565 struct charset *charset;
9566
9567 val = XCAR (tail);
9568 CHECK_CHARSET_GET_CHARSET (val, charset);
9569 if (EQ (coding_type, Qiso_2022)
9570 ? CHARSET_ISO_FINAL (charset) < 0
9571 : EQ (coding_type, Qemacs_mule)
9572 ? CHARSET_EMACS_MULE_ID (charset) < 0
9573 : 0)
9574 error ("Can't handle charset `%s'",
9575 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9576
9577 XSETCAR (tail, make_number (charset->id));
9578 if (max_charset_id < charset->id)
9579 max_charset_id = charset->id;
9580 }
9581 }
9582 ASET (attrs, coding_attr_charset_list, charset_list);
9583
9584 safe_charsets = make_uninit_string (max_charset_id + 1);
9585 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9586 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9587 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9588 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9589
9590 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9591
9592 val = args[coding_arg_decode_translation_table];
9593 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9594 CHECK_SYMBOL (val);
9595 ASET (attrs, coding_attr_decode_tbl, val);
9596
9597 val = args[coding_arg_encode_translation_table];
9598 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9599 CHECK_SYMBOL (val);
9600 ASET (attrs, coding_attr_encode_tbl, val);
9601
9602 val = args[coding_arg_post_read_conversion];
9603 CHECK_SYMBOL (val);
9604 ASET (attrs, coding_attr_post_read, val);
9605
9606 val = args[coding_arg_pre_write_conversion];
9607 CHECK_SYMBOL (val);
9608 ASET (attrs, coding_attr_pre_write, val);
9609
9610 val = args[coding_arg_default_char];
9611 if (NILP (val))
9612 ASET (attrs, coding_attr_default_char, make_number (' '));
9613 else
9614 {
9615 CHECK_CHARACTER (val);
9616 ASET (attrs, coding_attr_default_char, val);
9617 }
9618
9619 val = args[coding_arg_for_unibyte];
9620 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9621
9622 val = args[coding_arg_plist];
9623 CHECK_LIST (val);
9624 ASET (attrs, coding_attr_plist, val);
9625
9626 if (EQ (coding_type, Qcharset))
9627 {
9628 /* Generate a lisp vector of 256 elements. Each element is nil,
9629 integer, or a list of charset IDs.
9630
9631 If Nth element is nil, the byte code N is invalid in this
9632 coding system.
9633
9634 If Nth element is a number NUM, N is the first byte of a
9635 charset whose ID is NUM.
9636
9637 If Nth element is a list of charset IDs, N is the first byte
9638 of one of them. The list is sorted by dimensions of the
9639 charsets. A charset of smaller dimension comes first. */
9640 val = Fmake_vector (make_number (256), Qnil);
9641
9642 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9643 {
9644 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9645 int dim = CHARSET_DIMENSION (charset);
9646 int idx = (dim - 1) * 4;
9647
9648 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9649 ASET (attrs, coding_attr_ascii_compat, Qt);
9650
9651 for (i = charset->code_space[idx];
9652 i <= charset->code_space[idx + 1]; i++)
9653 {
9654 Lisp_Object tmp, tmp2;
9655 int dim2;
9656
9657 tmp = AREF (val, i);
9658 if (NILP (tmp))
9659 tmp = XCAR (tail);
9660 else if (NUMBERP (tmp))
9661 {
9662 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9663 if (dim < dim2)
9664 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9665 else
9666 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9667 }
9668 else
9669 {
9670 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9671 {
9672 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9673 if (dim < dim2)
9674 break;
9675 }
9676 if (NILP (tmp2))
9677 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9678 else
9679 {
9680 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9681 XSETCAR (tmp2, XCAR (tail));
9682 }
9683 }
9684 ASET (val, i, tmp);
9685 }
9686 }
9687 ASET (attrs, coding_attr_charset_valids, val);
9688 category = coding_category_charset;
9689 }
9690 else if (EQ (coding_type, Qccl))
9691 {
9692 Lisp_Object valids;
9693
9694 if (nargs < coding_arg_ccl_max)
9695 goto short_args;
9696
9697 val = args[coding_arg_ccl_decoder];
9698 CHECK_CCL_PROGRAM (val);
9699 if (VECTORP (val))
9700 val = Fcopy_sequence (val);
9701 ASET (attrs, coding_attr_ccl_decoder, val);
9702
9703 val = args[coding_arg_ccl_encoder];
9704 CHECK_CCL_PROGRAM (val);
9705 if (VECTORP (val))
9706 val = Fcopy_sequence (val);
9707 ASET (attrs, coding_attr_ccl_encoder, val);
9708
9709 val = args[coding_arg_ccl_valids];
9710 valids = Fmake_string (make_number (256), make_number (0));
9711 for (tail = val; CONSP (tail); tail = XCDR (tail))
9712 {
9713 int from, to;
9714
9715 val = XCAR (tail);
9716 if (INTEGERP (val))
9717 {
9718 if (! (0 <= XINT (val) && XINT (val) <= 255))
9719 args_out_of_range_3 (val, make_number (0), make_number (255));
9720 from = to = XINT (val);
9721 }
9722 else
9723 {
9724 CHECK_CONS (val);
9725 CHECK_NATNUM_CAR (val);
9726 CHECK_NUMBER_CDR (val);
9727 if (XINT (XCAR (val)) > 255)
9728 args_out_of_range_3 (XCAR (val),
9729 make_number (0), make_number (255));
9730 from = XINT (XCAR (val));
9731 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9732 args_out_of_range_3 (XCDR (val),
9733 XCAR (val), make_number (255));
9734 to = XINT (XCDR (val));
9735 }
9736 for (i = from; i <= to; i++)
9737 SSET (valids, i, 1);
9738 }
9739 ASET (attrs, coding_attr_ccl_valids, valids);
9740
9741 category = coding_category_ccl;
9742 }
9743 else if (EQ (coding_type, Qutf_16))
9744 {
9745 Lisp_Object bom, endian;
9746
9747 ASET (attrs, coding_attr_ascii_compat, Qnil);
9748
9749 if (nargs < coding_arg_utf16_max)
9750 goto short_args;
9751
9752 bom = args[coding_arg_utf16_bom];
9753 if (! NILP (bom) && ! EQ (bom, Qt))
9754 {
9755 CHECK_CONS (bom);
9756 val = XCAR (bom);
9757 CHECK_CODING_SYSTEM (val);
9758 val = XCDR (bom);
9759 CHECK_CODING_SYSTEM (val);
9760 }
9761 ASET (attrs, coding_attr_utf_bom, bom);
9762
9763 endian = args[coding_arg_utf16_endian];
9764 CHECK_SYMBOL (endian);
9765 if (NILP (endian))
9766 endian = Qbig;
9767 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9768 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9769 ASET (attrs, coding_attr_utf_16_endian, endian);
9770
9771 category = (CONSP (bom)
9772 ? coding_category_utf_16_auto
9773 : NILP (bom)
9774 ? (EQ (endian, Qbig)
9775 ? coding_category_utf_16_be_nosig
9776 : coding_category_utf_16_le_nosig)
9777 : (EQ (endian, Qbig)
9778 ? coding_category_utf_16_be
9779 : coding_category_utf_16_le));
9780 }
9781 else if (EQ (coding_type, Qiso_2022))
9782 {
9783 Lisp_Object initial, reg_usage, request, flags;
9784
9785 if (nargs < coding_arg_iso2022_max)
9786 goto short_args;
9787
9788 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9789 CHECK_VECTOR (initial);
9790 for (i = 0; i < 4; i++)
9791 {
9792 val = Faref (initial, make_number (i));
9793 if (! NILP (val))
9794 {
9795 struct charset *charset;
9796
9797 CHECK_CHARSET_GET_CHARSET (val, charset);
9798 ASET (initial, i, make_number (CHARSET_ID (charset)));
9799 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9800 ASET (attrs, coding_attr_ascii_compat, Qt);
9801 }
9802 else
9803 ASET (initial, i, make_number (-1));
9804 }
9805
9806 reg_usage = args[coding_arg_iso2022_reg_usage];
9807 CHECK_CONS (reg_usage);
9808 CHECK_NUMBER_CAR (reg_usage);
9809 CHECK_NUMBER_CDR (reg_usage);
9810
9811 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9812 for (tail = request; CONSP (tail); tail = XCDR (tail))
9813 {
9814 int id;
9815 Lisp_Object tmp1;
9816
9817 val = XCAR (tail);
9818 CHECK_CONS (val);
9819 tmp1 = XCAR (val);
9820 CHECK_CHARSET_GET_ID (tmp1, id);
9821 CHECK_NATNUM_CDR (val);
9822 if (XINT (XCDR (val)) >= 4)
9823 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9824 XSETCAR (val, make_number (id));
9825 }
9826
9827 flags = args[coding_arg_iso2022_flags];
9828 CHECK_NATNUM (flags);
9829 i = XINT (flags) & INT_MAX;
9830 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9831 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9832 flags = make_number (i);
9833
9834 ASET (attrs, coding_attr_iso_initial, initial);
9835 ASET (attrs, coding_attr_iso_usage, reg_usage);
9836 ASET (attrs, coding_attr_iso_request, request);
9837 ASET (attrs, coding_attr_iso_flags, flags);
9838 setup_iso_safe_charsets (attrs);
9839
9840 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9841 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9842 | CODING_ISO_FLAG_SINGLE_SHIFT))
9843 ? coding_category_iso_7_else
9844 : EQ (args[coding_arg_charset_list], Qiso_2022)
9845 ? coding_category_iso_7
9846 : coding_category_iso_7_tight);
9847 else
9848 {
9849 int id = XINT (AREF (initial, 1));
9850
9851 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9852 || EQ (args[coding_arg_charset_list], Qiso_2022)
9853 || id < 0)
9854 ? coding_category_iso_8_else
9855 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9856 ? coding_category_iso_8_1
9857 : coding_category_iso_8_2);
9858 }
9859 if (category != coding_category_iso_8_1
9860 && category != coding_category_iso_8_2)
9861 ASET (attrs, coding_attr_ascii_compat, Qnil);
9862 }
9863 else if (EQ (coding_type, Qemacs_mule))
9864 {
9865 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9866 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9867 ASET (attrs, coding_attr_ascii_compat, Qt);
9868 category = coding_category_emacs_mule;
9869 }
9870 else if (EQ (coding_type, Qshift_jis))
9871 {
9872
9873 struct charset *charset;
9874
9875 if (XINT (Flength (charset_list)) != 3
9876 && XINT (Flength (charset_list)) != 4)
9877 error ("There should be three or four charsets");
9878
9879 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9880 if (CHARSET_DIMENSION (charset) != 1)
9881 error ("Dimension of charset %s is not one",
9882 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9883 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9884 ASET (attrs, coding_attr_ascii_compat, Qt);
9885
9886 charset_list = XCDR (charset_list);
9887 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9888 if (CHARSET_DIMENSION (charset) != 1)
9889 error ("Dimension of charset %s is not one",
9890 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9891
9892 charset_list = XCDR (charset_list);
9893 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9894 if (CHARSET_DIMENSION (charset) != 2)
9895 error ("Dimension of charset %s is not two",
9896 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9897
9898 charset_list = XCDR (charset_list);
9899 if (! NILP (charset_list))
9900 {
9901 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9902 if (CHARSET_DIMENSION (charset) != 2)
9903 error ("Dimension of charset %s is not two",
9904 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9905 }
9906
9907 category = coding_category_sjis;
9908 Vsjis_coding_system = name;
9909 }
9910 else if (EQ (coding_type, Qbig5))
9911 {
9912 struct charset *charset;
9913
9914 if (XINT (Flength (charset_list)) != 2)
9915 error ("There should be just two charsets");
9916
9917 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9918 if (CHARSET_DIMENSION (charset) != 1)
9919 error ("Dimension of charset %s is not one",
9920 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9921 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9922 ASET (attrs, coding_attr_ascii_compat, Qt);
9923
9924 charset_list = XCDR (charset_list);
9925 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9926 if (CHARSET_DIMENSION (charset) != 2)
9927 error ("Dimension of charset %s is not two",
9928 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9929
9930 category = coding_category_big5;
9931 Vbig5_coding_system = name;
9932 }
9933 else if (EQ (coding_type, Qraw_text))
9934 {
9935 category = coding_category_raw_text;
9936 ASET (attrs, coding_attr_ascii_compat, Qt);
9937 }
9938 else if (EQ (coding_type, Qutf_8))
9939 {
9940 Lisp_Object bom;
9941
9942 if (nargs < coding_arg_utf8_max)
9943 goto short_args;
9944
9945 bom = args[coding_arg_utf8_bom];
9946 if (! NILP (bom) && ! EQ (bom, Qt))
9947 {
9948 CHECK_CONS (bom);
9949 val = XCAR (bom);
9950 CHECK_CODING_SYSTEM (val);
9951 val = XCDR (bom);
9952 CHECK_CODING_SYSTEM (val);
9953 }
9954 ASET (attrs, coding_attr_utf_bom, bom);
9955 if (NILP (bom))
9956 ASET (attrs, coding_attr_ascii_compat, Qt);
9957
9958 category = (CONSP (bom) ? coding_category_utf_8_auto
9959 : NILP (bom) ? coding_category_utf_8_nosig
9960 : coding_category_utf_8_sig);
9961 }
9962 else if (EQ (coding_type, Qundecided))
9963 category = coding_category_undecided;
9964 else
9965 error ("Invalid coding system type: %s",
9966 SDATA (SYMBOL_NAME (coding_type)));
9967
9968 ASET (attrs, coding_attr_category, make_number (category));
9969 ASET (attrs, coding_attr_plist,
9970 Fcons (QCcategory,
9971 Fcons (AREF (Vcoding_category_table, category),
9972 CODING_ATTR_PLIST (attrs))));
9973 ASET (attrs, coding_attr_plist,
9974 Fcons (QCascii_compatible_p,
9975 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9976 CODING_ATTR_PLIST (attrs))));
9977
9978 eol_type = args[coding_arg_eol_type];
9979 if (! NILP (eol_type)
9980 && ! EQ (eol_type, Qunix)
9981 && ! EQ (eol_type, Qdos)
9982 && ! EQ (eol_type, Qmac))
9983 error ("Invalid eol-type");
9984
9985 aliases = Fcons (name, Qnil);
9986
9987 if (NILP (eol_type))
9988 {
9989 eol_type = make_subsidiaries (name);
9990 for (i = 0; i < 3; i++)
9991 {
9992 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9993
9994 this_name = AREF (eol_type, i);
9995 this_aliases = Fcons (this_name, Qnil);
9996 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9997 this_spec = Fmake_vector (make_number (3), attrs);
9998 ASET (this_spec, 1, this_aliases);
9999 ASET (this_spec, 2, this_eol_type);
10000 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10001 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10002 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10003 if (NILP (val))
10004 Vcoding_system_alist
10005 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10006 Vcoding_system_alist);
10007 }
10008 }
10009
10010 spec_vec = Fmake_vector (make_number (3), attrs);
10011 ASET (spec_vec, 1, aliases);
10012 ASET (spec_vec, 2, eol_type);
10013
10014 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10015 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10016 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10017 if (NILP (val))
10018 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10019 Vcoding_system_alist);
10020
10021 {
10022 int id = coding_categories[category].id;
10023
10024 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10025 setup_coding_system (name, &coding_categories[category]);
10026 }
10027
10028 return Qnil;
10029
10030 short_args:
10031 return Fsignal (Qwrong_number_of_arguments,
10032 Fcons (intern ("define-coding-system-internal"),
10033 make_number (nargs)));
10034 }
10035
10036
10037 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10038 3, 3, 0,
10039 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10040 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10041 {
10042 Lisp_Object spec, attrs;
10043
10044 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10045 attrs = AREF (spec, 0);
10046 if (EQ (prop, QCmnemonic))
10047 {
10048 if (! STRINGP (val))
10049 CHECK_CHARACTER (val);
10050 ASET (attrs, coding_attr_mnemonic, val);
10051 }
10052 else if (EQ (prop, QCdefault_char))
10053 {
10054 if (NILP (val))
10055 val = make_number (' ');
10056 else
10057 CHECK_CHARACTER (val);
10058 ASET (attrs, coding_attr_default_char, val);
10059 }
10060 else if (EQ (prop, QCdecode_translation_table))
10061 {
10062 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10063 CHECK_SYMBOL (val);
10064 ASET (attrs, coding_attr_decode_tbl, val);
10065 }
10066 else if (EQ (prop, QCencode_translation_table))
10067 {
10068 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10069 CHECK_SYMBOL (val);
10070 ASET (attrs, coding_attr_encode_tbl, val);
10071 }
10072 else if (EQ (prop, QCpost_read_conversion))
10073 {
10074 CHECK_SYMBOL (val);
10075 ASET (attrs, coding_attr_post_read, val);
10076 }
10077 else if (EQ (prop, QCpre_write_conversion))
10078 {
10079 CHECK_SYMBOL (val);
10080 ASET (attrs, coding_attr_pre_write, val);
10081 }
10082 else if (EQ (prop, QCascii_compatible_p))
10083 {
10084 ASET (attrs, coding_attr_ascii_compat, val);
10085 }
10086
10087 ASET (attrs, coding_attr_plist,
10088 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10089 return val;
10090 }
10091
10092
10093 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10094 Sdefine_coding_system_alias, 2, 2, 0,
10095 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10096 (Lisp_Object alias, Lisp_Object coding_system)
10097 {
10098 Lisp_Object spec, aliases, eol_type, val;
10099
10100 CHECK_SYMBOL (alias);
10101 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10102 aliases = AREF (spec, 1);
10103 /* ALIASES should be a list of length more than zero, and the first
10104 element is a base coding system. Append ALIAS at the tail of the
10105 list. */
10106 while (!NILP (XCDR (aliases)))
10107 aliases = XCDR (aliases);
10108 XSETCDR (aliases, Fcons (alias, Qnil));
10109
10110 eol_type = AREF (spec, 2);
10111 if (VECTORP (eol_type))
10112 {
10113 Lisp_Object subsidiaries;
10114 int i;
10115
10116 subsidiaries = make_subsidiaries (alias);
10117 for (i = 0; i < 3; i++)
10118 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10119 AREF (eol_type, i));
10120 }
10121
10122 Fputhash (alias, spec, Vcoding_system_hash_table);
10123 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10124 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10125 if (NILP (val))
10126 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10127 Vcoding_system_alist);
10128
10129 return Qnil;
10130 }
10131
10132 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10133 1, 1, 0,
10134 doc: /* Return the base of CODING-SYSTEM.
10135 Any alias or subsidiary coding system is not a base coding system. */)
10136 (Lisp_Object coding_system)
10137 {
10138 Lisp_Object spec, attrs;
10139
10140 if (NILP (coding_system))
10141 return (Qno_conversion);
10142 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10143 attrs = AREF (spec, 0);
10144 return CODING_ATTR_BASE_NAME (attrs);
10145 }
10146
10147 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10148 1, 1, 0,
10149 doc: "Return the property list of CODING-SYSTEM.")
10150 (Lisp_Object coding_system)
10151 {
10152 Lisp_Object spec, attrs;
10153
10154 if (NILP (coding_system))
10155 coding_system = Qno_conversion;
10156 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10157 attrs = AREF (spec, 0);
10158 return CODING_ATTR_PLIST (attrs);
10159 }
10160
10161
10162 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10163 1, 1, 0,
10164 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10165 (Lisp_Object coding_system)
10166 {
10167 Lisp_Object spec;
10168
10169 if (NILP (coding_system))
10170 coding_system = Qno_conversion;
10171 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10172 return AREF (spec, 1);
10173 }
10174
10175 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10176 Scoding_system_eol_type, 1, 1, 0,
10177 doc: /* Return eol-type of CODING-SYSTEM.
10178 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10179
10180 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10181 and CR respectively.
10182
10183 A vector value indicates that a format of end-of-line should be
10184 detected automatically. Nth element of the vector is the subsidiary
10185 coding system whose eol-type is N. */)
10186 (Lisp_Object coding_system)
10187 {
10188 Lisp_Object spec, eol_type;
10189 int n;
10190
10191 if (NILP (coding_system))
10192 coding_system = Qno_conversion;
10193 if (! CODING_SYSTEM_P (coding_system))
10194 return Qnil;
10195 spec = CODING_SYSTEM_SPEC (coding_system);
10196 eol_type = AREF (spec, 2);
10197 if (VECTORP (eol_type))
10198 return Fcopy_sequence (eol_type);
10199 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10200 return make_number (n);
10201 }
10202
10203 #endif /* emacs */
10204
10205 \f
10206 /*** 9. Post-amble ***/
10207
10208 void
10209 init_coding_once (void)
10210 {
10211 int i;
10212
10213 for (i = 0; i < coding_category_max; i++)
10214 {
10215 coding_categories[i].id = -1;
10216 coding_priorities[i] = i;
10217 }
10218
10219 /* ISO2022 specific initialize routine. */
10220 for (i = 0; i < 0x20; i++)
10221 iso_code_class[i] = ISO_control_0;
10222 for (i = 0x21; i < 0x7F; i++)
10223 iso_code_class[i] = ISO_graphic_plane_0;
10224 for (i = 0x80; i < 0xA0; i++)
10225 iso_code_class[i] = ISO_control_1;
10226 for (i = 0xA1; i < 0xFF; i++)
10227 iso_code_class[i] = ISO_graphic_plane_1;
10228 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10229 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10230 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10231 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10232 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10233 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10234 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10235 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10236 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10237
10238 for (i = 0; i < 256; i++)
10239 {
10240 emacs_mule_bytes[i] = 1;
10241 }
10242 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10243 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10244 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10245 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10246 }
10247
10248 #ifdef emacs
10249
10250 void
10251 syms_of_coding (void)
10252 {
10253 staticpro (&Vcoding_system_hash_table);
10254 {
10255 Lisp_Object args[2];
10256 args[0] = QCtest;
10257 args[1] = Qeq;
10258 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10259 }
10260
10261 staticpro (&Vsjis_coding_system);
10262 Vsjis_coding_system = Qnil;
10263
10264 staticpro (&Vbig5_coding_system);
10265 Vbig5_coding_system = Qnil;
10266
10267 staticpro (&Vcode_conversion_reused_workbuf);
10268 Vcode_conversion_reused_workbuf = Qnil;
10269
10270 staticpro (&Vcode_conversion_workbuf_name);
10271 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10272
10273 reused_workbuf_in_use = 0;
10274
10275 DEFSYM (Qcharset, "charset");
10276 DEFSYM (Qtarget_idx, "target-idx");
10277 DEFSYM (Qcoding_system_history, "coding-system-history");
10278 Fset (Qcoding_system_history, Qnil);
10279
10280 /* Target FILENAME is the first argument. */
10281 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10282 /* Target FILENAME is the third argument. */
10283 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10284
10285 DEFSYM (Qcall_process, "call-process");
10286 /* Target PROGRAM is the first argument. */
10287 Fput (Qcall_process, Qtarget_idx, make_number (0));
10288
10289 DEFSYM (Qcall_process_region, "call-process-region");
10290 /* Target PROGRAM is the third argument. */
10291 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10292
10293 DEFSYM (Qstart_process, "start-process");
10294 /* Target PROGRAM is the third argument. */
10295 Fput (Qstart_process, Qtarget_idx, make_number (2));
10296
10297 DEFSYM (Qopen_network_stream, "open-network-stream");
10298 /* Target SERVICE is the fourth argument. */
10299 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10300
10301 DEFSYM (Qcoding_system, "coding-system");
10302 DEFSYM (Qcoding_aliases, "coding-aliases");
10303
10304 DEFSYM (Qeol_type, "eol-type");
10305 DEFSYM (Qunix, "unix");
10306 DEFSYM (Qdos, "dos");
10307 DEFSYM (Qmac, "mac");
10308
10309 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10310 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10311 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10312 DEFSYM (Qdefault_char, "default-char");
10313 DEFSYM (Qundecided, "undecided");
10314 DEFSYM (Qno_conversion, "no-conversion");
10315 DEFSYM (Qraw_text, "raw-text");
10316
10317 DEFSYM (Qiso_2022, "iso-2022");
10318
10319 DEFSYM (Qutf_8, "utf-8");
10320 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10321
10322 #if defined (WINDOWSNT) || defined (CYGWIN)
10323 /* No, not utf-16-le: that one has a BOM. */
10324 DEFSYM (Qutf_16le, "utf-16le");
10325 #endif
10326
10327 DEFSYM (Qutf_16, "utf-16");
10328 DEFSYM (Qbig, "big");
10329 DEFSYM (Qlittle, "little");
10330
10331 DEFSYM (Qshift_jis, "shift-jis");
10332 DEFSYM (Qbig5, "big5");
10333
10334 DEFSYM (Qcoding_system_p, "coding-system-p");
10335
10336 DEFSYM (Qcoding_system_error, "coding-system-error");
10337 Fput (Qcoding_system_error, Qerror_conditions,
10338 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10339 Fput (Qcoding_system_error, Qerror_message,
10340 build_pure_c_string ("Invalid coding system"));
10341
10342 /* Intern this now in case it isn't already done.
10343 Setting this variable twice is harmless.
10344 But don't staticpro it here--that is done in alloc.c. */
10345 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10346
10347 DEFSYM (Qtranslation_table, "translation-table");
10348 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10349 DEFSYM (Qtranslation_table_id, "translation-table-id");
10350 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10351 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10352
10353 DEFSYM (Qvalid_codes, "valid-codes");
10354
10355 DEFSYM (Qemacs_mule, "emacs-mule");
10356
10357 DEFSYM (QCcategory, ":category");
10358 DEFSYM (QCmnemonic, ":mnemonic");
10359 DEFSYM (QCdefault_char, ":default-char");
10360 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10361 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10362 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10363 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10364 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10365
10366 Vcoding_category_table
10367 = Fmake_vector (make_number (coding_category_max), Qnil);
10368 staticpro (&Vcoding_category_table);
10369 /* Followings are target of code detection. */
10370 ASET (Vcoding_category_table, coding_category_iso_7,
10371 intern_c_string ("coding-category-iso-7"));
10372 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10373 intern_c_string ("coding-category-iso-7-tight"));
10374 ASET (Vcoding_category_table, coding_category_iso_8_1,
10375 intern_c_string ("coding-category-iso-8-1"));
10376 ASET (Vcoding_category_table, coding_category_iso_8_2,
10377 intern_c_string ("coding-category-iso-8-2"));
10378 ASET (Vcoding_category_table, coding_category_iso_7_else,
10379 intern_c_string ("coding-category-iso-7-else"));
10380 ASET (Vcoding_category_table, coding_category_iso_8_else,
10381 intern_c_string ("coding-category-iso-8-else"));
10382 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10383 intern_c_string ("coding-category-utf-8-auto"));
10384 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10385 intern_c_string ("coding-category-utf-8"));
10386 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10387 intern_c_string ("coding-category-utf-8-sig"));
10388 ASET (Vcoding_category_table, coding_category_utf_16_be,
10389 intern_c_string ("coding-category-utf-16-be"));
10390 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10391 intern_c_string ("coding-category-utf-16-auto"));
10392 ASET (Vcoding_category_table, coding_category_utf_16_le,
10393 intern_c_string ("coding-category-utf-16-le"));
10394 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10395 intern_c_string ("coding-category-utf-16-be-nosig"));
10396 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10397 intern_c_string ("coding-category-utf-16-le-nosig"));
10398 ASET (Vcoding_category_table, coding_category_charset,
10399 intern_c_string ("coding-category-charset"));
10400 ASET (Vcoding_category_table, coding_category_sjis,
10401 intern_c_string ("coding-category-sjis"));
10402 ASET (Vcoding_category_table, coding_category_big5,
10403 intern_c_string ("coding-category-big5"));
10404 ASET (Vcoding_category_table, coding_category_ccl,
10405 intern_c_string ("coding-category-ccl"));
10406 ASET (Vcoding_category_table, coding_category_emacs_mule,
10407 intern_c_string ("coding-category-emacs-mule"));
10408 /* Followings are NOT target of code detection. */
10409 ASET (Vcoding_category_table, coding_category_raw_text,
10410 intern_c_string ("coding-category-raw-text"));
10411 ASET (Vcoding_category_table, coding_category_undecided,
10412 intern_c_string ("coding-category-undecided"));
10413
10414 DEFSYM (Qinsufficient_source, "insufficient-source");
10415 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10416 DEFSYM (Qinvalid_source, "invalid-source");
10417 DEFSYM (Qinterrupted, "interrupted");
10418 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10419 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10420
10421 defsubr (&Scoding_system_p);
10422 defsubr (&Sread_coding_system);
10423 defsubr (&Sread_non_nil_coding_system);
10424 defsubr (&Scheck_coding_system);
10425 defsubr (&Sdetect_coding_region);
10426 defsubr (&Sdetect_coding_string);
10427 defsubr (&Sfind_coding_systems_region_internal);
10428 defsubr (&Sunencodable_char_position);
10429 defsubr (&Scheck_coding_systems_region);
10430 defsubr (&Sdecode_coding_region);
10431 defsubr (&Sencode_coding_region);
10432 defsubr (&Sdecode_coding_string);
10433 defsubr (&Sencode_coding_string);
10434 defsubr (&Sdecode_sjis_char);
10435 defsubr (&Sencode_sjis_char);
10436 defsubr (&Sdecode_big5_char);
10437 defsubr (&Sencode_big5_char);
10438 defsubr (&Sset_terminal_coding_system_internal);
10439 defsubr (&Sset_safe_terminal_coding_system_internal);
10440 defsubr (&Sterminal_coding_system);
10441 defsubr (&Sset_keyboard_coding_system_internal);
10442 defsubr (&Skeyboard_coding_system);
10443 defsubr (&Sfind_operation_coding_system);
10444 defsubr (&Sset_coding_system_priority);
10445 defsubr (&Sdefine_coding_system_internal);
10446 defsubr (&Sdefine_coding_system_alias);
10447 defsubr (&Scoding_system_put);
10448 defsubr (&Scoding_system_base);
10449 defsubr (&Scoding_system_plist);
10450 defsubr (&Scoding_system_aliases);
10451 defsubr (&Scoding_system_eol_type);
10452 defsubr (&Scoding_system_priority_list);
10453
10454 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10455 doc: /* List of coding systems.
10456
10457 Do not alter the value of this variable manually. This variable should be
10458 updated by the functions `define-coding-system' and
10459 `define-coding-system-alias'. */);
10460 Vcoding_system_list = Qnil;
10461
10462 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10463 doc: /* Alist of coding system names.
10464 Each element is one element list of coding system name.
10465 This variable is given to `completing-read' as COLLECTION argument.
10466
10467 Do not alter the value of this variable manually. This variable should be
10468 updated by the functions `make-coding-system' and
10469 `define-coding-system-alias'. */);
10470 Vcoding_system_alist = Qnil;
10471
10472 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10473 doc: /* List of coding-categories (symbols) ordered by priority.
10474
10475 On detecting a coding system, Emacs tries code detection algorithms
10476 associated with each coding-category one by one in this order. When
10477 one algorithm agrees with a byte sequence of source text, the coding
10478 system bound to the corresponding coding-category is selected.
10479
10480 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10481 {
10482 int i;
10483
10484 Vcoding_category_list = Qnil;
10485 for (i = coding_category_max - 1; i >= 0; i--)
10486 Vcoding_category_list
10487 = Fcons (AREF (Vcoding_category_table, i),
10488 Vcoding_category_list);
10489 }
10490
10491 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10492 doc: /* Specify the coding system for read operations.
10493 It is useful to bind this variable with `let', but do not set it globally.
10494 If the value is a coding system, it is used for decoding on read operation.
10495 If not, an appropriate element is used from one of the coding system alists.
10496 There are three such tables: `file-coding-system-alist',
10497 `process-coding-system-alist', and `network-coding-system-alist'. */);
10498 Vcoding_system_for_read = Qnil;
10499
10500 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10501 doc: /* Specify the coding system for write operations.
10502 Programs bind this variable with `let', but you should not set it globally.
10503 If the value is a coding system, it is used for encoding of output,
10504 when writing it to a file and when sending it to a file or subprocess.
10505
10506 If this does not specify a coding system, an appropriate element
10507 is used from one of the coding system alists.
10508 There are three such tables: `file-coding-system-alist',
10509 `process-coding-system-alist', and `network-coding-system-alist'.
10510 For output to files, if the above procedure does not specify a coding system,
10511 the value of `buffer-file-coding-system' is used. */);
10512 Vcoding_system_for_write = Qnil;
10513
10514 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10515 doc: /*
10516 Coding system used in the latest file or process I/O. */);
10517 Vlast_coding_system_used = Qnil;
10518
10519 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10520 doc: /*
10521 Error status of the last code conversion.
10522
10523 When an error was detected in the last code conversion, this variable
10524 is set to one of the following symbols.
10525 `insufficient-source'
10526 `inconsistent-eol'
10527 `invalid-source'
10528 `interrupted'
10529 `insufficient-memory'
10530 When no error was detected, the value doesn't change. So, to check
10531 the error status of a code conversion by this variable, you must
10532 explicitly set this variable to nil before performing code
10533 conversion. */);
10534 Vlast_code_conversion_error = Qnil;
10535
10536 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10537 doc: /*
10538 *Non-nil means always inhibit code conversion of end-of-line format.
10539 See info node `Coding Systems' and info node `Text and Binary' concerning
10540 such conversion. */);
10541 inhibit_eol_conversion = 0;
10542
10543 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10544 doc: /*
10545 Non-nil means process buffer inherits coding system of process output.
10546 Bind it to t if the process output is to be treated as if it were a file
10547 read from some filesystem. */);
10548 inherit_process_coding_system = 0;
10549
10550 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10551 doc: /*
10552 Alist to decide a coding system to use for a file I/O operation.
10553 The format is ((PATTERN . VAL) ...),
10554 where PATTERN is a regular expression matching a file name,
10555 VAL is a coding system, a cons of coding systems, or a function symbol.
10556 If VAL is a coding system, it is used for both decoding and encoding
10557 the file contents.
10558 If VAL is a cons of coding systems, the car part is used for decoding,
10559 and the cdr part is used for encoding.
10560 If VAL is a function symbol, the function must return a coding system
10561 or a cons of coding systems which are used as above. The function is
10562 called with an argument that is a list of the arguments with which
10563 `find-operation-coding-system' was called. If the function can't decide
10564 a coding system, it can return `undecided' so that the normal
10565 code-detection is performed.
10566
10567 See also the function `find-operation-coding-system'
10568 and the variable `auto-coding-alist'. */);
10569 Vfile_coding_system_alist = Qnil;
10570
10571 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10572 doc: /*
10573 Alist to decide a coding system to use for a process I/O operation.
10574 The format is ((PATTERN . VAL) ...),
10575 where PATTERN is a regular expression matching a program name,
10576 VAL is a coding system, a cons of coding systems, or a function symbol.
10577 If VAL is a coding system, it is used for both decoding what received
10578 from the program and encoding what sent to the program.
10579 If VAL is a cons of coding systems, the car part is used for decoding,
10580 and the cdr part is used for encoding.
10581 If VAL is a function symbol, the function must return a coding system
10582 or a cons of coding systems which are used as above.
10583
10584 See also the function `find-operation-coding-system'. */);
10585 Vprocess_coding_system_alist = Qnil;
10586
10587 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10588 doc: /*
10589 Alist to decide a coding system to use for a network I/O operation.
10590 The format is ((PATTERN . VAL) ...),
10591 where PATTERN is a regular expression matching a network service name
10592 or is a port number to connect to,
10593 VAL is a coding system, a cons of coding systems, or a function symbol.
10594 If VAL is a coding system, it is used for both decoding what received
10595 from the network stream and encoding what sent to the network stream.
10596 If VAL is a cons of coding systems, the car part is used for decoding,
10597 and the cdr part is used for encoding.
10598 If VAL is a function symbol, the function must return a coding system
10599 or a cons of coding systems which are used as above.
10600
10601 See also the function `find-operation-coding-system'. */);
10602 Vnetwork_coding_system_alist = Qnil;
10603
10604 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10605 doc: /* Coding system to use with system messages.
10606 Also used for decoding keyboard input on X Window system. */);
10607 Vlocale_coding_system = Qnil;
10608
10609 /* The eol mnemonics are reset in startup.el system-dependently. */
10610 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10611 doc: /*
10612 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10613 eol_mnemonic_unix = build_pure_c_string (":");
10614
10615 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10616 doc: /*
10617 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10618 eol_mnemonic_dos = build_pure_c_string ("\\");
10619
10620 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10621 doc: /*
10622 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10623 eol_mnemonic_mac = build_pure_c_string ("/");
10624
10625 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10626 doc: /*
10627 *String displayed in mode line when end-of-line format is not yet determined. */);
10628 eol_mnemonic_undecided = build_pure_c_string (":");
10629
10630 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10631 doc: /*
10632 *Non-nil enables character translation while encoding and decoding. */);
10633 Venable_character_translation = Qt;
10634
10635 DEFVAR_LISP ("standard-translation-table-for-decode",
10636 Vstandard_translation_table_for_decode,
10637 doc: /* Table for translating characters while decoding. */);
10638 Vstandard_translation_table_for_decode = Qnil;
10639
10640 DEFVAR_LISP ("standard-translation-table-for-encode",
10641 Vstandard_translation_table_for_encode,
10642 doc: /* Table for translating characters while encoding. */);
10643 Vstandard_translation_table_for_encode = Qnil;
10644
10645 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10646 doc: /* Alist of charsets vs revision numbers.
10647 While encoding, if a charset (car part of an element) is found,
10648 designate it with the escape sequence identifying revision (cdr part
10649 of the element). */);
10650 Vcharset_revision_table = Qnil;
10651
10652 DEFVAR_LISP ("default-process-coding-system",
10653 Vdefault_process_coding_system,
10654 doc: /* Cons of coding systems used for process I/O by default.
10655 The car part is used for decoding a process output,
10656 the cdr part is used for encoding a text to be sent to a process. */);
10657 Vdefault_process_coding_system = Qnil;
10658
10659 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10660 doc: /*
10661 Table of extra Latin codes in the range 128..159 (inclusive).
10662 This is a vector of length 256.
10663 If Nth element is non-nil, the existence of code N in a file
10664 \(or output of subprocess) doesn't prevent it to be detected as
10665 a coding system of ISO 2022 variant which has a flag
10666 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10667 or reading output of a subprocess.
10668 Only 128th through 159th elements have a meaning. */);
10669 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10670
10671 DEFVAR_LISP ("select-safe-coding-system-function",
10672 Vselect_safe_coding_system_function,
10673 doc: /*
10674 Function to call to select safe coding system for encoding a text.
10675
10676 If set, this function is called to force a user to select a proper
10677 coding system which can encode the text in the case that a default
10678 coding system used in each operation can't encode the text. The
10679 function should take care that the buffer is not modified while
10680 the coding system is being selected.
10681
10682 The default value is `select-safe-coding-system' (which see). */);
10683 Vselect_safe_coding_system_function = Qnil;
10684
10685 DEFVAR_BOOL ("coding-system-require-warning",
10686 coding_system_require_warning,
10687 doc: /* Internal use only.
10688 If non-nil, on writing a file, `select-safe-coding-system-function' is
10689 called even if `coding-system-for-write' is non-nil. The command
10690 `universal-coding-system-argument' binds this variable to t temporarily. */);
10691 coding_system_require_warning = 0;
10692
10693
10694 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10695 inhibit_iso_escape_detection,
10696 doc: /*
10697 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10698
10699 When Emacs reads text, it tries to detect how the text is encoded.
10700 This code detection is sensitive to escape sequences. If Emacs sees
10701 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10702 of the ISO2022 encodings, and decodes text by the corresponding coding
10703 system (e.g. `iso-2022-7bit').
10704
10705 However, there may be a case that you want to read escape sequences in
10706 a file as is. In such a case, you can set this variable to non-nil.
10707 Then the code detection will ignore any escape sequences, and no text is
10708 detected as encoded in some ISO-2022 encoding. The result is that all
10709 escape sequences become visible in a buffer.
10710
10711 The default value is nil, and it is strongly recommended not to change
10712 it. That is because many Emacs Lisp source files that contain
10713 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10714 in Emacs's distribution, and they won't be decoded correctly on
10715 reading if you suppress escape sequence detection.
10716
10717 The other way to read escape sequences in a file without decoding is
10718 to explicitly specify some coding system that doesn't use ISO-2022
10719 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10720 inhibit_iso_escape_detection = 0;
10721
10722 DEFVAR_BOOL ("inhibit-null-byte-detection",
10723 inhibit_null_byte_detection,
10724 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10725 By default, Emacs treats it as binary data, and does not attempt to
10726 decode it. The effect is as if you specified `no-conversion' for
10727 reading that text.
10728
10729 Set this to non-nil when a regular text happens to include null bytes.
10730 Examples are Index nodes of Info files and null-byte delimited output
10731 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10732 decode text as usual. */);
10733 inhibit_null_byte_detection = 0;
10734
10735 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10736 doc: /* Char table for translating self-inserting characters.
10737 This is applied to the result of input methods, not their input.
10738 See also `keyboard-translate-table'.
10739
10740 Use of this variable for character code unification was rendered
10741 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10742 internal character representation. */);
10743 Vtranslation_table_for_input = Qnil;
10744
10745 {
10746 Lisp_Object args[coding_arg_max];
10747 Lisp_Object plist[16];
10748 int i;
10749
10750 for (i = 0; i < coding_arg_max; i++)
10751 args[i] = Qnil;
10752
10753 plist[0] = intern_c_string (":name");
10754 plist[1] = args[coding_arg_name] = Qno_conversion;
10755 plist[2] = intern_c_string (":mnemonic");
10756 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10757 plist[4] = intern_c_string (":coding-type");
10758 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10759 plist[6] = intern_c_string (":ascii-compatible-p");
10760 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10761 plist[8] = intern_c_string (":default-char");
10762 plist[9] = args[coding_arg_default_char] = make_number (0);
10763 plist[10] = intern_c_string (":for-unibyte");
10764 plist[11] = args[coding_arg_for_unibyte] = Qt;
10765 plist[12] = intern_c_string (":docstring");
10766 plist[13] = build_pure_c_string ("Do no conversion.\n\
10767 \n\
10768 When you visit a file with this coding, the file is read into a\n\
10769 unibyte buffer as is, thus each byte of a file is treated as a\n\
10770 character.");
10771 plist[14] = intern_c_string (":eol-type");
10772 plist[15] = args[coding_arg_eol_type] = Qunix;
10773 args[coding_arg_plist] = Flist (16, plist);
10774 Fdefine_coding_system_internal (coding_arg_max, args);
10775
10776 plist[1] = args[coding_arg_name] = Qundecided;
10777 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10778 plist[5] = args[coding_arg_coding_type] = Qundecided;
10779 /* This is already set.
10780 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10781 plist[8] = intern_c_string (":charset-list");
10782 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10783 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10784 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10785 plist[15] = args[coding_arg_eol_type] = Qnil;
10786 args[coding_arg_plist] = Flist (16, plist);
10787 Fdefine_coding_system_internal (coding_arg_max, args);
10788 }
10789
10790 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10791
10792 {
10793 int i;
10794
10795 for (i = 0; i < coding_category_max; i++)
10796 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10797 }
10798 #if defined (DOS_NT)
10799 system_eol_type = Qdos;
10800 #else
10801 system_eol_type = Qunix;
10802 #endif
10803 staticpro (&system_eol_type);
10804 }
10805
10806 char *
10807 emacs_strerror (int error_number)
10808 {
10809 char *str;
10810
10811 synchronize_system_messages_locale ();
10812 str = strerror (error_number);
10813
10814 if (! NILP (Vlocale_coding_system))
10815 {
10816 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10817 Vlocale_coding_system,
10818 0);
10819 str = SSDATA (dec);
10820 }
10821
10822 return str;
10823 }
10824
10825 #endif /* emacs */