]> code.delx.au - gnu-emacs/blob - src/coding.c
*** empty log message ***
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 Lisp_Object Vselect_safe_coding_system_function;
322
323 /* Mnemonic string for each format of end-of-line. */
324 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
325 /* Mnemonic string to indicate format of end-of-line is not yet
326 decided. */
327 Lisp_Object eol_mnemonic_undecided;
328
329 #ifdef emacs
330
331 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
332
333 Lisp_Object Qcoding_system_p, Qcoding_system_error;
334
335 /* Coding system emacs-mule and raw-text are for converting only
336 end-of-line format. */
337 Lisp_Object Qemacs_mule, Qraw_text;
338
339 /* Coding-systems are handed between Emacs Lisp programs and C internal
340 routines by the following three variables. */
341 /* Coding-system for reading files and receiving data from process. */
342 Lisp_Object Vcoding_system_for_read;
343 /* Coding-system for writing files and sending data to process. */
344 Lisp_Object Vcoding_system_for_write;
345 /* Coding-system actually used in the latest I/O. */
346 Lisp_Object Vlast_coding_system_used;
347
348 /* A vector of length 256 which contains information about special
349 Latin codes (especially for dealing with Microsoft codes). */
350 Lisp_Object Vlatin_extra_code_table;
351
352 /* Flag to inhibit code conversion of end-of-line format. */
353 int inhibit_eol_conversion;
354
355 /* Flag to inhibit ISO2022 escape sequence detection. */
356 int inhibit_iso_escape_detection;
357
358 /* Flag to make buffer-file-coding-system inherit from process-coding. */
359 int inherit_process_coding_system;
360
361 /* Coding system to be used to encode text for terminal display. */
362 struct coding_system terminal_coding;
363
364 /* Coding system to be used to encode text for terminal display when
365 terminal coding system is nil. */
366 struct coding_system safe_terminal_coding;
367
368 /* Coding system of what is sent from terminal keyboard. */
369 struct coding_system keyboard_coding;
370
371 Lisp_Object Vfile_coding_system_alist;
372 Lisp_Object Vprocess_coding_system_alist;
373 Lisp_Object Vnetwork_coding_system_alist;
374
375 Lisp_Object Vlocale_coding_system;
376
377 #endif /* emacs */
378
379 /* Flag to tell if we look up translation table on character code
380 conversion. */
381 Lisp_Object Venable_character_translation;
382 /* Standard translation table to look up on decoding (reading). */
383 Lisp_Object Vstandard_translation_table_for_decode;
384 /* Standard translation table to look up on encoding (writing). */
385 Lisp_Object Vstandard_translation_table_for_encode;
386
387 Lisp_Object Qtranslation_table;
388 Lisp_Object Qtranslation_table_id;
389 Lisp_Object Qtranslation_table_for_decode;
390 Lisp_Object Qtranslation_table_for_encode;
391
392 /* Alist of charsets vs revision number. */
393 static Lisp_Object Vcharset_revision_table;
394
395 /* Default coding systems used for process I/O. */
396 Lisp_Object Vdefault_process_coding_system;
397
398 /* Global flag to tell that we can't call post-read-conversion and
399 pre-write-conversion functions. Usually the value is zero, but it
400 is set to 1 temporarily while such functions are running. This is
401 to avoid infinite recursive call. */
402 static int inhibit_pre_post_conversion;
403
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system;
406 Lisp_Object Vbig5_coding_system;
407
408
409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
411 static void decode_coding_utf_8 P_ ((struct coding_system *));
412 static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
423
424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
428
429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_big5 P_ ((struct coding_system *));
437 static int encode_coding_big5 P_ ((struct coding_system *));
438
439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_ccl P_ ((struct coding_system *));
442 static int encode_coding_ccl P_ ((struct coding_system *));
443
444 static void decode_coding_raw_text P_ ((struct coding_system *));
445 static int encode_coding_raw_text P_ ((struct coding_system *));
446
447
448 /* ISO2022 section */
449
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
453 reg)))
454
455
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
459 : -1))
460
461
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486
487 /* All code (1-byte) of ISO2022 is classified into one of the
488 followings. */
489 enum iso_code_class_type
490 {
491 ISO_control_0, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape, /* ISO_CODE_SO (0x1B) */
499 ISO_control_1, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
509 };
510
511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
513
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517
518 /* If set, reset graphic planes and registers at end-of-line to the
519 initial state. */
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
541
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
544
545 /* If set, assume designation states are reset at beginning of line on
546 output. */
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548
549 /* If set, designation sequence should be placed at beginning of line
550 on output. */
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552
553 /* If set, do not encode unsafe charactes on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
555
556 /* If set, extra latin codes (128..159) are accepted as a valid code
557 on input. */
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
573
574
575 /* UTF-16 section */
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
578
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
581
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
584
585
586 /* CCL section */
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
593 ->data)
594
595 /* Index for each coding category in `coding_categories' */
596
597 enum coding_category
598 {
599 coding_category_iso_7,
600 coding_category_iso_7_tight,
601 coding_category_iso_8_1,
602 coding_category_iso_8_2,
603 coding_category_iso_7_else,
604 coding_category_iso_8_else,
605 coding_category_utf_8,
606 coding_category_utf_16_auto,
607 coding_category_utf_16_be,
608 coding_category_utf_16_le,
609 coding_category_utf_16_be_nosig,
610 coding_category_utf_16_le_nosig,
611 coding_category_charset,
612 coding_category_sjis,
613 coding_category_big5,
614 coding_category_ccl,
615 coding_category_emacs_mule,
616 /* All above are targets of code detection. */
617 coding_category_raw_text,
618 coding_category_undecided,
619 coding_category_max
620 };
621
622 /* Definitions of flag bits used in detect_coding_XXXX. */
623 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
624 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
625 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
626 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
627 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
628 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
629 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
630 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
631 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
632 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
633 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
634 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
635 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
636 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
637 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
638 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
639 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
640 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
641
642 /* This value is returned if detect_coding_mask () find nothing other
643 than ASCII characters. */
644 #define CATEGORY_MASK_ANY \
645 (CATEGORY_MASK_ISO_7 \
646 | CATEGORY_MASK_ISO_7_TIGHT \
647 | CATEGORY_MASK_ISO_8_1 \
648 | CATEGORY_MASK_ISO_8_2 \
649 | CATEGORY_MASK_ISO_7_ELSE \
650 | CATEGORY_MASK_ISO_8_ELSE \
651 | CATEGORY_MASK_UTF_8 \
652 | CATEGORY_MASK_UTF_16_BE \
653 | CATEGORY_MASK_UTF_16_LE \
654 | CATEGORY_MASK_UTF_16_BE_NOSIG \
655 | CATEGORY_MASK_UTF_16_LE_NOSIG \
656 | CATEGORY_MASK_CHARSET \
657 | CATEGORY_MASK_SJIS \
658 | CATEGORY_MASK_BIG5 \
659 | CATEGORY_MASK_CCL \
660 | CATEGORY_MASK_EMACS_MULE)
661
662
663 #define CATEGORY_MASK_ISO_7BIT \
664 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
665
666 #define CATEGORY_MASK_ISO_8BIT \
667 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
668
669 #define CATEGORY_MASK_ISO_ELSE \
670 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
671
672 #define CATEGORY_MASK_ISO_ESCAPE \
673 (CATEGORY_MASK_ISO_7 \
674 | CATEGORY_MASK_ISO_7_TIGHT \
675 | CATEGORY_MASK_ISO_7_ELSE \
676 | CATEGORY_MASK_ISO_8_ELSE)
677
678 #define CATEGORY_MASK_ISO \
679 ( CATEGORY_MASK_ISO_7BIT \
680 | CATEGORY_MASK_ISO_8BIT \
681 | CATEGORY_MASK_ISO_ELSE)
682
683 #define CATEGORY_MASK_UTF_16 \
684 (CATEGORY_MASK_UTF_16_BE \
685 | CATEGORY_MASK_UTF_16_LE \
686 | CATEGORY_MASK_UTF_16_BE_NOSIG \
687 | CATEGORY_MASK_UTF_16_LE_NOSIG)
688
689
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list;
693
694 /* Table of coding categories (Lisp symbols). This variable is for
695 internal use oly. */
696 static Lisp_Object Vcoding_category_table;
697
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities[coding_category_max];
700
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories[coding_category_max];
704
705 /*** Commonly used macros and functions ***/
706
707 #ifndef min
708 #define min(a, b) ((a) < (b) ? (a) : (b))
709 #endif
710 #ifndef max
711 #define max(a, b) ((a) > (b) ? (a) : (b))
712 #endif
713
714 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
715 do { \
716 attrs = CODING_ID_ATTRS (coding->id); \
717 eol_type = CODING_ID_EOL_TYPE (coding->id); \
718 if (VECTORP (eol_type)) \
719 eol_type = Qunix; \
720 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
721 } while (0)
722
723
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. The caller
727 should declare and set these variables appropriately in advance:
728 src, src_end, multibytep
729 */
730
731 #define ONE_MORE_BYTE(c) \
732 do { \
733 if (src == src_end) \
734 { \
735 if (src_base < src) \
736 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
737 goto no_more_source; \
738 } \
739 c = *src++; \
740 if (multibytep && (c & 0x80)) \
741 { \
742 if ((c & 0xFE) != 0xC0) \
743 error ("Undecodable char found"); \
744 c = ((c & 1) << 6) | *src++; \
745 } \
746 consumed_chars++; \
747 } while (0)
748
749
750 #define ONE_MORE_BYTE_NO_CHECK(c) \
751 do { \
752 c = *src++; \
753 if (multibytep && (c & 0x80)) \
754 { \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
758 } \
759 consumed_chars++; \
760 } while (0)
761
762
763 /* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
767 */
768
769
770 #define EMIT_ONE_ASCII_BYTE(c) \
771 do { \
772 produced_chars++; \
773 *dst++ = (c); \
774 } while (0)
775
776
777 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
778
779 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 do { \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
783 } while (0)
784
785
786 /* Store a byte C in the place pointed by DST and increment DST to the
787 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
788 nonzero, store in an appropriate multibyte from. The caller should
789 declare and set the variables `dst' and `multibytep' appropriately
790 in advance. */
791
792 #define EMIT_ONE_BYTE(c) \
793 do { \
794 produced_chars++; \
795 if (multibytep) \
796 { \
797 int ch = (c); \
798 if (ch >= 0x80) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
801 } \
802 else \
803 *dst++ = (c); \
804 } while (0)
805
806
807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
808
809 #define EMIT_TWO_BYTES(c1, c2) \
810 do { \
811 produced_chars += 2; \
812 if (multibytep) \
813 { \
814 int ch; \
815 \
816 ch = (c1); \
817 if (ch >= 0x80) \
818 ch = BYTE8_TO_CHAR (ch); \
819 CHAR_STRING_ADVANCE (ch, dst); \
820 ch = (c2); \
821 if (ch >= 0x80) \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
824 } \
825 else \
826 { \
827 *dst++ = (c1); \
828 *dst++ = (c2); \
829 } \
830 } while (0)
831
832
833 #define EMIT_THREE_BYTES(c1, c2, c3) \
834 do { \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
837 } while (0)
838
839
840 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
841 do { \
842 EMIT_TWO_BYTES (c1, c2); \
843 EMIT_TWO_BYTES (c3, c4); \
844 } while (0)
845
846
847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
848 do { \
849 charset_map_loaded = 0; \
850 c = DECODE_CHAR (charset, code); \
851 if (charset_map_loaded) \
852 { \
853 unsigned char *orig = coding->source; \
854 EMACS_INT offset; \
855 \
856 coding_set_source (coding); \
857 offset = coding->source - orig; \
858 src += offset; \
859 src_base += offset; \
860 src_end += offset; \
861 } \
862 } while (0)
863
864
865 #define ASSURE_DESTINATION(bytes) \
866 do { \
867 if (dst + (bytes) >= dst_end) \
868 { \
869 int more_bytes = charbuf_end - charbuf + (bytes); \
870 \
871 dst = alloc_destination (coding, more_bytes, dst); \
872 dst_end = coding->destination + coding->dst_bytes; \
873 } \
874 } while (0)
875
876
877
878 static void
879 coding_set_source (coding)
880 struct coding_system *coding;
881 {
882 if (BUFFERP (coding->src_object))
883 {
884 struct buffer *buf = XBUFFER (coding->src_object);
885
886 if (coding->src_pos < 0)
887 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
888 else
889 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
890 }
891 else if (STRINGP (coding->src_object))
892 {
893 coding->source = (XSTRING (coding->src_object)->data
894 + coding->src_pos_byte);
895 }
896 else
897 /* Otherwise, the source is C string and is never relocated
898 automatically. Thus we don't have to update anything. */
899 ;
900 }
901
902 static void
903 coding_set_destination (coding)
904 struct coding_system *coding;
905 {
906 if (BUFFERP (coding->dst_object))
907 {
908 if (coding->src_pos < 0)
909 {
910 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
911 coding->dst_bytes = (GAP_END_ADDR
912 - (coding->src_bytes - coding->consumed)
913 - coding->destination);
914 }
915 else
916 {
917 /* We are sure that coding->dst_pos_byte is before the gap
918 of the buffer. */
919 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
920 + coding->dst_pos_byte - 1);
921 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
922 - coding->destination);
923 }
924 }
925 else
926 /* Otherwise, the destination is C string and is never relocated
927 automatically. Thus we don't have to update anything. */
928 ;
929 }
930
931
932 static void
933 coding_alloc_by_realloc (coding, bytes)
934 struct coding_system *coding;
935 EMACS_INT bytes;
936 {
937 coding->destination = (unsigned char *) xrealloc (coding->destination,
938 coding->dst_bytes + bytes);
939 coding->dst_bytes += bytes;
940 }
941
942 static void
943 coding_alloc_by_making_gap (coding, bytes)
944 struct coding_system *coding;
945 EMACS_INT bytes;
946 {
947 if (BUFFERP (coding->dst_object)
948 && EQ (coding->src_object, coding->dst_object))
949 {
950 EMACS_INT add = coding->src_bytes - coding->consumed;
951
952 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
953 make_gap (bytes);
954 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
955 }
956 else
957 {
958 Lisp_Object this_buffer;
959
960 this_buffer = Fcurrent_buffer ();
961 set_buffer_internal (XBUFFER (coding->dst_object));
962 make_gap (bytes);
963 set_buffer_internal (XBUFFER (this_buffer));
964 }
965 }
966
967
968 static unsigned char *
969 alloc_destination (coding, nbytes, dst)
970 struct coding_system *coding;
971 int nbytes;
972 unsigned char *dst;
973 {
974 EMACS_INT offset = dst - coding->destination;
975
976 if (BUFFERP (coding->dst_object))
977 coding_alloc_by_making_gap (coding, nbytes);
978 else
979 coding_alloc_by_realloc (coding, nbytes);
980 coding->result = CODING_RESULT_SUCCESS;
981 coding_set_destination (coding);
982 dst = coding->destination + offset;
983 return dst;
984 }
985
986 /** Macros for annotations. */
987
988 /* Maximum length of annotation data (sum of annotations for
989 composition and charset). */
990 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
991
992 /* An annotation data is stored in the array coding->charbuf in this
993 format:
994 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
995 LENGTH is the number of elements in the annotation.
996 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
997 FROM and TO specify the range of text annotated. They are relative
998 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
999
1000 The format of the following elements depend on ANNOTATION_MASK.
1001
1002 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1003 follows:
1004 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1005 METHOD is one of enum composition_method.
1006 Optionnal COMPOSITION-COMPONENTS are characters and composition
1007 rules.
1008
1009 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1010 follows. */
1011
1012 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1013 do { \
1014 *(buf)++ = -(len); \
1015 *(buf)++ = (mask); \
1016 *(buf)++ = (from); \
1017 *(buf)++ = (to); \
1018 coding->annotated = 1; \
1019 } while (0);
1020
1021 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1022 do { \
1023 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1024 *buf++ = method; \
1025 } while (0)
1026
1027
1028 #define ADD_CHARSET_DATA(buf, from, to, id) \
1029 do { \
1030 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1031 *buf++ = id; \
1032 } while (0)
1033
1034 \f
1035 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1036
1037
1038
1039 \f
1040 /*** 3. UTF-8 ***/
1041
1042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1043 Check if a text is encoded in UTF-8. If it is, return 1, else
1044 return 0. */
1045
1046 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1047 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1048 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1049 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1050 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1051 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1052
1053 static int
1054 detect_coding_utf_8 (coding, detect_info)
1055 struct coding_system *coding;
1056 struct coding_detection_info *detect_info;
1057 {
1058 unsigned char *src = coding->source, *src_base = src;
1059 unsigned char *src_end = coding->source + coding->src_bytes;
1060 int multibytep = coding->src_multibyte;
1061 int consumed_chars = 0;
1062 int found = 0;
1063 int incomplete;
1064
1065 detect_info->checked |= CATEGORY_MASK_UTF_8;
1066 /* A coding system of this category is always ASCII compatible. */
1067 src += coding->head_ascii;
1068
1069 while (1)
1070 {
1071 int c, c1, c2, c3, c4;
1072
1073 incomplete = 0;
1074 ONE_MORE_BYTE (c);
1075 if (UTF_8_1_OCTET_P (c))
1076 continue;
1077 incomplete = 1;
1078 ONE_MORE_BYTE (c1);
1079 if (! UTF_8_EXTRA_OCTET_P (c1))
1080 break;
1081 if (UTF_8_2_OCTET_LEADING_P (c))
1082 {
1083 found = CATEGORY_MASK_UTF_8;
1084 continue;
1085 }
1086 ONE_MORE_BYTE (c2);
1087 if (! UTF_8_EXTRA_OCTET_P (c2))
1088 break;
1089 if (UTF_8_3_OCTET_LEADING_P (c))
1090 {
1091 found = CATEGORY_MASK_UTF_8;
1092 continue;
1093 }
1094 ONE_MORE_BYTE (c3);
1095 if (! UTF_8_EXTRA_OCTET_P (c3))
1096 break;
1097 if (UTF_8_4_OCTET_LEADING_P (c))
1098 {
1099 found = CATEGORY_MASK_UTF_8;
1100 continue;
1101 }
1102 ONE_MORE_BYTE (c4);
1103 if (! UTF_8_EXTRA_OCTET_P (c4))
1104 break;
1105 if (UTF_8_5_OCTET_LEADING_P (c))
1106 {
1107 found = CATEGORY_MASK_UTF_8;
1108 continue;
1109 }
1110 break;
1111 }
1112 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1113 return 0;
1114
1115 no_more_source:
1116 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1117 {
1118 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1119 return 0;
1120 }
1121 detect_info->found |= found;
1122 return 1;
1123 }
1124
1125
1126 static void
1127 decode_coding_utf_8 (coding)
1128 struct coding_system *coding;
1129 {
1130 unsigned char *src = coding->source + coding->consumed;
1131 unsigned char *src_end = coding->source + coding->src_bytes;
1132 unsigned char *src_base;
1133 int *charbuf = coding->charbuf;
1134 int *charbuf_end = charbuf + coding->charbuf_size;
1135 int consumed_chars = 0, consumed_chars_base;
1136 int multibytep = coding->src_multibyte;
1137 Lisp_Object attr, eol_type, charset_list;
1138
1139 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1140
1141 while (1)
1142 {
1143 int c, c1, c2, c3, c4, c5;
1144
1145 src_base = src;
1146 consumed_chars_base = consumed_chars;
1147
1148 if (charbuf >= charbuf_end)
1149 break;
1150
1151 ONE_MORE_BYTE (c1);
1152 if (UTF_8_1_OCTET_P(c1))
1153 {
1154 c = c1;
1155 if (c == '\r')
1156 {
1157 if (EQ (eol_type, Qdos))
1158 {
1159 if (src == src_end)
1160 {
1161 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1162 goto no_more_source;
1163 }
1164 if (*src == '\n')
1165 ONE_MORE_BYTE (c);
1166 }
1167 else if (EQ (eol_type, Qmac))
1168 c = '\n';
1169 }
1170 }
1171 else
1172 {
1173 ONE_MORE_BYTE (c2);
1174 if (! UTF_8_EXTRA_OCTET_P (c2))
1175 goto invalid_code;
1176 if (UTF_8_2_OCTET_LEADING_P (c1))
1177 {
1178 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1179 /* Reject overlong sequences here and below. Encoders
1180 producing them are incorrect, they can be misleading,
1181 and they mess up read/write invariance. */
1182 if (c < 128)
1183 goto invalid_code;
1184 }
1185 else
1186 {
1187 ONE_MORE_BYTE (c3);
1188 if (! UTF_8_EXTRA_OCTET_P (c3))
1189 goto invalid_code;
1190 if (UTF_8_3_OCTET_LEADING_P (c1))
1191 {
1192 c = (((c1 & 0xF) << 12)
1193 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1194 if (c < 0x800
1195 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1196 goto invalid_code;
1197 }
1198 else
1199 {
1200 ONE_MORE_BYTE (c4);
1201 if (! UTF_8_EXTRA_OCTET_P (c4))
1202 goto invalid_code;
1203 if (UTF_8_4_OCTET_LEADING_P (c1))
1204 {
1205 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1206 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1207 if (c < 0x10000)
1208 goto invalid_code;
1209 }
1210 else
1211 {
1212 ONE_MORE_BYTE (c5);
1213 if (! UTF_8_EXTRA_OCTET_P (c5))
1214 goto invalid_code;
1215 if (UTF_8_5_OCTET_LEADING_P (c1))
1216 {
1217 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1218 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1219 | (c5 & 0x3F));
1220 if ((c > MAX_CHAR) || (c < 0x200000))
1221 goto invalid_code;
1222 }
1223 else
1224 goto invalid_code;
1225 }
1226 }
1227 }
1228 }
1229
1230 *charbuf++ = c;
1231 continue;
1232
1233 invalid_code:
1234 src = src_base;
1235 consumed_chars = consumed_chars_base;
1236 ONE_MORE_BYTE (c);
1237 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1238 coding->errors++;
1239 }
1240
1241 no_more_source:
1242 coding->consumed_char += consumed_chars_base;
1243 coding->consumed = src_base - coding->source;
1244 coding->charbuf_used = charbuf - coding->charbuf;
1245 }
1246
1247
1248 static int
1249 encode_coding_utf_8 (coding)
1250 struct coding_system *coding;
1251 {
1252 int multibytep = coding->dst_multibyte;
1253 int *charbuf = coding->charbuf;
1254 int *charbuf_end = charbuf + coding->charbuf_used;
1255 unsigned char *dst = coding->destination + coding->produced;
1256 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1257 int produced_chars = 0;
1258 int c;
1259
1260 if (multibytep)
1261 {
1262 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1263
1264 while (charbuf < charbuf_end)
1265 {
1266 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1267
1268 ASSURE_DESTINATION (safe_room);
1269 c = *charbuf++;
1270 if (CHAR_BYTE8_P (c))
1271 {
1272 c = CHAR_TO_BYTE8 (c);
1273 EMIT_ONE_BYTE (c);
1274 }
1275 else
1276 {
1277 CHAR_STRING_ADVANCE (c, pend);
1278 for (p = str; p < pend; p++)
1279 EMIT_ONE_BYTE (*p);
1280 }
1281 }
1282 }
1283 else
1284 {
1285 int safe_room = MAX_MULTIBYTE_LENGTH;
1286
1287 while (charbuf < charbuf_end)
1288 {
1289 ASSURE_DESTINATION (safe_room);
1290 c = *charbuf++;
1291 dst += CHAR_STRING (c, dst);
1292 produced_chars++;
1293 }
1294 }
1295 coding->result = CODING_RESULT_SUCCESS;
1296 coding->produced_char += produced_chars;
1297 coding->produced = dst - coding->destination;
1298 return 0;
1299 }
1300
1301
1302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1303 Check if a text is encoded in one of UTF-16 based coding systems.
1304 If it is, return 1, else return 0. */
1305
1306 #define UTF_16_HIGH_SURROGATE_P(val) \
1307 (((val) & 0xFC00) == 0xD800)
1308
1309 #define UTF_16_LOW_SURROGATE_P(val) \
1310 (((val) & 0xFC00) == 0xDC00)
1311
1312 #define UTF_16_INVALID_P(val) \
1313 (((val) == 0xFFFE) \
1314 || ((val) == 0xFFFF) \
1315 || UTF_16_LOW_SURROGATE_P (val))
1316
1317
1318 static int
1319 detect_coding_utf_16 (coding, detect_info)
1320 struct coding_system *coding;
1321 struct coding_detection_info *detect_info;
1322 {
1323 unsigned char *src = coding->source, *src_base = src;
1324 unsigned char *src_end = coding->source + coding->src_bytes;
1325 int multibytep = coding->src_multibyte;
1326 int consumed_chars = 0;
1327 int c1, c2;
1328
1329 detect_info->checked |= CATEGORY_MASK_UTF_16;
1330
1331 if (coding->mode & CODING_MODE_LAST_BLOCK
1332 && (coding->src_bytes & 1))
1333 {
1334 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1335 return 0;
1336 }
1337 ONE_MORE_BYTE (c1);
1338 ONE_MORE_BYTE (c2);
1339
1340 if ((c1 == 0xFF) && (c2 == 0xFE))
1341 {
1342 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1343 | CATEGORY_MASK_UTF_16_AUTO);
1344 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1345 }
1346 else if ((c1 == 0xFE) && (c2 == 0xFF))
1347 {
1348 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1349 | CATEGORY_MASK_UTF_16_AUTO);
1350 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1351 }
1352 no_more_source:
1353 return 1;
1354 }
1355
1356 static void
1357 decode_coding_utf_16 (coding)
1358 struct coding_system *coding;
1359 {
1360 unsigned char *src = coding->source + coding->consumed;
1361 unsigned char *src_end = coding->source + coding->src_bytes;
1362 unsigned char *src_base;
1363 int *charbuf = coding->charbuf;
1364 int *charbuf_end = charbuf + coding->charbuf_size;
1365 int consumed_chars = 0, consumed_chars_base;
1366 int multibytep = coding->src_multibyte;
1367 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1368 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1369 int surrogate = CODING_UTF_16_SURROGATE (coding);
1370 Lisp_Object attr, eol_type, charset_list;
1371
1372 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1373
1374 if (bom == utf_16_with_bom)
1375 {
1376 int c, c1, c2;
1377
1378 src_base = src;
1379 ONE_MORE_BYTE (c1);
1380 ONE_MORE_BYTE (c2);
1381 c = (c1 << 8) | c2;
1382
1383 if (endian == utf_16_big_endian
1384 ? c != 0xFEFF : c != 0xFFFE)
1385 {
1386 /* The first two bytes are not BOM. Treat them as bytes
1387 for a normal character. */
1388 src = src_base;
1389 coding->errors++;
1390 }
1391 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1392 }
1393 else if (bom == utf_16_detect_bom)
1394 {
1395 /* We have already tried to detect BOM and failed in
1396 detect_coding. */
1397 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1398 }
1399
1400 while (1)
1401 {
1402 int c, c1, c2;
1403
1404 src_base = src;
1405 consumed_chars_base = consumed_chars;
1406
1407 if (charbuf + 2 >= charbuf_end)
1408 break;
1409
1410 ONE_MORE_BYTE (c1);
1411 ONE_MORE_BYTE (c2);
1412 c = (endian == utf_16_big_endian
1413 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1414 if (surrogate)
1415 {
1416 if (! UTF_16_LOW_SURROGATE_P (c))
1417 {
1418 if (endian == utf_16_big_endian)
1419 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1420 else
1421 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1422 *charbuf++ = c1;
1423 *charbuf++ = c2;
1424 coding->errors++;
1425 if (UTF_16_HIGH_SURROGATE_P (c))
1426 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1427 else
1428 *charbuf++ = c;
1429 }
1430 else
1431 {
1432 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1433 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1434 *charbuf++ = c;
1435 }
1436 }
1437 else
1438 {
1439 if (UTF_16_HIGH_SURROGATE_P (c))
1440 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1441 else
1442 *charbuf++ = c;
1443 }
1444 }
1445
1446 no_more_source:
1447 coding->consumed_char += consumed_chars_base;
1448 coding->consumed = src_base - coding->source;
1449 coding->charbuf_used = charbuf - coding->charbuf;
1450 }
1451
1452 static int
1453 encode_coding_utf_16 (coding)
1454 struct coding_system *coding;
1455 {
1456 int multibytep = coding->dst_multibyte;
1457 int *charbuf = coding->charbuf;
1458 int *charbuf_end = charbuf + coding->charbuf_used;
1459 unsigned char *dst = coding->destination + coding->produced;
1460 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1461 int safe_room = 8;
1462 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1463 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1464 int produced_chars = 0;
1465 Lisp_Object attrs, eol_type, charset_list;
1466 int c;
1467
1468 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1469
1470 if (bom != utf_16_without_bom)
1471 {
1472 ASSURE_DESTINATION (safe_room);
1473 if (big_endian)
1474 EMIT_TWO_BYTES (0xFE, 0xFF);
1475 else
1476 EMIT_TWO_BYTES (0xFF, 0xFE);
1477 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1478 }
1479
1480 while (charbuf < charbuf_end)
1481 {
1482 ASSURE_DESTINATION (safe_room);
1483 c = *charbuf++;
1484 if (c >= MAX_UNICODE_CHAR)
1485 c = coding->default_char;
1486
1487 if (c < 0x10000)
1488 {
1489 if (big_endian)
1490 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1491 else
1492 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1493 }
1494 else
1495 {
1496 int c1, c2;
1497
1498 c -= 0x10000;
1499 c1 = (c >> 10) + 0xD800;
1500 c2 = (c & 0x3FF) + 0xDC00;
1501 if (big_endian)
1502 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1503 else
1504 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1505 }
1506 }
1507 coding->result = CODING_RESULT_SUCCESS;
1508 coding->produced = dst - coding->destination;
1509 coding->produced_char += produced_chars;
1510 return 0;
1511 }
1512
1513 \f
1514 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1515
1516 /* Emacs' internal format for representation of multiple character
1517 sets is a kind of multi-byte encoding, i.e. characters are
1518 represented by variable-length sequences of one-byte codes.
1519
1520 ASCII characters and control characters (e.g. `tab', `newline') are
1521 represented by one-byte sequences which are their ASCII codes, in
1522 the range 0x00 through 0x7F.
1523
1524 8-bit characters of the range 0x80..0x9F are represented by
1525 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1526 code + 0x20).
1527
1528 8-bit characters of the range 0xA0..0xFF are represented by
1529 one-byte sequences which are their 8-bit code.
1530
1531 The other characters are represented by a sequence of `base
1532 leading-code', optional `extended leading-code', and one or two
1533 `position-code's. The length of the sequence is determined by the
1534 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1535 whereas extended leading-code and position-code take the range 0xA0
1536 through 0xFF. See `charset.h' for more details about leading-code
1537 and position-code.
1538
1539 --- CODE RANGE of Emacs' internal format ---
1540 character set range
1541 ------------- -----
1542 ascii 0x00..0x7F
1543 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1544 eight-bit-graphic 0xA0..0xBF
1545 ELSE 0x81..0x9D + [0xA0..0xFF]+
1546 ---------------------------------------------
1547
1548 As this is the internal character representation, the format is
1549 usually not used externally (i.e. in a file or in a data sent to a
1550 process). But, it is possible to have a text externally in this
1551 format (i.e. by encoding by the coding system `emacs-mule').
1552
1553 In that case, a sequence of one-byte codes has a slightly different
1554 form.
1555
1556 At first, all characters in eight-bit-control are represented by
1557 one-byte sequences which are their 8-bit code.
1558
1559 Next, character composition data are represented by the byte
1560 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1561 where,
1562 METHOD is 0xF0 plus one of composition method (enum
1563 composition_method),
1564
1565 BYTES is 0xA0 plus a byte length of this composition data,
1566
1567 CHARS is 0x20 plus a number of characters composed by this
1568 data,
1569
1570 COMPONENTs are characters of multibye form or composition
1571 rules encoded by two-byte of ASCII codes.
1572
1573 In addition, for backward compatibility, the following formats are
1574 also recognized as composition data on decoding.
1575
1576 0x80 MSEQ ...
1577 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1578
1579 Here,
1580 MSEQ is a multibyte form but in these special format:
1581 ASCII: 0xA0 ASCII_CODE+0x80,
1582 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1583 RULE is a one byte code of the range 0xA0..0xF0 that
1584 represents a composition rule.
1585 */
1586
1587 char emacs_mule_bytes[256];
1588
1589 int
1590 emacs_mule_char (coding, src, nbytes, nchars, id)
1591 struct coding_system *coding;
1592 unsigned char *src;
1593 int *nbytes, *nchars, *id;
1594 {
1595 unsigned char *src_end = coding->source + coding->src_bytes;
1596 int multibytep = coding->src_multibyte;
1597 unsigned char *src_base = src;
1598 struct charset *charset;
1599 unsigned code;
1600 int c;
1601 int consumed_chars = 0;
1602
1603 ONE_MORE_BYTE (c);
1604 switch (emacs_mule_bytes[c])
1605 {
1606 case 2:
1607 if (! (charset = emacs_mule_charset[c]))
1608 goto invalid_code;
1609 ONE_MORE_BYTE (c);
1610 code = c & 0x7F;
1611 break;
1612
1613 case 3:
1614 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1615 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1616 {
1617 ONE_MORE_BYTE (c);
1618 if (! (charset = emacs_mule_charset[c]))
1619 goto invalid_code;
1620 ONE_MORE_BYTE (c);
1621 code = c & 0x7F;
1622 }
1623 else
1624 {
1625 if (! (charset = emacs_mule_charset[c]))
1626 goto invalid_code;
1627 ONE_MORE_BYTE (c);
1628 code = (c & 0x7F) << 8;
1629 ONE_MORE_BYTE (c);
1630 code |= c & 0x7F;
1631 }
1632 break;
1633
1634 case 4:
1635 ONE_MORE_BYTE (c);
1636 if (! (charset = emacs_mule_charset[c]))
1637 goto invalid_code;
1638 ONE_MORE_BYTE (c);
1639 code = (c & 0x7F) << 8;
1640 ONE_MORE_BYTE (c);
1641 code |= c & 0x7F;
1642 break;
1643
1644 case 1:
1645 code = c;
1646 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1647 ? charset_ascii : charset_eight_bit);
1648 break;
1649
1650 default:
1651 abort ();
1652 }
1653 c = DECODE_CHAR (charset, code);
1654 if (c < 0)
1655 goto invalid_code;
1656 *nbytes = src - src_base;
1657 *nchars = consumed_chars;
1658 if (id)
1659 *id = charset->id;
1660 return c;
1661
1662 no_more_source:
1663 return -2;
1664
1665 invalid_code:
1666 return -1;
1667 }
1668
1669
1670 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1671 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1672 else return 0. */
1673
1674 static int
1675 detect_coding_emacs_mule (coding, detect_info)
1676 struct coding_system *coding;
1677 struct coding_detection_info *detect_info;
1678 {
1679 unsigned char *src = coding->source, *src_base = src;
1680 unsigned char *src_end = coding->source + coding->src_bytes;
1681 int multibytep = coding->src_multibyte;
1682 int consumed_chars = 0;
1683 int c;
1684 int found = 0;
1685 int incomplete;
1686
1687 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1688 /* A coding system of this category is always ASCII compatible. */
1689 src += coding->head_ascii;
1690
1691 while (1)
1692 {
1693 incomplete = 0;
1694 ONE_MORE_BYTE (c);
1695 incomplete = 1;
1696
1697 if (c == 0x80)
1698 {
1699 /* Perhaps the start of composite character. We simple skip
1700 it because analyzing it is too heavy for detecting. But,
1701 at least, we check that the composite character
1702 constitues of more than 4 bytes. */
1703 unsigned char *src_base;
1704
1705 repeat:
1706 src_base = src;
1707 do
1708 {
1709 ONE_MORE_BYTE (c);
1710 }
1711 while (c >= 0xA0);
1712
1713 if (src - src_base <= 4)
1714 break;
1715 found = CATEGORY_MASK_EMACS_MULE;
1716 if (c == 0x80)
1717 goto repeat;
1718 }
1719
1720 if (c < 0x80)
1721 {
1722 if (c < 0x20
1723 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1724 break;
1725 }
1726 else
1727 {
1728 unsigned char *src_base = src - 1;
1729
1730 do
1731 {
1732 ONE_MORE_BYTE (c);
1733 }
1734 while (c >= 0xA0);
1735 if (src - src_base != emacs_mule_bytes[*src_base])
1736 break;
1737 found = CATEGORY_MASK_EMACS_MULE;
1738 }
1739 }
1740 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1741 return 0;
1742
1743 no_more_source:
1744 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1745 {
1746 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1747 return 0;
1748 }
1749 detect_info->found |= found;
1750 return 1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1755
1756 /* Decode a character represented as a component of composition
1757 sequence of Emacs 20/21 style at SRC. Set C to that character and
1758 update SRC to the head of next character (or an encoded composition
1759 rule). If SRC doesn't points a composition component, set C to -1.
1760 If SRC points an invalid byte sequence, global exit by a return
1761 value 0. */
1762
1763 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1764 if (1) \
1765 { \
1766 int c; \
1767 int nbytes, nchars; \
1768 \
1769 if (src == src_end) \
1770 break; \
1771 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1772 if (c < 0) \
1773 { \
1774 if (c == -2) \
1775 break; \
1776 goto invalid_code; \
1777 } \
1778 *buf++ = c; \
1779 src += nbytes; \
1780 consumed_chars += nchars; \
1781 } \
1782 else
1783
1784
1785 /* Decode a composition rule represented as a component of composition
1786 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1787 and increment BUF. If SRC points an invalid byte sequence, set C
1788 to -1. */
1789
1790 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1791 do { \
1792 int c, gref, nref; \
1793 \
1794 if (src >= src_end) \
1795 goto invalid_code; \
1796 ONE_MORE_BYTE_NO_CHECK (c); \
1797 c -= 0x20; \
1798 if (c < 0 || c >= 81) \
1799 goto invalid_code; \
1800 \
1801 gref = c / 9, nref = c % 9; \
1802 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1803 } while (0)
1804
1805
1806 /* Decode a composition rule represented as a component of composition
1807 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1808 and increment BUF. If SRC points an invalid byte sequence, set C
1809 to -1. */
1810
1811 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1812 do { \
1813 int gref, nref; \
1814 \
1815 if (src + 1>= src_end) \
1816 goto invalid_code; \
1817 ONE_MORE_BYTE_NO_CHECK (gref); \
1818 gref -= 0x20; \
1819 ONE_MORE_BYTE_NO_CHECK (nref); \
1820 nref -= 0x20; \
1821 if (gref < 0 || gref >= 81 \
1822 || nref < 0 || nref >= 81) \
1823 goto invalid_code; \
1824 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1825 } while (0)
1826
1827
1828 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1829 do { \
1830 /* Emacs 21 style format. The first three bytes at SRC are \
1831 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1832 the byte length of this composition information, CHARS is the \
1833 number of characters composed by this composition. */ \
1834 enum composition_method method = c - 0xF2; \
1835 int *charbuf_base = charbuf; \
1836 int from, to; \
1837 int consumed_chars_limit; \
1838 int nbytes, nchars; \
1839 \
1840 ONE_MORE_BYTE (c); \
1841 nbytes = c - 0xA0; \
1842 if (nbytes < 3) \
1843 goto invalid_code; \
1844 ONE_MORE_BYTE (c); \
1845 nchars = c - 0xA0; \
1846 from = coding->produced + char_offset; \
1847 to = from + nchars; \
1848 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1849 consumed_chars_limit = consumed_chars_base + nbytes; \
1850 if (method != COMPOSITION_RELATIVE) \
1851 { \
1852 int i = 0; \
1853 while (consumed_chars < consumed_chars_limit) \
1854 { \
1855 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1856 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1857 else \
1858 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1859 i++; \
1860 } \
1861 if (consumed_chars < consumed_chars_limit) \
1862 goto invalid_code; \
1863 charbuf_base[0] -= i; \
1864 } \
1865 } while (0)
1866
1867
1868 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1869 do { \
1870 /* Emacs 20 style format for relative composition. */ \
1871 /* Store multibyte form of characters to be composed. */ \
1872 enum composition_method method = COMPOSITION_RELATIVE; \
1873 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1874 int *buf = components; \
1875 int i, j; \
1876 int from, to; \
1877 \
1878 src = src_base; \
1879 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1880 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1881 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1882 if (i < 2) \
1883 goto invalid_code; \
1884 from = coding->produced_char + char_offset; \
1885 to = from + i; \
1886 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1887 for (j = 0; j < i; j++) \
1888 *charbuf++ = components[j]; \
1889 } while (0)
1890
1891
1892 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1893 do { \
1894 /* Emacs 20 style format for rule-base composition. */ \
1895 /* Store multibyte form of characters to be composed. */ \
1896 enum composition_method method = COMPOSITION_WITH_RULE; \
1897 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1898 int *buf = components; \
1899 int i, j; \
1900 int from, to; \
1901 \
1902 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1903 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1904 { \
1905 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 } \
1908 if (i < 1 || (buf - components) % 2 == 0) \
1909 goto invalid_code; \
1910 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1911 goto no_more_source; \
1912 from = coding->produced_char + char_offset; \
1913 to = from + i; \
1914 ADD_COMPOSITION_DATA (buf, from, to, method); \
1915 for (j = 0; j < i; j++) \
1916 *charbuf++ = components[j]; \
1917 for (j = 0; j < i; j += 2) \
1918 *charbuf++ = components[j]; \
1919 } while (0)
1920
1921
1922 static void
1923 decode_coding_emacs_mule (coding)
1924 struct coding_system *coding;
1925 {
1926 unsigned char *src = coding->source + coding->consumed;
1927 unsigned char *src_end = coding->source + coding->src_bytes;
1928 unsigned char *src_base;
1929 int *charbuf = coding->charbuf;
1930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1931 int consumed_chars = 0, consumed_chars_base;
1932 int multibytep = coding->src_multibyte;
1933 Lisp_Object attrs, eol_type, charset_list;
1934 int char_offset = coding->produced_char;
1935 int last_offset = char_offset;
1936 int last_id = charset_ascii;
1937
1938 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1939
1940 while (1)
1941 {
1942 int c;
1943
1944 src_base = src;
1945 consumed_chars_base = consumed_chars;
1946
1947 if (charbuf >= charbuf_end)
1948 break;
1949
1950 ONE_MORE_BYTE (c);
1951
1952 if (c < 0x80)
1953 {
1954 if (c == '\r')
1955 {
1956 if (EQ (eol_type, Qdos))
1957 {
1958 if (src == src_end)
1959 {
1960 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1961 goto no_more_source;
1962 }
1963 if (*src == '\n')
1964 ONE_MORE_BYTE (c);
1965 }
1966 else if (EQ (eol_type, Qmac))
1967 c = '\n';
1968 }
1969 *charbuf++ = c;
1970 char_offset++;
1971 }
1972 else if (c == 0x80)
1973 {
1974 ONE_MORE_BYTE (c);
1975 if (c - 0xF2 >= COMPOSITION_RELATIVE
1976 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1977 DECODE_EMACS_MULE_21_COMPOSITION (c);
1978 else if (c < 0xC0)
1979 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1980 else if (c == 0xFF)
1981 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1982 else
1983 goto invalid_code;
1984 }
1985 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1986 {
1987 int nbytes, nchars;
1988 int id;
1989
1990 src = src_base;
1991 consumed_chars = consumed_chars_base;
1992 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1993 if (c < 0)
1994 {
1995 if (c == -2)
1996 break;
1997 goto invalid_code;
1998 }
1999 if (last_id != id)
2000 {
2001 if (last_id != charset_ascii)
2002 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2003 last_id = id;
2004 last_offset = char_offset;
2005 }
2006 *charbuf++ = c;
2007 src += nbytes;
2008 consumed_chars += nchars;
2009 char_offset++;
2010 }
2011 continue;
2012
2013 invalid_code:
2014 src = src_base;
2015 consumed_chars = consumed_chars_base;
2016 ONE_MORE_BYTE (c);
2017 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2018 char_offset++;
2019 coding->errors++;
2020 }
2021
2022 no_more_source:
2023 if (last_id != charset_ascii)
2024 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2025 coding->consumed_char += consumed_chars_base;
2026 coding->consumed = src_base - coding->source;
2027 coding->charbuf_used = charbuf - coding->charbuf;
2028 }
2029
2030
2031 #define EMACS_MULE_LEADING_CODES(id, codes) \
2032 do { \
2033 if (id < 0xA0) \
2034 codes[0] = id, codes[1] = 0; \
2035 else if (id < 0xE0) \
2036 codes[0] = 0x9A, codes[1] = id; \
2037 else if (id < 0xF0) \
2038 codes[0] = 0x9B, codes[1] = id; \
2039 else if (id < 0xF5) \
2040 codes[0] = 0x9C, codes[1] = id; \
2041 else \
2042 codes[0] = 0x9D, codes[1] = id; \
2043 } while (0);
2044
2045
2046 static int
2047 encode_coding_emacs_mule (coding)
2048 struct coding_system *coding;
2049 {
2050 int multibytep = coding->dst_multibyte;
2051 int *charbuf = coding->charbuf;
2052 int *charbuf_end = charbuf + coding->charbuf_used;
2053 unsigned char *dst = coding->destination + coding->produced;
2054 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2055 int safe_room = 8;
2056 int produced_chars = 0;
2057 Lisp_Object attrs, eol_type, charset_list;
2058 int c;
2059 int preferred_charset_id = -1;
2060
2061 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2062
2063 while (charbuf < charbuf_end)
2064 {
2065 ASSURE_DESTINATION (safe_room);
2066 c = *charbuf++;
2067
2068 if (c < 0)
2069 {
2070 /* Handle an annotation. */
2071 switch (*charbuf)
2072 {
2073 case CODING_ANNOTATE_COMPOSITION_MASK:
2074 /* Not yet implemented. */
2075 break;
2076 case CODING_ANNOTATE_CHARSET_MASK:
2077 preferred_charset_id = charbuf[3];
2078 if (preferred_charset_id >= 0
2079 && NILP (Fmemq (make_number (preferred_charset_id),
2080 charset_list)))
2081 preferred_charset_id = -1;
2082 break;
2083 default:
2084 abort ();
2085 }
2086 charbuf += -c - 1;
2087 continue;
2088 }
2089
2090 if (ASCII_CHAR_P (c))
2091 EMIT_ONE_ASCII_BYTE (c);
2092 else if (CHAR_BYTE8_P (c))
2093 {
2094 c = CHAR_TO_BYTE8 (c);
2095 EMIT_ONE_BYTE (c);
2096 }
2097 else
2098 {
2099 struct charset *charset;
2100 unsigned code;
2101 int dimension;
2102 int emacs_mule_id;
2103 unsigned char leading_codes[2];
2104
2105 if (preferred_charset_id >= 0)
2106 {
2107 charset = CHARSET_FROM_ID (preferred_charset_id);
2108 if (! CHAR_CHARSET_P (c, charset))
2109 charset = char_charset (c, charset_list, NULL);
2110 }
2111 else
2112 charset = char_charset (c, charset_list, &code);
2113 if (! charset)
2114 {
2115 c = coding->default_char;
2116 if (ASCII_CHAR_P (c))
2117 {
2118 EMIT_ONE_ASCII_BYTE (c);
2119 continue;
2120 }
2121 charset = char_charset (c, charset_list, &code);
2122 }
2123 dimension = CHARSET_DIMENSION (charset);
2124 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2125 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2126 EMIT_ONE_BYTE (leading_codes[0]);
2127 if (leading_codes[1])
2128 EMIT_ONE_BYTE (leading_codes[1]);
2129 if (dimension == 1)
2130 EMIT_ONE_BYTE (code);
2131 else
2132 {
2133 EMIT_ONE_BYTE (code >> 8);
2134 EMIT_ONE_BYTE (code & 0xFF);
2135 }
2136 }
2137 }
2138 coding->result = CODING_RESULT_SUCCESS;
2139 coding->produced_char += produced_chars;
2140 coding->produced = dst - coding->destination;
2141 return 0;
2142 }
2143
2144 \f
2145 /*** 7. ISO2022 handlers ***/
2146
2147 /* The following note describes the coding system ISO2022 briefly.
2148 Since the intention of this note is to help understand the
2149 functions in this file, some parts are NOT ACCURATE or are OVERLY
2150 SIMPLIFIED. For thorough understanding, please refer to the
2151 original document of ISO2022. This is equivalent to the standard
2152 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2153
2154 ISO2022 provides many mechanisms to encode several character sets
2155 in 7-bit and 8-bit environments. For 7-bit environments, all text
2156 is encoded using bytes less than 128. This may make the encoded
2157 text a little bit longer, but the text passes more easily through
2158 several types of gateway, some of which strip off the MSB (Most
2159 Significant Bit).
2160
2161 There are two kinds of character sets: control character sets and
2162 graphic character sets. The former contain control characters such
2163 as `newline' and `escape' to provide control functions (control
2164 functions are also provided by escape sequences). The latter
2165 contain graphic characters such as 'A' and '-'. Emacs recognizes
2166 two control character sets and many graphic character sets.
2167
2168 Graphic character sets are classified into one of the following
2169 four classes, according to the number of bytes (DIMENSION) and
2170 number of characters in one dimension (CHARS) of the set:
2171 - DIMENSION1_CHARS94
2172 - DIMENSION1_CHARS96
2173 - DIMENSION2_CHARS94
2174 - DIMENSION2_CHARS96
2175
2176 In addition, each character set is assigned an identification tag,
2177 unique for each set, called the "final character" (denoted as <F>
2178 hereafter). The <F> of each character set is decided by ECMA(*)
2179 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2180 (0x30..0x3F are for private use only).
2181
2182 Note (*): ECMA = European Computer Manufacturers Association
2183
2184 Here are examples of graphic character sets [NAME(<F>)]:
2185 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2186 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2187 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2188 o DIMENSION2_CHARS96 -- none for the moment
2189
2190 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2191 C0 [0x00..0x1F] -- control character plane 0
2192 GL [0x20..0x7F] -- graphic character plane 0
2193 C1 [0x80..0x9F] -- control character plane 1
2194 GR [0xA0..0xFF] -- graphic character plane 1
2195
2196 A control character set is directly designated and invoked to C0 or
2197 C1 by an escape sequence. The most common case is that:
2198 - ISO646's control character set is designated/invoked to C0, and
2199 - ISO6429's control character set is designated/invoked to C1,
2200 and usually these designations/invocations are omitted in encoded
2201 text. In a 7-bit environment, only C0 can be used, and a control
2202 character for C1 is encoded by an appropriate escape sequence to
2203 fit into the environment. All control characters for C1 are
2204 defined to have corresponding escape sequences.
2205
2206 A graphic character set is at first designated to one of four
2207 graphic registers (G0 through G3), then these graphic registers are
2208 invoked to GL or GR. These designations and invocations can be
2209 done independently. The most common case is that G0 is invoked to
2210 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2211 these invocations and designations are omitted in encoded text.
2212 In a 7-bit environment, only GL can be used.
2213
2214 When a graphic character set of CHARS94 is invoked to GL, codes
2215 0x20 and 0x7F of the GL area work as control characters SPACE and
2216 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2217 be used.
2218
2219 There are two ways of invocation: locking-shift and single-shift.
2220 With locking-shift, the invocation lasts until the next different
2221 invocation, whereas with single-shift, the invocation affects the
2222 following character only and doesn't affect the locking-shift
2223 state. Invocations are done by the following control characters or
2224 escape sequences:
2225
2226 ----------------------------------------------------------------------
2227 abbrev function cntrl escape seq description
2228 ----------------------------------------------------------------------
2229 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2230 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2231 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2232 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2233 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2234 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2235 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2236 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2237 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2238 ----------------------------------------------------------------------
2239 (*) These are not used by any known coding system.
2240
2241 Control characters for these functions are defined by macros
2242 ISO_CODE_XXX in `coding.h'.
2243
2244 Designations are done by the following escape sequences:
2245 ----------------------------------------------------------------------
2246 escape sequence description
2247 ----------------------------------------------------------------------
2248 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2249 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2250 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2251 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2252 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2253 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2254 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2255 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2256 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2257 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2258 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2259 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2260 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2261 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2262 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2263 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2264 ----------------------------------------------------------------------
2265
2266 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2267 of dimension 1, chars 94, and final character <F>, etc...
2268
2269 Note (*): Although these designations are not allowed in ISO2022,
2270 Emacs accepts them on decoding, and produces them on encoding
2271 CHARS96 character sets in a coding system which is characterized as
2272 7-bit environment, non-locking-shift, and non-single-shift.
2273
2274 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2275 '(' must be omitted. We refer to this as "short-form" hereafter.
2276
2277 Now you may notice that there are a lot of ways of encoding the
2278 same multilingual text in ISO2022. Actually, there exist many
2279 coding systems such as Compound Text (used in X11's inter client
2280 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2281 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2282 localized platforms), and all of these are variants of ISO2022.
2283
2284 In addition to the above, Emacs handles two more kinds of escape
2285 sequences: ISO6429's direction specification and Emacs' private
2286 sequence for specifying character composition.
2287
2288 ISO6429's direction specification takes the following form:
2289 o CSI ']' -- end of the current direction
2290 o CSI '0' ']' -- end of the current direction
2291 o CSI '1' ']' -- start of left-to-right text
2292 o CSI '2' ']' -- start of right-to-left text
2293 The control character CSI (0x9B: control sequence introducer) is
2294 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2295
2296 Character composition specification takes the following form:
2297 o ESC '0' -- start relative composition
2298 o ESC '1' -- end composition
2299 o ESC '2' -- start rule-base composition (*)
2300 o ESC '3' -- start relative composition with alternate chars (**)
2301 o ESC '4' -- start rule-base composition with alternate chars (**)
2302 Since these are not standard escape sequences of any ISO standard,
2303 the use of them with these meanings is restricted to Emacs only.
2304
2305 (*) This form is used only in Emacs 20.7 and older versions,
2306 but newer versions can safely decode it.
2307 (**) This form is used only in Emacs 21.1 and newer versions,
2308 and older versions can't decode it.
2309
2310 Here's a list of example usages of these composition escape
2311 sequences (categorized by `enum composition_method').
2312
2313 COMPOSITION_RELATIVE:
2314 ESC 0 CHAR [ CHAR ] ESC 1
2315 COMPOSITION_WITH_RULE:
2316 ESC 2 CHAR [ RULE CHAR ] ESC 1
2317 COMPOSITION_WITH_ALTCHARS:
2318 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE_ALTCHARS:
2320 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2321
2322 enum iso_code_class_type iso_code_class[256];
2323
2324 #define SAFE_CHARSET_P(coding, id) \
2325 ((id) <= (coding)->max_charset_id \
2326 && (coding)->safe_charsets[id] >= 0)
2327
2328
2329 #define SHIFT_OUT_OK(category) \
2330 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2331
2332 static void
2333 setup_iso_safe_charsets (attrs)
2334 Lisp_Object attrs;
2335 {
2336 Lisp_Object charset_list, safe_charsets;
2337 Lisp_Object request;
2338 Lisp_Object reg_usage;
2339 Lisp_Object tail;
2340 int reg94, reg96;
2341 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2342 int max_charset_id;
2343
2344 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2345 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2346 && ! EQ (charset_list, Viso_2022_charset_list))
2347 {
2348 CODING_ATTR_CHARSET_LIST (attrs)
2349 = charset_list = Viso_2022_charset_list;
2350 ASET (attrs, coding_attr_safe_charsets, Qnil);
2351 }
2352
2353 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2354 return;
2355
2356 max_charset_id = 0;
2357 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2358 {
2359 int id = XINT (XCAR (tail));
2360 if (max_charset_id < id)
2361 max_charset_id = id;
2362 }
2363
2364 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2365 make_number (255));
2366 request = AREF (attrs, coding_attr_iso_request);
2367 reg_usage = AREF (attrs, coding_attr_iso_usage);
2368 reg94 = XINT (XCAR (reg_usage));
2369 reg96 = XINT (XCDR (reg_usage));
2370
2371 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2372 {
2373 Lisp_Object id;
2374 Lisp_Object reg;
2375 struct charset *charset;
2376
2377 id = XCAR (tail);
2378 charset = CHARSET_FROM_ID (XINT (id));
2379 reg = Fcdr (Fassq (id, request));
2380 if (! NILP (reg))
2381 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2382 else if (charset->iso_chars_96)
2383 {
2384 if (reg96 < 4)
2385 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2386 }
2387 else
2388 {
2389 if (reg94 < 4)
2390 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2391 }
2392 }
2393 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2394 }
2395
2396
2397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2398 Check if a text is encoded in one of ISO-2022 based codig systems.
2399 If it is, return 1, else return 0. */
2400
2401 static int
2402 detect_coding_iso_2022 (coding, detect_info)
2403 struct coding_system *coding;
2404 struct coding_detection_info *detect_info;
2405 {
2406 unsigned char *src = coding->source, *src_base = src;
2407 unsigned char *src_end = coding->source + coding->src_bytes;
2408 int multibytep = coding->src_multibyte;
2409 int single_shifting = 0;
2410 int id;
2411 int c, c1;
2412 int consumed_chars = 0;
2413 int i;
2414 int rejected = 0;
2415 int found = 0;
2416
2417 detect_info->checked |= CATEGORY_MASK_ISO;
2418
2419 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2420 {
2421 struct coding_system *this = &(coding_categories[i]);
2422 Lisp_Object attrs, val;
2423
2424 attrs = CODING_ID_ATTRS (this->id);
2425 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2426 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2427 setup_iso_safe_charsets (attrs);
2428 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2429 this->max_charset_id = XSTRING (val)->size - 1;
2430 this->safe_charsets = (char *) XSTRING (val)->data;
2431 }
2432
2433 /* A coding system of this category is always ASCII compatible. */
2434 src += coding->head_ascii;
2435
2436 while (rejected != CATEGORY_MASK_ISO)
2437 {
2438 ONE_MORE_BYTE (c);
2439 switch (c)
2440 {
2441 case ISO_CODE_ESC:
2442 if (inhibit_iso_escape_detection)
2443 break;
2444 single_shifting = 0;
2445 ONE_MORE_BYTE (c);
2446 if (c >= '(' && c <= '/')
2447 {
2448 /* Designation sequence for a charset of dimension 1. */
2449 ONE_MORE_BYTE (c1);
2450 if (c1 < ' ' || c1 >= 0x80
2451 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2452 /* Invalid designation sequence. Just ignore. */
2453 break;
2454 }
2455 else if (c == '$')
2456 {
2457 /* Designation sequence for a charset of dimension 2. */
2458 ONE_MORE_BYTE (c);
2459 if (c >= '@' && c <= 'B')
2460 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2461 id = iso_charset_table[1][0][c];
2462 else if (c >= '(' && c <= '/')
2463 {
2464 ONE_MORE_BYTE (c1);
2465 if (c1 < ' ' || c1 >= 0x80
2466 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2467 /* Invalid designation sequence. Just ignore. */
2468 break;
2469 }
2470 else
2471 /* Invalid designation sequence. Just ignore it. */
2472 break;
2473 }
2474 else if (c == 'N' || c == 'O')
2475 {
2476 /* ESC <Fe> for SS2 or SS3. */
2477 single_shifting = 1;
2478 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2479 break;
2480 }
2481 else if (c >= '0' && c <= '4')
2482 {
2483 /* ESC <Fp> for start/end composition. */
2484 found |= CATEGORY_MASK_ISO;
2485 break;
2486 }
2487 else
2488 {
2489 /* Invalid escape sequence. Just ignore it. */
2490 break;
2491 }
2492
2493 /* We found a valid designation sequence for CHARSET. */
2494 rejected |= CATEGORY_MASK_ISO_8BIT;
2495 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2496 id))
2497 found |= CATEGORY_MASK_ISO_7;
2498 else
2499 rejected |= CATEGORY_MASK_ISO_7;
2500 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2501 id))
2502 found |= CATEGORY_MASK_ISO_7_TIGHT;
2503 else
2504 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2505 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2506 id))
2507 found |= CATEGORY_MASK_ISO_7_ELSE;
2508 else
2509 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2510 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2511 id))
2512 found |= CATEGORY_MASK_ISO_8_ELSE;
2513 else
2514 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2515 break;
2516
2517 case ISO_CODE_SO:
2518 case ISO_CODE_SI:
2519 /* Locking shift out/in. */
2520 if (inhibit_iso_escape_detection)
2521 break;
2522 single_shifting = 0;
2523 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2524 found |= CATEGORY_MASK_ISO_ELSE;
2525 break;
2526
2527 case ISO_CODE_CSI:
2528 /* Control sequence introducer. */
2529 single_shifting = 0;
2530 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2531 found |= CATEGORY_MASK_ISO_8_ELSE;
2532 goto check_extra_latin;
2533
2534
2535 case ISO_CODE_SS2:
2536 case ISO_CODE_SS3:
2537 /* Single shift. */
2538 if (inhibit_iso_escape_detection)
2539 break;
2540 single_shifting = 1;
2541 rejected |= CATEGORY_MASK_ISO_7BIT;
2542 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2543 & CODING_ISO_FLAG_SINGLE_SHIFT)
2544 found |= CATEGORY_MASK_ISO_8_1;
2545 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2546 & CODING_ISO_FLAG_SINGLE_SHIFT)
2547 found |= CATEGORY_MASK_ISO_8_2;
2548 goto check_extra_latin;
2549
2550 default:
2551 if (c < 0x80)
2552 {
2553 single_shifting = 0;
2554 break;
2555 }
2556 if (c >= 0xA0)
2557 {
2558 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2559 found |= CATEGORY_MASK_ISO_8_1;
2560 /* Check the length of succeeding codes of the range
2561 0xA0..0FF. If the byte length is even, we include
2562 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2563 only when we are not single shifting. */
2564 if (! single_shifting
2565 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2566 {
2567 int i = 1;
2568 while (src < src_end)
2569 {
2570 ONE_MORE_BYTE (c);
2571 if (c < 0xA0)
2572 break;
2573 i++;
2574 }
2575
2576 if (i & 1 && src < src_end)
2577 rejected |= CATEGORY_MASK_ISO_8_2;
2578 else
2579 found |= CATEGORY_MASK_ISO_8_2;
2580 }
2581 break;
2582 }
2583 check_extra_latin:
2584 single_shifting = 0;
2585 if (! VECTORP (Vlatin_extra_code_table)
2586 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2587 {
2588 rejected = CATEGORY_MASK_ISO;
2589 break;
2590 }
2591 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2592 & CODING_ISO_FLAG_LATIN_EXTRA)
2593 found |= CATEGORY_MASK_ISO_8_1;
2594 else
2595 rejected |= CATEGORY_MASK_ISO_8_1;
2596 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2597 & CODING_ISO_FLAG_LATIN_EXTRA)
2598 found |= CATEGORY_MASK_ISO_8_2;
2599 else
2600 rejected |= CATEGORY_MASK_ISO_8_2;
2601 }
2602 }
2603 detect_info->rejected |= CATEGORY_MASK_ISO;
2604 return 0;
2605
2606 no_more_source:
2607 detect_info->rejected |= rejected;
2608 detect_info->found |= (found & ~rejected);
2609 return 1;
2610 }
2611
2612
2613 /* Set designation state into CODING. */
2614 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2615 do { \
2616 int id, prev; \
2617 \
2618 if (final < '0' || final >= 128 \
2619 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2620 || !SAFE_CHARSET_P (coding, id)) \
2621 { \
2622 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2623 goto invalid_code; \
2624 } \
2625 prev = CODING_ISO_DESIGNATION (coding, reg); \
2626 if (id == charset_jisx0201_roman) \
2627 { \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2629 id = charset_ascii; \
2630 } \
2631 else if (id == charset_jisx0208_1978) \
2632 { \
2633 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2634 id = charset_jisx0208; \
2635 } \
2636 CODING_ISO_DESIGNATION (coding, reg) = id; \
2637 /* If there was an invalid designation to REG previously, and this \
2638 designation is ASCII to REG, we should keep this designation \
2639 sequence. */ \
2640 if (prev == -2 && id == charset_ascii) \
2641 goto invalid_code; \
2642 } while (0)
2643
2644
2645 #define MAYBE_FINISH_COMPOSITION() \
2646 do { \
2647 int i; \
2648 if (composition_state == COMPOSING_NO) \
2649 break; \
2650 /* It is assured that we have enough room for producing \
2651 characters stored in the table `components'. */ \
2652 if (charbuf + component_idx > charbuf_end) \
2653 goto no_more_source; \
2654 composition_state = COMPOSING_NO; \
2655 if (method == COMPOSITION_RELATIVE \
2656 || method == COMPOSITION_WITH_ALTCHARS) \
2657 { \
2658 for (i = 0; i < component_idx; i++) \
2659 *charbuf++ = components[i]; \
2660 char_offset += component_idx; \
2661 } \
2662 else \
2663 { \
2664 for (i = 0; i < component_idx; i += 2) \
2665 *charbuf++ = components[i]; \
2666 char_offset += (component_idx / 2) + 1; \
2667 } \
2668 } while (0)
2669
2670
2671 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2672 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2673 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2674 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2675 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2676 */
2677
2678 #define DECODE_COMPOSITION_START(c1) \
2679 do { \
2680 if (c1 == '0' \
2681 && composition_state == COMPOSING_COMPONENT_RULE) \
2682 { \
2683 component_len = component_idx; \
2684 composition_state = COMPOSING_CHAR; \
2685 } \
2686 else \
2687 { \
2688 unsigned char *p; \
2689 \
2690 MAYBE_FINISH_COMPOSITION (); \
2691 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2692 goto no_more_source; \
2693 for (p = src; p < src_end - 1; p++) \
2694 if (*p == ISO_CODE_ESC && p[1] == '1') \
2695 break; \
2696 if (p == src_end - 1) \
2697 { \
2698 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2699 goto invalid_code; \
2700 goto no_more_source; \
2701 } \
2702 \
2703 /* This is surely the start of a composition. */ \
2704 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2705 : c1 == '2' ? COMPOSITION_WITH_RULE \
2706 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2707 : COMPOSITION_WITH_RULE_ALTCHARS); \
2708 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2709 : COMPOSING_COMPONENT_CHAR); \
2710 component_idx = component_len = 0; \
2711 } \
2712 } while (0)
2713
2714
2715 /* Handle compositoin end sequence ESC 1. */
2716
2717 #define DECODE_COMPOSITION_END() \
2718 do { \
2719 int nchars = (component_len > 0 ? component_idx - component_len \
2720 : method == COMPOSITION_RELATIVE ? component_idx \
2721 : (component_idx + 1) / 2); \
2722 int i; \
2723 int *saved_charbuf = charbuf; \
2724 int from = coding->produced_char + char_offset; \
2725 int to = from + nchars; \
2726 \
2727 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2728 if (method != COMPOSITION_RELATIVE) \
2729 { \
2730 if (component_len == 0) \
2731 for (i = 0; i < component_idx; i++) \
2732 *charbuf++ = components[i]; \
2733 else \
2734 for (i = 0; i < component_len; i++) \
2735 *charbuf++ = components[i]; \
2736 *saved_charbuf = saved_charbuf - charbuf; \
2737 } \
2738 if (method == COMPOSITION_WITH_RULE) \
2739 for (i = 0; i < component_idx; i += 2, char_offset++) \
2740 *charbuf++ = components[i]; \
2741 else \
2742 for (i = component_len; i < component_idx; i++, char_offset++) \
2743 *charbuf++ = components[i]; \
2744 coding->annotated = 1; \
2745 composition_state = COMPOSING_NO; \
2746 } while (0)
2747
2748
2749 /* Decode a composition rule from the byte C1 (and maybe one more byte
2750 from SRC) and store one encoded composition rule in
2751 coding->cmp_data. */
2752
2753 #define DECODE_COMPOSITION_RULE(c1) \
2754 do { \
2755 (c1) -= 32; \
2756 if (c1 < 81) /* old format (before ver.21) */ \
2757 { \
2758 int gref = (c1) / 9; \
2759 int nref = (c1) % 9; \
2760 if (gref == 4) gref = 10; \
2761 if (nref == 4) nref = 10; \
2762 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2763 } \
2764 else if (c1 < 93) /* new format (after ver.21) */ \
2765 { \
2766 ONE_MORE_BYTE (c2); \
2767 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2768 } \
2769 else \
2770 c1 = 0; \
2771 } while (0)
2772
2773
2774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2775
2776 static void
2777 decode_coding_iso_2022 (coding)
2778 struct coding_system *coding;
2779 {
2780 unsigned char *src = coding->source + coding->consumed;
2781 unsigned char *src_end = coding->source + coding->src_bytes;
2782 unsigned char *src_base;
2783 int *charbuf = coding->charbuf;
2784 int *charbuf_end
2785 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2786 int consumed_chars = 0, consumed_chars_base;
2787 int multibytep = coding->src_multibyte;
2788 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2789 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2790 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2791 struct charset *charset;
2792 int c;
2793 /* For handling composition sequence. */
2794 #define COMPOSING_NO 0
2795 #define COMPOSING_CHAR 1
2796 #define COMPOSING_RULE 2
2797 #define COMPOSING_COMPONENT_CHAR 3
2798 #define COMPOSING_COMPONENT_RULE 4
2799
2800 int composition_state = COMPOSING_NO;
2801 enum composition_method method;
2802 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2803 int component_idx;
2804 int component_len;
2805 Lisp_Object attrs, eol_type, charset_list;
2806 int char_offset = coding->produced_char;
2807 int last_offset = char_offset;
2808 int last_id = charset_ascii;
2809
2810 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2811 setup_iso_safe_charsets (attrs);
2812
2813 while (1)
2814 {
2815 int c1, c2;
2816
2817 src_base = src;
2818 consumed_chars_base = consumed_chars;
2819
2820 if (charbuf >= charbuf_end)
2821 break;
2822
2823 ONE_MORE_BYTE (c1);
2824
2825 /* We produce at most one character. */
2826 switch (iso_code_class [c1])
2827 {
2828 case ISO_0x20_or_0x7F:
2829 if (composition_state != COMPOSING_NO)
2830 {
2831 if (composition_state == COMPOSING_RULE
2832 || composition_state == COMPOSING_COMPONENT_RULE)
2833 {
2834 DECODE_COMPOSITION_RULE (c1);
2835 components[component_idx++] = c1;
2836 composition_state--;
2837 continue;
2838 }
2839 }
2840 if (charset_id_0 < 0
2841 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2842 /* This is SPACE or DEL. */
2843 charset = CHARSET_FROM_ID (charset_ascii);
2844 else
2845 charset = CHARSET_FROM_ID (charset_id_0);
2846 break;
2847
2848 case ISO_graphic_plane_0:
2849 if (composition_state != COMPOSING_NO)
2850 {
2851 if (composition_state == COMPOSING_RULE
2852 || composition_state == COMPOSING_COMPONENT_RULE)
2853 {
2854 DECODE_COMPOSITION_RULE (c1);
2855 components[component_idx++] = c1;
2856 composition_state--;
2857 continue;
2858 }
2859 }
2860 charset = CHARSET_FROM_ID (charset_id_0);
2861 break;
2862
2863 case ISO_0xA0_or_0xFF:
2864 if (charset_id_1 < 0
2865 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2866 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2867 goto invalid_code;
2868 /* This is a graphic character, we fall down ... */
2869
2870 case ISO_graphic_plane_1:
2871 if (charset_id_1 < 0)
2872 goto invalid_code;
2873 charset = CHARSET_FROM_ID (charset_id_1);
2874 break;
2875
2876 case ISO_carriage_return:
2877 if (c1 == '\r')
2878 {
2879 if (EQ (eol_type, Qdos))
2880 {
2881 if (src == src_end)
2882 {
2883 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2884 goto no_more_source;
2885 }
2886 if (*src == '\n')
2887 ONE_MORE_BYTE (c1);
2888 }
2889 else if (EQ (eol_type, Qmac))
2890 c1 = '\n';
2891 }
2892 /* fall through */
2893
2894 case ISO_control_0:
2895 MAYBE_FINISH_COMPOSITION ();
2896 charset = CHARSET_FROM_ID (charset_ascii);
2897 break;
2898
2899 case ISO_control_1:
2900 MAYBE_FINISH_COMPOSITION ();
2901 goto invalid_code;
2902
2903 case ISO_shift_out:
2904 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2905 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2906 goto invalid_code;
2907 CODING_ISO_INVOCATION (coding, 0) = 1;
2908 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2909 continue;
2910
2911 case ISO_shift_in:
2912 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2913 goto invalid_code;
2914 CODING_ISO_INVOCATION (coding, 0) = 0;
2915 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2916 continue;
2917
2918 case ISO_single_shift_2_7:
2919 case ISO_single_shift_2:
2920 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2921 goto invalid_code;
2922 /* SS2 is handled as an escape sequence of ESC 'N' */
2923 c1 = 'N';
2924 goto label_escape_sequence;
2925
2926 case ISO_single_shift_3:
2927 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2928 goto invalid_code;
2929 /* SS2 is handled as an escape sequence of ESC 'O' */
2930 c1 = 'O';
2931 goto label_escape_sequence;
2932
2933 case ISO_control_sequence_introducer:
2934 /* CSI is handled as an escape sequence of ESC '[' ... */
2935 c1 = '[';
2936 goto label_escape_sequence;
2937
2938 case ISO_escape:
2939 ONE_MORE_BYTE (c1);
2940 label_escape_sequence:
2941 /* Escape sequences handled here are invocation,
2942 designation, direction specification, and character
2943 composition specification. */
2944 switch (c1)
2945 {
2946 case '&': /* revision of following character set */
2947 ONE_MORE_BYTE (c1);
2948 if (!(c1 >= '@' && c1 <= '~'))
2949 goto invalid_code;
2950 ONE_MORE_BYTE (c1);
2951 if (c1 != ISO_CODE_ESC)
2952 goto invalid_code;
2953 ONE_MORE_BYTE (c1);
2954 goto label_escape_sequence;
2955
2956 case '$': /* designation of 2-byte character set */
2957 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2958 goto invalid_code;
2959 ONE_MORE_BYTE (c1);
2960 if (c1 >= '@' && c1 <= 'B')
2961 { /* designation of JISX0208.1978, GB2312.1980,
2962 or JISX0208.1980 */
2963 DECODE_DESIGNATION (0, 2, 0, c1);
2964 }
2965 else if (c1 >= 0x28 && c1 <= 0x2B)
2966 { /* designation of DIMENSION2_CHARS94 character set */
2967 ONE_MORE_BYTE (c2);
2968 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2969 }
2970 else if (c1 >= 0x2C && c1 <= 0x2F)
2971 { /* designation of DIMENSION2_CHARS96 character set */
2972 ONE_MORE_BYTE (c2);
2973 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2974 }
2975 else
2976 goto invalid_code;
2977 /* We must update these variables now. */
2978 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2979 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2980 continue;
2981
2982 case 'n': /* invocation of locking-shift-2 */
2983 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2984 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2985 goto invalid_code;
2986 CODING_ISO_INVOCATION (coding, 0) = 2;
2987 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2988 continue;
2989
2990 case 'o': /* invocation of locking-shift-3 */
2991 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2992 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2993 goto invalid_code;
2994 CODING_ISO_INVOCATION (coding, 0) = 3;
2995 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2996 continue;
2997
2998 case 'N': /* invocation of single-shift-2 */
2999 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3000 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3001 goto invalid_code;
3002 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3003 ONE_MORE_BYTE (c1);
3004 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3005 goto invalid_code;
3006 break;
3007
3008 case 'O': /* invocation of single-shift-3 */
3009 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3010 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3011 goto invalid_code;
3012 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3013 ONE_MORE_BYTE (c1);
3014 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3015 goto invalid_code;
3016 break;
3017
3018 case '0': case '2': case '3': case '4': /* start composition */
3019 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3020 goto invalid_code;
3021 DECODE_COMPOSITION_START (c1);
3022 continue;
3023
3024 case '1': /* end composition */
3025 if (composition_state == COMPOSING_NO)
3026 goto invalid_code;
3027 DECODE_COMPOSITION_END ();
3028 continue;
3029
3030 case '[': /* specification of direction */
3031 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3032 goto invalid_code;
3033 /* For the moment, nested direction is not supported.
3034 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3035 left-to-right, and nozero means right-to-left. */
3036 ONE_MORE_BYTE (c1);
3037 switch (c1)
3038 {
3039 case ']': /* end of the current direction */
3040 coding->mode &= ~CODING_MODE_DIRECTION;
3041
3042 case '0': /* end of the current direction */
3043 case '1': /* start of left-to-right direction */
3044 ONE_MORE_BYTE (c1);
3045 if (c1 == ']')
3046 coding->mode &= ~CODING_MODE_DIRECTION;
3047 else
3048 goto invalid_code;
3049 break;
3050
3051 case '2': /* start of right-to-left direction */
3052 ONE_MORE_BYTE (c1);
3053 if (c1 == ']')
3054 coding->mode |= CODING_MODE_DIRECTION;
3055 else
3056 goto invalid_code;
3057 break;
3058
3059 default:
3060 goto invalid_code;
3061 }
3062 continue;
3063
3064 case '%':
3065 ONE_MORE_BYTE (c1);
3066 if (c1 == '/')
3067 {
3068 /* CTEXT extended segment:
3069 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3070 We keep these bytes as is for the moment.
3071 They may be decoded by post-read-conversion. */
3072 int dim, M, L;
3073 int size;
3074
3075 ONE_MORE_BYTE (dim);
3076 ONE_MORE_BYTE (M);
3077 ONE_MORE_BYTE (L);
3078 size = ((M - 128) * 128) + (L - 128);
3079 if (charbuf + 8 + size > charbuf_end)
3080 goto break_loop;
3081 *charbuf++ = ISO_CODE_ESC;
3082 *charbuf++ = '%';
3083 *charbuf++ = '/';
3084 *charbuf++ = dim;
3085 *charbuf++ = BYTE8_TO_CHAR (M);
3086 *charbuf++ = BYTE8_TO_CHAR (L);
3087 while (size-- > 0)
3088 {
3089 ONE_MORE_BYTE (c1);
3090 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3091 }
3092 }
3093 else if (c1 == 'G')
3094 {
3095 /* XFree86 extension for embedding UTF-8 in CTEXT:
3096 ESC % G --UTF-8-BYTES-- ESC % @
3097 We keep these bytes as is for the moment.
3098 They may be decoded by post-read-conversion. */
3099 int *p = charbuf;
3100
3101 if (p + 6 > charbuf_end)
3102 goto break_loop;
3103 *p++ = ISO_CODE_ESC;
3104 *p++ = '%';
3105 *p++ = 'G';
3106 while (p < charbuf_end)
3107 {
3108 ONE_MORE_BYTE (c1);
3109 if (c1 == ISO_CODE_ESC
3110 && src + 1 < src_end
3111 && src[0] == '%'
3112 && src[1] == '@')
3113 break;
3114 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3115 }
3116 if (p + 3 > charbuf_end)
3117 goto break_loop;
3118 *p++ = ISO_CODE_ESC;
3119 *p++ = '%';
3120 *p++ = '@';
3121 charbuf = p;
3122 }
3123 else
3124 goto invalid_code;
3125 continue;
3126 break;
3127
3128 default:
3129 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3130 goto invalid_code;
3131 if (c1 >= 0x28 && c1 <= 0x2B)
3132 { /* designation of DIMENSION1_CHARS94 character set */
3133 ONE_MORE_BYTE (c2);
3134 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3135 }
3136 else if (c1 >= 0x2C && c1 <= 0x2F)
3137 { /* designation of DIMENSION1_CHARS96 character set */
3138 ONE_MORE_BYTE (c2);
3139 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3140 }
3141 else
3142 goto invalid_code;
3143 /* We must update these variables now. */
3144 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3145 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3146 continue;
3147 }
3148 }
3149
3150 if (charset->id != charset_ascii
3151 && last_id != charset->id)
3152 {
3153 if (last_id != charset_ascii)
3154 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3155 last_id = charset->id;
3156 last_offset = char_offset;
3157 }
3158
3159 /* Now we know CHARSET and 1st position code C1 of a character.
3160 Produce a decoded character while getting 2nd position code
3161 C2 if necessary. */
3162 c1 &= 0x7F;
3163 if (CHARSET_DIMENSION (charset) > 1)
3164 {
3165 ONE_MORE_BYTE (c2);
3166 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3167 /* C2 is not in a valid range. */
3168 goto invalid_code;
3169 c1 = (c1 << 8) | (c2 & 0x7F);
3170 if (CHARSET_DIMENSION (charset) > 2)
3171 {
3172 ONE_MORE_BYTE (c2);
3173 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3174 /* C2 is not in a valid range. */
3175 goto invalid_code;
3176 c1 = (c1 << 8) | (c2 & 0x7F);
3177 }
3178 }
3179
3180 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3181 if (c < 0)
3182 {
3183 MAYBE_FINISH_COMPOSITION ();
3184 for (; src_base < src; src_base++, char_offset++)
3185 {
3186 if (ASCII_BYTE_P (*src_base))
3187 *charbuf++ = *src_base;
3188 else
3189 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3190 }
3191 }
3192 else if (composition_state == COMPOSING_NO)
3193 {
3194 *charbuf++ = c;
3195 char_offset++;
3196 }
3197 else
3198 {
3199 components[component_idx++] = c;
3200 if (method == COMPOSITION_WITH_RULE
3201 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state == COMPOSING_COMPONENT_CHAR))
3203 composition_state++;
3204 }
3205 continue;
3206
3207 invalid_code:
3208 MAYBE_FINISH_COMPOSITION ();
3209 src = src_base;
3210 consumed_chars = consumed_chars_base;
3211 ONE_MORE_BYTE (c);
3212 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3213 char_offset++;
3214 coding->errors++;
3215 continue;
3216
3217 break_loop:
3218 break;
3219 }
3220
3221 no_more_source:
3222 if (last_id != charset_ascii)
3223 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224 coding->consumed_char += consumed_chars_base;
3225 coding->consumed = src_base - coding->source;
3226 coding->charbuf_used = charbuf - coding->charbuf;
3227 }
3228
3229
3230 /* ISO2022 encoding stuff. */
3231
3232 /*
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JIS0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3248 details.
3249 */
3250
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
3255
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3257 do { \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3262 int c; \
3263 \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3266 \
3267 if (revision >= 0) \
3268 { \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3271 } \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3274 { \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3277 else \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3280 } \
3281 else \
3282 { \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3285 { \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3287 || reg != 0 \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3290 } \
3291 else \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3293 } \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3295 \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3297 } while (0)
3298
3299
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
3302 single-shift-3). */
3303
3304 #define ENCODE_SINGLE_SHIFT_2 \
3305 do { \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3308 else \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3311 } while (0)
3312
3313
3314 #define ENCODE_SINGLE_SHIFT_3 \
3315 do { \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3318 else \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3321 } while (0)
3322
3323
3324 /* The following four macros produce codes (control character or
3325 escape sequence) for ISO2022 locking-shift functions (shift-in,
3326 shift-out, locking-shift-2, and locking-shift-3). */
3327
3328 #define ENCODE_SHIFT_IN \
3329 do { \
3330 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3331 CODING_ISO_INVOCATION (coding, 0) = 0; \
3332 } while (0)
3333
3334
3335 #define ENCODE_SHIFT_OUT \
3336 do { \
3337 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3338 CODING_ISO_INVOCATION (coding, 0) = 1; \
3339 } while (0)
3340
3341
3342 #define ENCODE_LOCKING_SHIFT_2 \
3343 do { \
3344 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3345 CODING_ISO_INVOCATION (coding, 0) = 2; \
3346 } while (0)
3347
3348
3349 #define ENCODE_LOCKING_SHIFT_3 \
3350 do { \
3351 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3352 CODING_ISO_INVOCATION (coding, 0) = 3; \
3353 } while (0)
3354
3355
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
3359
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3361 do { \
3362 int id = CHARSET_ID (charset); \
3363 \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3366 { \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3369 } \
3370 \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3372 { \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3375 else \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3378 break; \
3379 } \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3381 { \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3383 break; \
3384 } \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3386 { \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3388 break; \
3389 } \
3390 else \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3394 character. */ \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3396 &produced_chars); \
3397 } while (1)
3398
3399
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
3403
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3405 do { \
3406 int id = CHARSET_ID (charset); \
3407 \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3410 { \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3413 } \
3414 \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3416 { \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3419 else \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3422 break; \
3423 } \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3425 { \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3427 break; \
3428 } \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3430 { \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3432 break; \
3433 } \
3434 else \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3438 character. */ \
3439 dst = encode_invocation_designation (charset, coding, dst, \
3440 &produced_chars); \
3441 } while (1)
3442
3443
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3445 do { \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3447 \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3450 else \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3452 } while (0)
3453
3454
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3457 Return new DST. */
3458
3459 unsigned char *
3460 encode_invocation_designation (charset, coding, dst, p_nchars)
3461 struct charset *charset;
3462 struct coding_system *coding;
3463 unsigned char *dst;
3464 int *p_nchars;
3465 {
3466 int multibytep = coding->dst_multibyte;
3467 int produced_chars = *p_nchars;
3468 int reg; /* graphic register number */
3469 int id = CHARSET_ID (charset);
3470
3471 /* At first, check designations. */
3472 for (reg = 0; reg < 4; reg++)
3473 if (id == CODING_ISO_DESIGNATION (coding, reg))
3474 break;
3475
3476 if (reg >= 4)
3477 {
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg = CODING_ISO_REQUEST (coding, id);
3481 if (reg < 0)
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3484 reg = 0;
3485
3486 ENCODE_DESIGNATION (charset, reg, coding);
3487 }
3488
3489 if (CODING_ISO_INVOCATION (coding, 0) != reg
3490 && CODING_ISO_INVOCATION (coding, 1) != reg)
3491 {
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3494 switch (reg)
3495 {
3496 case 0: /* graphic register 0 */
3497 ENCODE_SHIFT_IN;
3498 break;
3499
3500 case 1: /* graphic register 1 */
3501 ENCODE_SHIFT_OUT;
3502 break;
3503
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3506 ENCODE_SINGLE_SHIFT_2;
3507 else
3508 ENCODE_LOCKING_SHIFT_2;
3509 break;
3510
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3513 ENCODE_SINGLE_SHIFT_3;
3514 else
3515 ENCODE_LOCKING_SHIFT_3;
3516 break;
3517 }
3518 }
3519
3520 *p_nchars = produced_chars;
3521 return dst;
3522 }
3523
3524 /* The following three macros produce codes for indicating direction
3525 of text. */
3526 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3527 do { \
3528 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3530 else \
3531 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3532 } while (0)
3533
3534
3535 #define ENCODE_DIRECTION_R2L() \
3536 do { \
3537 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3538 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3539 } while (0)
3540
3541
3542 #define ENCODE_DIRECTION_L2R() \
3543 do { \
3544 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3545 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3546 } while (0)
3547
3548
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3552 do { \
3553 int reg; \
3554 struct charset *charset; \
3555 \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3557 ENCODE_SHIFT_IN; \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3562 { \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3565 } \
3566 } while (0)
3567
3568
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3571
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
3574
3575 static unsigned char *
3576 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3577 struct coding_system *coding;
3578 int *charbuf, *charbuf_end;
3579 unsigned char *dst;
3580 {
3581 struct charset *charset;
3582 /* Table of charsets to be designated to each graphic register. */
3583 int r[4];
3584 int c, found = 0, reg;
3585 int produced_chars = 0;
3586 int multibytep = coding->dst_multibyte;
3587 Lisp_Object attrs;
3588 Lisp_Object charset_list;
3589
3590 attrs = CODING_ID_ATTRS (coding->id);
3591 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592 if (EQ (charset_list, Qiso_2022))
3593 charset_list = Viso_2022_charset_list;
3594
3595 for (reg = 0; reg < 4; reg++)
3596 r[reg] = -1;
3597
3598 while (found < 4)
3599 {
3600 int id;
3601
3602 c = *charbuf++;
3603 if (c == '\n')
3604 break;
3605 charset = char_charset (c, charset_list, NULL);
3606 id = CHARSET_ID (charset);
3607 reg = CODING_ISO_REQUEST (coding, id);
3608 if (reg >= 0 && r[reg] < 0)
3609 {
3610 found++;
3611 r[reg] = id;
3612 }
3613 }
3614
3615 if (found)
3616 {
3617 for (reg = 0; reg < 4; reg++)
3618 if (r[reg] >= 0
3619 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3621 }
3622
3623 return dst;
3624 }
3625
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3627
3628 static int
3629 encode_coding_iso_2022 (coding)
3630 struct coding_system *coding;
3631 {
3632 int multibytep = coding->dst_multibyte;
3633 int *charbuf = coding->charbuf;
3634 int *charbuf_end = charbuf + coding->charbuf_used;
3635 unsigned char *dst = coding->destination + coding->produced;
3636 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3637 int safe_room = 16;
3638 int bol_designation
3639 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding));
3641 int produced_chars = 0;
3642 Lisp_Object attrs, eol_type, charset_list;
3643 int ascii_compatible;
3644 int c;
3645 int preferred_charset_id = -1;
3646
3647 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3648 setup_iso_safe_charsets (attrs);
3649 /* Charset list may have been changed. */
3650 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3651 coding->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3653
3654 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3655
3656 while (charbuf < charbuf_end)
3657 {
3658 ASSURE_DESTINATION (safe_room);
3659
3660 if (bol_designation)
3661 {
3662 unsigned char *dst_prev = dst;
3663
3664 /* We have to produce designation sequences if any now. */
3665 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3666 bol_designation = 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars += dst - dst_prev;
3669 }
3670
3671 c = *charbuf++;
3672
3673 if (c < 0)
3674 {
3675 /* Handle an annotation. */
3676 switch (*charbuf)
3677 {
3678 case CODING_ANNOTATE_COMPOSITION_MASK:
3679 /* Not yet implemented. */
3680 break;
3681 case CODING_ANNOTATE_CHARSET_MASK:
3682 preferred_charset_id = charbuf[3];
3683 if (preferred_charset_id >= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id),
3685 charset_list)))
3686 preferred_charset_id = -1;
3687 break;
3688 default:
3689 abort ();
3690 }
3691 charbuf += -c - 1;
3692 continue;
3693 }
3694
3695 /* Now encode the character C. */
3696 if (c < 0x20 || c == 0x7F)
3697 {
3698 if (c == '\n'
3699 || (c == '\r' && EQ (eol_type, Qmac)))
3700 {
3701 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3704 {
3705 int i;
3706
3707 for (i = 0; i < 4; i++)
3708 CODING_ISO_DESIGNATION (coding, i)
3709 = CODING_ISO_INITIAL (coding, i);
3710 }
3711 bol_designation
3712 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3713 }
3714 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c);
3717 }
3718 else if (ASCII_CHAR_P (c))
3719 {
3720 if (ascii_compatible)
3721 EMIT_ONE_ASCII_BYTE (c);
3722 else
3723 {
3724 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3725 ENCODE_ISO_CHARACTER (charset, c);
3726 }
3727 }
3728 else if (CHAR_BYTE8_P (c))
3729 {
3730 c = CHAR_TO_BYTE8 (c);
3731 EMIT_ONE_BYTE (c);
3732 }
3733 else
3734 {
3735 struct charset *charset;
3736
3737 if (preferred_charset_id >= 0)
3738 {
3739 charset = CHARSET_FROM_ID (preferred_charset_id);
3740 if (! CHAR_CHARSET_P (c, charset))
3741 charset = char_charset (c, charset_list, NULL);
3742 }
3743 else
3744 charset = char_charset (c, charset_list, NULL);
3745 if (!charset)
3746 {
3747 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3748 {
3749 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3750 charset = CHARSET_FROM_ID (charset_ascii);
3751 }
3752 else
3753 {
3754 c = coding->default_char;
3755 charset = char_charset (c, charset_list, NULL);
3756 }
3757 }
3758 ENCODE_ISO_CHARACTER (charset, c);
3759 }
3760 }
3761
3762 if (coding->mode & CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3764 {
3765 ASSURE_DESTINATION (safe_room);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3767 }
3768 coding->result = CODING_RESULT_SUCCESS;
3769 CODING_ISO_BOL (coding) = bol_designation;
3770 coding->produced_char += produced_chars;
3771 coding->produced = dst - coding->destination;
3772 return 0;
3773 }
3774
3775 \f
3776 /*** 8,9. SJIS and BIG5 handlers ***/
3777
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779 quite widely. So, for the moment, Emacs supports them in the bare
3780 C code. But, in the future, they may be supported only by CCL. */
3781
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in 2-byte but two position-codes are divided and shifted
3787 so that it fit in the range below.
3788
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3791 ASCII 0x00 .. 0x7F
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3796
3797 */
3798
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and is encoded in two-byte.
3802
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3805 ASCII 0x00 .. 0x7F
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3809
3810 */
3811
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3815
3816 static int
3817 detect_coding_sjis (coding, detect_info)
3818 struct coding_system *coding;
3819 struct coding_detection_info *detect_info;
3820 {
3821 unsigned char *src = coding->source, *src_base = src;
3822 unsigned char *src_end = coding->source + coding->src_bytes;
3823 int multibytep = coding->src_multibyte;
3824 int consumed_chars = 0;
3825 int found = 0;
3826 int c;
3827 int incomplete;
3828
3829 detect_info->checked |= CATEGORY_MASK_SJIS;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src += coding->head_ascii;
3832
3833 while (1)
3834 {
3835 incomplete = 0;
3836 ONE_MORE_BYTE (c);
3837 incomplete = 1;
3838 if (c < 0x80)
3839 continue;
3840 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3841 {
3842 ONE_MORE_BYTE (c);
3843 if (c < 0x40 || c == 0x7F || c > 0xFC)
3844 break;
3845 found = CATEGORY_MASK_SJIS;
3846 }
3847 else if (c >= 0xA0 && c < 0xE0)
3848 found = CATEGORY_MASK_SJIS;
3849 else
3850 break;
3851 }
3852 detect_info->rejected |= CATEGORY_MASK_SJIS;
3853 return 0;
3854
3855 no_more_source:
3856 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3857 {
3858 detect_info->rejected |= CATEGORY_MASK_SJIS;
3859 return 0;
3860 }
3861 detect_info->found |= found;
3862 return 1;
3863 }
3864
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3868
3869 static int
3870 detect_coding_big5 (coding, detect_info)
3871 struct coding_system *coding;
3872 struct coding_detection_info *detect_info;
3873 {
3874 unsigned char *src = coding->source, *src_base = src;
3875 unsigned char *src_end = coding->source + coding->src_bytes;
3876 int multibytep = coding->src_multibyte;
3877 int consumed_chars = 0;
3878 int found = 0;
3879 int c;
3880 int incomplete;
3881
3882 detect_info->checked |= CATEGORY_MASK_BIG5;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src += coding->head_ascii;
3885
3886 while (1)
3887 {
3888 incomplete = 0;
3889 ONE_MORE_BYTE (c);
3890 incomplete = 1;
3891 if (c < 0x80)
3892 continue;
3893 if (c >= 0xA1)
3894 {
3895 ONE_MORE_BYTE (c);
3896 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3897 return 0;
3898 found = CATEGORY_MASK_BIG5;
3899 }
3900 else
3901 break;
3902 }
3903 detect_info->rejected |= CATEGORY_MASK_BIG5;
3904 return 0;
3905
3906 no_more_source:
3907 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3908 {
3909 detect_info->rejected |= CATEGORY_MASK_BIG5;
3910 return 0;
3911 }
3912 detect_info->found |= found;
3913 return 1;
3914 }
3915
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3918
3919 static void
3920 decode_coding_sjis (coding)
3921 struct coding_system *coding;
3922 {
3923 unsigned char *src = coding->source + coding->consumed;
3924 unsigned char *src_end = coding->source + coding->src_bytes;
3925 unsigned char *src_base;
3926 int *charbuf = coding->charbuf;
3927 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3928 int consumed_chars = 0, consumed_chars_base;
3929 int multibytep = coding->src_multibyte;
3930 struct charset *charset_roman, *charset_kanji, *charset_kana;
3931 Lisp_Object attrs, eol_type, charset_list, val;
3932 int char_offset = coding->produced_char;
3933 int last_offset = char_offset;
3934 int last_id = charset_ascii;
3935
3936 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3937
3938 val = charset_list;
3939 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3940 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3941 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3942
3943 while (1)
3944 {
3945 int c, c1;
3946
3947 src_base = src;
3948 consumed_chars_base = consumed_chars;
3949
3950 if (charbuf >= charbuf_end)
3951 break;
3952
3953 ONE_MORE_BYTE (c);
3954
3955 if (c == '\r')
3956 {
3957 if (EQ (eol_type, Qdos))
3958 {
3959 if (src == src_end)
3960 {
3961 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3962 goto no_more_source;
3963 }
3964 if (*src == '\n')
3965 ONE_MORE_BYTE (c);
3966 }
3967 else if (EQ (eol_type, Qmac))
3968 c = '\n';
3969 }
3970 else
3971 {
3972 struct charset *charset;
3973
3974 if (c < 0x80)
3975 charset = charset_roman;
3976 else
3977 {
3978 if (c >= 0xF0)
3979 goto invalid_code;
3980 if (c < 0xA0 || c >= 0xE0)
3981 {
3982 /* SJIS -> JISX0208 */
3983 ONE_MORE_BYTE (c1);
3984 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3985 goto invalid_code;
3986 c = (c << 8) | c1;
3987 SJIS_TO_JIS (c);
3988 charset = charset_kanji;
3989 }
3990 else if (c > 0xA0)
3991 {
3992 /* SJIS -> JISX0201-Kana */
3993 c &= 0x7F;
3994 charset = charset_kana;
3995 }
3996 else
3997 goto invalid_code;
3998 }
3999 if (charset->id != charset_ascii
4000 && last_id != charset->id)
4001 {
4002 if (last_id != charset_ascii)
4003 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4004 last_id = charset->id;
4005 last_offset = char_offset;
4006 }
4007 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4008 }
4009 *charbuf++ = c;
4010 char_offset++;
4011 continue;
4012
4013 invalid_code:
4014 src = src_base;
4015 consumed_chars = consumed_chars_base;
4016 ONE_MORE_BYTE (c);
4017 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4018 char_offset++;
4019 coding->errors++;
4020 }
4021
4022 no_more_source:
4023 if (last_id != charset_ascii)
4024 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4025 coding->consumed_char += consumed_chars_base;
4026 coding->consumed = src_base - coding->source;
4027 coding->charbuf_used = charbuf - coding->charbuf;
4028 }
4029
4030 static void
4031 decode_coding_big5 (coding)
4032 struct coding_system *coding;
4033 {
4034 unsigned char *src = coding->source + coding->consumed;
4035 unsigned char *src_end = coding->source + coding->src_bytes;
4036 unsigned char *src_base;
4037 int *charbuf = coding->charbuf;
4038 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4039 int consumed_chars = 0, consumed_chars_base;
4040 int multibytep = coding->src_multibyte;
4041 struct charset *charset_roman, *charset_big5;
4042 Lisp_Object attrs, eol_type, charset_list, val;
4043 int char_offset = coding->produced_char;
4044 int last_offset = char_offset;
4045 int last_id = charset_ascii;
4046
4047 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4048 val = charset_list;
4049 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4050 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4051
4052 while (1)
4053 {
4054 int c, c1;
4055
4056 src_base = src;
4057 consumed_chars_base = consumed_chars;
4058
4059 if (charbuf >= charbuf_end)
4060 break;
4061
4062 ONE_MORE_BYTE (c);
4063
4064 if (c == '\r')
4065 {
4066 if (EQ (eol_type, Qdos))
4067 {
4068 if (src == src_end)
4069 {
4070 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4071 goto no_more_source;
4072 }
4073 if (*src == '\n')
4074 ONE_MORE_BYTE (c);
4075 }
4076 else if (EQ (eol_type, Qmac))
4077 c = '\n';
4078 }
4079 else
4080 {
4081 struct charset *charset;
4082 if (c < 0x80)
4083 charset = charset_roman;
4084 else
4085 {
4086 /* BIG5 -> Big5 */
4087 if (c < 0xA1 || c > 0xFE)
4088 goto invalid_code;
4089 ONE_MORE_BYTE (c1);
4090 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4091 goto invalid_code;
4092 c = c << 8 | c1;
4093 charset = charset_big5;
4094 }
4095 if (charset->id != charset_ascii
4096 && last_id != charset->id)
4097 {
4098 if (last_id != charset_ascii)
4099 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4100 last_id = charset->id;
4101 last_offset = char_offset;
4102 }
4103 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4104 }
4105
4106 *charbuf++ = c;
4107 char_offset++;
4108 continue;
4109
4110 invalid_code:
4111 src = src_base;
4112 consumed_chars = consumed_chars_base;
4113 ONE_MORE_BYTE (c);
4114 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4115 char_offset++;
4116 coding->errors++;
4117 }
4118
4119 no_more_source:
4120 if (last_id != charset_ascii)
4121 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4122 coding->consumed_char += consumed_chars_base;
4123 coding->consumed = src_base - coding->source;
4124 coding->charbuf_used = charbuf - coding->charbuf;
4125 }
4126
4127 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4128 This function can encode charsets `ascii', `katakana-jisx0201',
4129 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4130 are sure that all these charsets are registered as official charset
4131 (i.e. do not have extended leading-codes). Characters of other
4132 charsets are produced without any encoding. If SJIS_P is 1, encode
4133 SJIS text, else encode BIG5 text. */
4134
4135 static int
4136 encode_coding_sjis (coding)
4137 struct coding_system *coding;
4138 {
4139 int multibytep = coding->dst_multibyte;
4140 int *charbuf = coding->charbuf;
4141 int *charbuf_end = charbuf + coding->charbuf_used;
4142 unsigned char *dst = coding->destination + coding->produced;
4143 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4144 int safe_room = 4;
4145 int produced_chars = 0;
4146 Lisp_Object attrs, eol_type, charset_list, val;
4147 int ascii_compatible;
4148 struct charset *charset_roman, *charset_kanji, *charset_kana;
4149 int c;
4150
4151 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4152 val = charset_list;
4153 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4154 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4155 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4156
4157 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4158
4159 while (charbuf < charbuf_end)
4160 {
4161 ASSURE_DESTINATION (safe_room);
4162 c = *charbuf++;
4163 /* Now encode the character C. */
4164 if (ASCII_CHAR_P (c) && ascii_compatible)
4165 EMIT_ONE_ASCII_BYTE (c);
4166 else if (CHAR_BYTE8_P (c))
4167 {
4168 c = CHAR_TO_BYTE8 (c);
4169 EMIT_ONE_BYTE (c);
4170 }
4171 else
4172 {
4173 unsigned code;
4174 struct charset *charset = char_charset (c, charset_list, &code);
4175
4176 if (!charset)
4177 {
4178 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4179 {
4180 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4181 charset = CHARSET_FROM_ID (charset_ascii);
4182 }
4183 else
4184 {
4185 c = coding->default_char;
4186 charset = char_charset (c, charset_list, &code);
4187 }
4188 }
4189 if (code == CHARSET_INVALID_CODE (charset))
4190 abort ();
4191 if (charset == charset_kanji)
4192 {
4193 int c1, c2;
4194 JIS_TO_SJIS (code);
4195 c1 = code >> 8, c2 = code & 0xFF;
4196 EMIT_TWO_BYTES (c1, c2);
4197 }
4198 else if (charset == charset_kana)
4199 EMIT_ONE_BYTE (code | 0x80);
4200 else
4201 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4202 }
4203 }
4204 coding->result = CODING_RESULT_SUCCESS;
4205 coding->produced_char += produced_chars;
4206 coding->produced = dst - coding->destination;
4207 return 0;
4208 }
4209
4210 static int
4211 encode_coding_big5 (coding)
4212 struct coding_system *coding;
4213 {
4214 int multibytep = coding->dst_multibyte;
4215 int *charbuf = coding->charbuf;
4216 int *charbuf_end = charbuf + coding->charbuf_used;
4217 unsigned char *dst = coding->destination + coding->produced;
4218 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4219 int safe_room = 4;
4220 int produced_chars = 0;
4221 Lisp_Object attrs, eol_type, charset_list, val;
4222 int ascii_compatible;
4223 struct charset *charset_roman, *charset_big5;
4224 int c;
4225
4226 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4227 val = charset_list;
4228 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4229 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4230 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4231
4232 while (charbuf < charbuf_end)
4233 {
4234 ASSURE_DESTINATION (safe_room);
4235 c = *charbuf++;
4236 /* Now encode the character C. */
4237 if (ASCII_CHAR_P (c) && ascii_compatible)
4238 EMIT_ONE_ASCII_BYTE (c);
4239 else if (CHAR_BYTE8_P (c))
4240 {
4241 c = CHAR_TO_BYTE8 (c);
4242 EMIT_ONE_BYTE (c);
4243 }
4244 else
4245 {
4246 unsigned code;
4247 struct charset *charset = char_charset (c, charset_list, &code);
4248
4249 if (! charset)
4250 {
4251 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4252 {
4253 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4254 charset = CHARSET_FROM_ID (charset_ascii);
4255 }
4256 else
4257 {
4258 c = coding->default_char;
4259 charset = char_charset (c, charset_list, &code);
4260 }
4261 }
4262 if (code == CHARSET_INVALID_CODE (charset))
4263 abort ();
4264 if (charset == charset_big5)
4265 {
4266 int c1, c2;
4267
4268 c1 = code >> 8, c2 = code & 0xFF;
4269 EMIT_TWO_BYTES (c1, c2);
4270 }
4271 else
4272 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4273 }
4274 }
4275 coding->result = CODING_RESULT_SUCCESS;
4276 coding->produced_char += produced_chars;
4277 coding->produced = dst - coding->destination;
4278 return 0;
4279 }
4280
4281 \f
4282 /*** 10. CCL handlers ***/
4283
4284 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4285 Check if a text is encoded in a coding system of which
4286 encoder/decoder are written in CCL program. If it is, return
4287 CATEGORY_MASK_CCL, else return 0. */
4288
4289 static int
4290 detect_coding_ccl (coding, detect_info)
4291 struct coding_system *coding;
4292 struct coding_detection_info *detect_info;
4293 {
4294 unsigned char *src = coding->source, *src_base = src;
4295 unsigned char *src_end = coding->source + coding->src_bytes;
4296 int multibytep = coding->src_multibyte;
4297 int consumed_chars = 0;
4298 int found = 0;
4299 unsigned char *valids = CODING_CCL_VALIDS (coding);
4300 int head_ascii = coding->head_ascii;
4301 Lisp_Object attrs;
4302
4303 detect_info->checked |= CATEGORY_MASK_CCL;
4304
4305 coding = &coding_categories[coding_category_ccl];
4306 attrs = CODING_ID_ATTRS (coding->id);
4307 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4308 src += head_ascii;
4309
4310 while (1)
4311 {
4312 int c;
4313 ONE_MORE_BYTE (c);
4314 if (! valids[c])
4315 break;
4316 if ((valids[c] > 1))
4317 found = CATEGORY_MASK_CCL;
4318 }
4319 detect_info->rejected |= CATEGORY_MASK_CCL;
4320 return 0;
4321
4322 no_more_source:
4323 detect_info->found |= found;
4324 return 1;
4325 }
4326
4327 static void
4328 decode_coding_ccl (coding)
4329 struct coding_system *coding;
4330 {
4331 const unsigned char *src = coding->source + coding->consumed;
4332 unsigned char *src_end = coding->source + coding->src_bytes;
4333 int *charbuf = coding->charbuf;
4334 int *charbuf_end = charbuf + coding->charbuf_size;
4335 int consumed_chars = 0;
4336 int multibytep = coding->src_multibyte;
4337 struct ccl_program ccl;
4338 int source_charbuf[1024];
4339 int source_byteidx[1024];
4340 Lisp_Object attrs, eol_type, charset_list;
4341
4342 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4343 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4344
4345 while (src < src_end)
4346 {
4347 const unsigned char *p = src;
4348 int *source, *source_end;
4349 int i = 0;
4350
4351 if (multibytep)
4352 while (i < 1024 && p < src_end)
4353 {
4354 source_byteidx[i] = p - src;
4355 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4356 }
4357 else
4358 while (i < 1024 && p < src_end)
4359 source_charbuf[i++] = *p++;
4360
4361 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4362 ccl.last_block = 1;
4363
4364 source = source_charbuf;
4365 source_end = source + i;
4366 while (source < source_end)
4367 {
4368 ccl_driver (&ccl, source, charbuf,
4369 source_end - source, charbuf_end - charbuf,
4370 charset_list);
4371 source += ccl.consumed;
4372 charbuf += ccl.produced;
4373 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4374 break;
4375 }
4376 if (source < source_end)
4377 src += source_byteidx[source - source_charbuf];
4378 else
4379 src = p;
4380 consumed_chars += source - source_charbuf;
4381
4382 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4383 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4384 break;
4385 }
4386
4387 switch (ccl.status)
4388 {
4389 case CCL_STAT_SUSPEND_BY_SRC:
4390 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4391 break;
4392 case CCL_STAT_SUSPEND_BY_DST:
4393 break;
4394 case CCL_STAT_QUIT:
4395 case CCL_STAT_INVALID_CMD:
4396 coding->result = CODING_RESULT_INTERRUPT;
4397 break;
4398 default:
4399 coding->result = CODING_RESULT_SUCCESS;
4400 break;
4401 }
4402 coding->consumed_char += consumed_chars;
4403 coding->consumed = src - coding->source;
4404 coding->charbuf_used = charbuf - coding->charbuf;
4405 }
4406
4407 static int
4408 encode_coding_ccl (coding)
4409 struct coding_system *coding;
4410 {
4411 struct ccl_program ccl;
4412 int multibytep = coding->dst_multibyte;
4413 int *charbuf = coding->charbuf;
4414 int *charbuf_end = charbuf + coding->charbuf_used;
4415 unsigned char *dst = coding->destination + coding->produced;
4416 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4417 unsigned char *adjusted_dst_end = dst_end - 1;
4418 int destination_charbuf[1024];
4419 int i, produced_chars = 0;
4420 Lisp_Object attrs, eol_type, charset_list;
4421
4422 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4423 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4424
4425 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4426 ccl.dst_multibyte = coding->dst_multibyte;
4427
4428 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4429 {
4430 int dst_bytes = dst_end - dst;
4431 if (dst_bytes > 1024)
4432 dst_bytes = 1024;
4433
4434 ccl_driver (&ccl, charbuf, destination_charbuf,
4435 charbuf_end - charbuf, dst_bytes, charset_list);
4436 charbuf += ccl.consumed;
4437 if (multibytep)
4438 for (i = 0; i < ccl.produced; i++)
4439 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4440 else
4441 {
4442 for (i = 0; i < ccl.produced; i++)
4443 *dst++ = destination_charbuf[i] & 0xFF;
4444 produced_chars += ccl.produced;
4445 }
4446 }
4447
4448 switch (ccl.status)
4449 {
4450 case CCL_STAT_SUSPEND_BY_SRC:
4451 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4452 break;
4453 case CCL_STAT_SUSPEND_BY_DST:
4454 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4455 break;
4456 case CCL_STAT_QUIT:
4457 case CCL_STAT_INVALID_CMD:
4458 coding->result = CODING_RESULT_INTERRUPT;
4459 break;
4460 default:
4461 coding->result = CODING_RESULT_SUCCESS;
4462 break;
4463 }
4464
4465 coding->produced_char += produced_chars;
4466 coding->produced = dst - coding->destination;
4467 return 0;
4468 }
4469
4470
4471 \f
4472 /*** 10, 11. no-conversion handlers ***/
4473
4474 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4475
4476 static void
4477 decode_coding_raw_text (coding)
4478 struct coding_system *coding;
4479 {
4480 coding->chars_at_source = 1;
4481 coding->consumed_char = 0;
4482 coding->consumed = 0;
4483 coding->result = CODING_RESULT_SUCCESS;
4484 }
4485
4486 static int
4487 encode_coding_raw_text (coding)
4488 struct coding_system *coding;
4489 {
4490 int multibytep = coding->dst_multibyte;
4491 int *charbuf = coding->charbuf;
4492 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4493 unsigned char *dst = coding->destination + coding->produced;
4494 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4495 int produced_chars = 0;
4496 int c;
4497
4498 if (multibytep)
4499 {
4500 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4501
4502 if (coding->src_multibyte)
4503 while (charbuf < charbuf_end)
4504 {
4505 ASSURE_DESTINATION (safe_room);
4506 c = *charbuf++;
4507 if (ASCII_CHAR_P (c))
4508 EMIT_ONE_ASCII_BYTE (c);
4509 else if (CHAR_BYTE8_P (c))
4510 {
4511 c = CHAR_TO_BYTE8 (c);
4512 EMIT_ONE_BYTE (c);
4513 }
4514 else
4515 {
4516 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4517
4518 CHAR_STRING_ADVANCE (c, p1);
4519 while (p0 < p1)
4520 {
4521 EMIT_ONE_BYTE (*p0);
4522 p0++;
4523 }
4524 }
4525 }
4526 else
4527 while (charbuf < charbuf_end)
4528 {
4529 ASSURE_DESTINATION (safe_room);
4530 c = *charbuf++;
4531 EMIT_ONE_BYTE (c);
4532 }
4533 }
4534 else
4535 {
4536 if (coding->src_multibyte)
4537 {
4538 int safe_room = MAX_MULTIBYTE_LENGTH;
4539
4540 while (charbuf < charbuf_end)
4541 {
4542 ASSURE_DESTINATION (safe_room);
4543 c = *charbuf++;
4544 if (ASCII_CHAR_P (c))
4545 *dst++ = c;
4546 else if (CHAR_BYTE8_P (c))
4547 *dst++ = CHAR_TO_BYTE8 (c);
4548 else
4549 CHAR_STRING_ADVANCE (c, dst);
4550 produced_chars++;
4551 }
4552 }
4553 else
4554 {
4555 ASSURE_DESTINATION (charbuf_end - charbuf);
4556 while (charbuf < charbuf_end && dst < dst_end)
4557 *dst++ = *charbuf++;
4558 produced_chars = dst - (coding->destination + coding->dst_bytes);
4559 }
4560 }
4561 coding->result = CODING_RESULT_SUCCESS;
4562 coding->produced_char += produced_chars;
4563 coding->produced = dst - coding->destination;
4564 return 0;
4565 }
4566
4567 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4568 Check if a text is encoded in a charset-based coding system. If it
4569 is, return 1, else return 0. */
4570
4571 static int
4572 detect_coding_charset (coding, detect_info)
4573 struct coding_system *coding;
4574 struct coding_detection_info *detect_info;
4575 {
4576 unsigned char *src = coding->source, *src_base = src;
4577 unsigned char *src_end = coding->source + coding->src_bytes;
4578 int multibytep = coding->src_multibyte;
4579 int consumed_chars = 0;
4580 Lisp_Object attrs, valids;
4581 int found = 0;
4582
4583 detect_info->checked |= CATEGORY_MASK_CHARSET;
4584
4585 coding = &coding_categories[coding_category_charset];
4586 attrs = CODING_ID_ATTRS (coding->id);
4587 valids = AREF (attrs, coding_attr_charset_valids);
4588
4589 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4590 src += coding->head_ascii;
4591
4592 while (1)
4593 {
4594 int c;
4595
4596 ONE_MORE_BYTE (c);
4597 if (NILP (AREF (valids, c)))
4598 break;
4599 if (c >= 0x80)
4600 found = CATEGORY_MASK_CHARSET;
4601 }
4602 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4603 return 0;
4604
4605 no_more_source:
4606 detect_info->found |= found;
4607 return 1;
4608 }
4609
4610 static void
4611 decode_coding_charset (coding)
4612 struct coding_system *coding;
4613 {
4614 unsigned char *src = coding->source + coding->consumed;
4615 unsigned char *src_end = coding->source + coding->src_bytes;
4616 unsigned char *src_base;
4617 int *charbuf = coding->charbuf;
4618 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4619 int consumed_chars = 0, consumed_chars_base;
4620 int multibytep = coding->src_multibyte;
4621 Lisp_Object attrs, eol_type, charset_list, valids;
4622 int char_offset = coding->produced_char;
4623 int last_offset = char_offset;
4624 int last_id = charset_ascii;
4625
4626 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4627 valids = AREF (attrs, coding_attr_charset_valids);
4628
4629 while (1)
4630 {
4631 int c;
4632
4633 src_base = src;
4634 consumed_chars_base = consumed_chars;
4635
4636 if (charbuf >= charbuf_end)
4637 break;
4638
4639 ONE_MORE_BYTE (c);
4640 if (c == '\r')
4641 {
4642 /* Here we assume that no charset maps '\r' to something
4643 else. */
4644 if (EQ (eol_type, Qdos))
4645 {
4646 if (src == src_end)
4647 {
4648 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4649 goto no_more_source;
4650 }
4651 if (*src == '\n')
4652 ONE_MORE_BYTE (c);
4653 }
4654 else if (EQ (eol_type, Qmac))
4655 c = '\n';
4656 }
4657 else
4658 {
4659 Lisp_Object val;
4660 struct charset *charset;
4661 int dim;
4662 int len = 1;
4663 unsigned code = c;
4664
4665 val = AREF (valids, c);
4666 if (NILP (val))
4667 goto invalid_code;
4668 if (INTEGERP (val))
4669 {
4670 charset = CHARSET_FROM_ID (XFASTINT (val));
4671 dim = CHARSET_DIMENSION (charset);
4672 while (len < dim)
4673 {
4674 ONE_MORE_BYTE (c);
4675 code = (code << 8) | c;
4676 len++;
4677 }
4678 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4679 charset, code, c);
4680 }
4681 else
4682 {
4683 /* VAL is a list of charset IDs. It is assured that the
4684 list is sorted by charset dimensions (smaller one
4685 comes first). */
4686 while (CONSP (val))
4687 {
4688 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4689 dim = CHARSET_DIMENSION (charset);
4690 while (len < dim)
4691 {
4692 ONE_MORE_BYTE (c);
4693 code = (code << 8) | c;
4694 len++;
4695 }
4696 CODING_DECODE_CHAR (coding, src, src_base,
4697 src_end, charset, code, c);
4698 if (c >= 0)
4699 break;
4700 val = XCDR (val);
4701 }
4702 }
4703 if (c < 0)
4704 goto invalid_code;
4705 if (charset->id != charset_ascii
4706 && last_id != charset->id)
4707 {
4708 if (last_id != charset_ascii)
4709 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4710 last_id = charset->id;
4711 last_offset = char_offset;
4712 }
4713 }
4714 *charbuf++ = c;
4715 char_offset++;
4716 continue;
4717
4718 invalid_code:
4719 src = src_base;
4720 consumed_chars = consumed_chars_base;
4721 ONE_MORE_BYTE (c);
4722 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4723 char_offset++;
4724 coding->errors++;
4725 }
4726
4727 no_more_source:
4728 if (last_id != charset_ascii)
4729 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4730 coding->consumed_char += consumed_chars_base;
4731 coding->consumed = src_base - coding->source;
4732 coding->charbuf_used = charbuf - coding->charbuf;
4733 }
4734
4735 static int
4736 encode_coding_charset (coding)
4737 struct coding_system *coding;
4738 {
4739 int multibytep = coding->dst_multibyte;
4740 int *charbuf = coding->charbuf;
4741 int *charbuf_end = charbuf + coding->charbuf_used;
4742 unsigned char *dst = coding->destination + coding->produced;
4743 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4744 int safe_room = MAX_MULTIBYTE_LENGTH;
4745 int produced_chars = 0;
4746 Lisp_Object attrs, eol_type, charset_list;
4747 int ascii_compatible;
4748 int c;
4749
4750 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4751 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4752
4753 while (charbuf < charbuf_end)
4754 {
4755 struct charset *charset;
4756 unsigned code;
4757
4758 ASSURE_DESTINATION (safe_room);
4759 c = *charbuf++;
4760 if (ascii_compatible && ASCII_CHAR_P (c))
4761 EMIT_ONE_ASCII_BYTE (c);
4762 else if (CHAR_BYTE8_P (c))
4763 {
4764 c = CHAR_TO_BYTE8 (c);
4765 EMIT_ONE_BYTE (c);
4766 }
4767 else
4768 {
4769 charset = char_charset (c, charset_list, &code);
4770 if (charset)
4771 {
4772 if (CHARSET_DIMENSION (charset) == 1)
4773 EMIT_ONE_BYTE (code);
4774 else if (CHARSET_DIMENSION (charset) == 2)
4775 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4776 else if (CHARSET_DIMENSION (charset) == 3)
4777 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4778 else
4779 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4780 (code >> 8) & 0xFF, code & 0xFF);
4781 }
4782 else
4783 {
4784 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4785 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4786 else
4787 c = coding->default_char;
4788 EMIT_ONE_BYTE (c);
4789 }
4790 }
4791 }
4792
4793 coding->result = CODING_RESULT_SUCCESS;
4794 coding->produced_char += produced_chars;
4795 coding->produced = dst - coding->destination;
4796 return 0;
4797 }
4798
4799 \f
4800 /*** 7. C library functions ***/
4801
4802 /* Setup coding context CODING from information about CODING_SYSTEM.
4803 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4804 CODING_SYSTEM is invalid, signal an error. */
4805
4806 void
4807 setup_coding_system (coding_system, coding)
4808 Lisp_Object coding_system;
4809 struct coding_system *coding;
4810 {
4811 Lisp_Object attrs;
4812 Lisp_Object eol_type;
4813 Lisp_Object coding_type;
4814 Lisp_Object val;
4815
4816 if (NILP (coding_system))
4817 coding_system = Qno_conversion;
4818
4819 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4820
4821 attrs = CODING_ID_ATTRS (coding->id);
4822 eol_type = CODING_ID_EOL_TYPE (coding->id);
4823
4824 coding->mode = 0;
4825 coding->head_ascii = -1;
4826 coding->common_flags
4827 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4828 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4829 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4830 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4831 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4832
4833 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4834 coding->max_charset_id = XSTRING (val)->size - 1;
4835 coding->safe_charsets = (char *) XSTRING (val)->data;
4836 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4837
4838 coding_type = CODING_ATTR_TYPE (attrs);
4839 if (EQ (coding_type, Qundecided))
4840 {
4841 coding->detector = NULL;
4842 coding->decoder = decode_coding_raw_text;
4843 coding->encoder = encode_coding_raw_text;
4844 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4845 }
4846 else if (EQ (coding_type, Qiso_2022))
4847 {
4848 int i;
4849 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4850
4851 /* Invoke graphic register 0 to plane 0. */
4852 CODING_ISO_INVOCATION (coding, 0) = 0;
4853 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4854 CODING_ISO_INVOCATION (coding, 1)
4855 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4856 /* Setup the initial status of designation. */
4857 for (i = 0; i < 4; i++)
4858 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4859 /* Not single shifting initially. */
4860 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4861 /* Beginning of buffer should also be regarded as bol. */
4862 CODING_ISO_BOL (coding) = 1;
4863 coding->detector = detect_coding_iso_2022;
4864 coding->decoder = decode_coding_iso_2022;
4865 coding->encoder = encode_coding_iso_2022;
4866 if (flags & CODING_ISO_FLAG_SAFE)
4867 coding->mode |= CODING_MODE_SAFE_ENCODING;
4868 coding->common_flags
4869 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4870 | CODING_REQUIRE_FLUSHING_MASK);
4871 if (flags & CODING_ISO_FLAG_COMPOSITION)
4872 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4873 if (flags & CODING_ISO_FLAG_DESIGNATION)
4874 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4875 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4876 {
4877 setup_iso_safe_charsets (attrs);
4878 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4879 coding->max_charset_id = XSTRING (val)->size - 1;
4880 coding->safe_charsets = (char *) XSTRING (val)->data;
4881 }
4882 CODING_ISO_FLAGS (coding) = flags;
4883 }
4884 else if (EQ (coding_type, Qcharset))
4885 {
4886 coding->detector = detect_coding_charset;
4887 coding->decoder = decode_coding_charset;
4888 coding->encoder = encode_coding_charset;
4889 coding->common_flags
4890 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4891 }
4892 else if (EQ (coding_type, Qutf_8))
4893 {
4894 coding->detector = detect_coding_utf_8;
4895 coding->decoder = decode_coding_utf_8;
4896 coding->encoder = encode_coding_utf_8;
4897 coding->common_flags
4898 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4899 }
4900 else if (EQ (coding_type, Qutf_16))
4901 {
4902 val = AREF (attrs, coding_attr_utf_16_bom);
4903 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4904 : EQ (val, Qt) ? utf_16_with_bom
4905 : utf_16_without_bom);
4906 val = AREF (attrs, coding_attr_utf_16_endian);
4907 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4908 : utf_16_little_endian);
4909 CODING_UTF_16_SURROGATE (coding) = 0;
4910 coding->detector = detect_coding_utf_16;
4911 coding->decoder = decode_coding_utf_16;
4912 coding->encoder = encode_coding_utf_16;
4913 coding->common_flags
4914 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4915 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4916 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4917 }
4918 else if (EQ (coding_type, Qccl))
4919 {
4920 coding->detector = detect_coding_ccl;
4921 coding->decoder = decode_coding_ccl;
4922 coding->encoder = encode_coding_ccl;
4923 coding->common_flags
4924 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4925 | CODING_REQUIRE_FLUSHING_MASK);
4926 }
4927 else if (EQ (coding_type, Qemacs_mule))
4928 {
4929 coding->detector = detect_coding_emacs_mule;
4930 coding->decoder = decode_coding_emacs_mule;
4931 coding->encoder = encode_coding_emacs_mule;
4932 coding->common_flags
4933 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4934 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4935 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4936 {
4937 Lisp_Object tail, safe_charsets;
4938 int max_charset_id = 0;
4939
4940 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4941 tail = XCDR (tail))
4942 if (max_charset_id < XFASTINT (XCAR (tail)))
4943 max_charset_id = XFASTINT (XCAR (tail));
4944 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4945 make_number (255));
4946 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4947 tail = XCDR (tail))
4948 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4949 coding->max_charset_id = max_charset_id;
4950 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4951 }
4952 }
4953 else if (EQ (coding_type, Qshift_jis))
4954 {
4955 coding->detector = detect_coding_sjis;
4956 coding->decoder = decode_coding_sjis;
4957 coding->encoder = encode_coding_sjis;
4958 coding->common_flags
4959 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4960 }
4961 else if (EQ (coding_type, Qbig5))
4962 {
4963 coding->detector = detect_coding_big5;
4964 coding->decoder = decode_coding_big5;
4965 coding->encoder = encode_coding_big5;
4966 coding->common_flags
4967 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4968 }
4969 else /* EQ (coding_type, Qraw_text) */
4970 {
4971 coding->detector = NULL;
4972 coding->decoder = decode_coding_raw_text;
4973 coding->encoder = encode_coding_raw_text;
4974 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4975 }
4976
4977 return;
4978 }
4979
4980 /* Return raw-text or one of its subsidiaries that has the same
4981 eol_type as CODING-SYSTEM. */
4982
4983 Lisp_Object
4984 raw_text_coding_system (coding_system)
4985 Lisp_Object coding_system;
4986 {
4987 Lisp_Object spec, attrs;
4988 Lisp_Object eol_type, raw_text_eol_type;
4989
4990 if (NILP (coding_system))
4991 return Qraw_text;
4992 spec = CODING_SYSTEM_SPEC (coding_system);
4993 attrs = AREF (spec, 0);
4994
4995 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4996 return coding_system;
4997
4998 eol_type = AREF (spec, 2);
4999 if (VECTORP (eol_type))
5000 return Qraw_text;
5001 spec = CODING_SYSTEM_SPEC (Qraw_text);
5002 raw_text_eol_type = AREF (spec, 2);
5003 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5004 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5005 : AREF (raw_text_eol_type, 2));
5006 }
5007
5008
5009 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5010 does, return one of the subsidiary that has the same eol-spec as
5011 PARENT. Otherwise, return CODING_SYSTEM. */
5012
5013 Lisp_Object
5014 coding_inherit_eol_type (coding_system, parent)
5015 Lisp_Object coding_system, parent;
5016 {
5017 Lisp_Object spec, attrs, eol_type;
5018
5019 if (NILP (coding_system))
5020 coding_system = Qraw_text;
5021 spec = CODING_SYSTEM_SPEC (coding_system);
5022 attrs = AREF (spec, 0);
5023 eol_type = AREF (spec, 2);
5024 if (VECTORP (eol_type)
5025 && ! NILP (parent))
5026 {
5027 Lisp_Object parent_spec;
5028 Lisp_Object parent_eol_type;
5029
5030 parent_spec
5031 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5032 parent_eol_type = AREF (parent_spec, 2);
5033 if (EQ (parent_eol_type, Qunix))
5034 coding_system = AREF (eol_type, 0);
5035 else if (EQ (parent_eol_type, Qdos))
5036 coding_system = AREF (eol_type, 1);
5037 else if (EQ (parent_eol_type, Qmac))
5038 coding_system = AREF (eol_type, 2);
5039 }
5040 return coding_system;
5041 }
5042
5043 /* Emacs has a mechanism to automatically detect a coding system if it
5044 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5045 it's impossible to distinguish some coding systems accurately
5046 because they use the same range of codes. So, at first, coding
5047 systems are categorized into 7, those are:
5048
5049 o coding-category-emacs-mule
5050
5051 The category for a coding system which has the same code range
5052 as Emacs' internal format. Assigned the coding-system (Lisp
5053 symbol) `emacs-mule' by default.
5054
5055 o coding-category-sjis
5056
5057 The category for a coding system which has the same code range
5058 as SJIS. Assigned the coding-system (Lisp
5059 symbol) `japanese-shift-jis' by default.
5060
5061 o coding-category-iso-7
5062
5063 The category for a coding system which has the same code range
5064 as ISO2022 of 7-bit environment. This doesn't use any locking
5065 shift and single shift functions. This can encode/decode all
5066 charsets. Assigned the coding-system (Lisp symbol)
5067 `iso-2022-7bit' by default.
5068
5069 o coding-category-iso-7-tight
5070
5071 Same as coding-category-iso-7 except that this can
5072 encode/decode only the specified charsets.
5073
5074 o coding-category-iso-8-1
5075
5076 The category for a coding system which has the same code range
5077 as ISO2022 of 8-bit environment and graphic plane 1 used only
5078 for DIMENSION1 charset. This doesn't use any locking shift
5079 and single shift functions. Assigned the coding-system (Lisp
5080 symbol) `iso-latin-1' by default.
5081
5082 o coding-category-iso-8-2
5083
5084 The category for a coding system which has the same code range
5085 as ISO2022 of 8-bit environment and graphic plane 1 used only
5086 for DIMENSION2 charset. This doesn't use any locking shift
5087 and single shift functions. Assigned the coding-system (Lisp
5088 symbol) `japanese-iso-8bit' by default.
5089
5090 o coding-category-iso-7-else
5091
5092 The category for a coding system which has the same code range
5093 as ISO2022 of 7-bit environemnt but uses locking shift or
5094 single shift functions. Assigned the coding-system (Lisp
5095 symbol) `iso-2022-7bit-lock' by default.
5096
5097 o coding-category-iso-8-else
5098
5099 The category for a coding system which has the same code range
5100 as ISO2022 of 8-bit environemnt but uses locking shift or
5101 single shift functions. Assigned the coding-system (Lisp
5102 symbol) `iso-2022-8bit-ss2' by default.
5103
5104 o coding-category-big5
5105
5106 The category for a coding system which has the same code range
5107 as BIG5. Assigned the coding-system (Lisp symbol)
5108 `cn-big5' by default.
5109
5110 o coding-category-utf-8
5111
5112 The category for a coding system which has the same code range
5113 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5114 symbol) `utf-8' by default.
5115
5116 o coding-category-utf-16-be
5117
5118 The category for a coding system in which a text has an
5119 Unicode signature (cf. Unicode Standard) in the order of BIG
5120 endian at the head. Assigned the coding-system (Lisp symbol)
5121 `utf-16-be' by default.
5122
5123 o coding-category-utf-16-le
5124
5125 The category for a coding system in which a text has an
5126 Unicode signature (cf. Unicode Standard) in the order of
5127 LITTLE endian at the head. Assigned the coding-system (Lisp
5128 symbol) `utf-16-le' by default.
5129
5130 o coding-category-ccl
5131
5132 The category for a coding system of which encoder/decoder is
5133 written in CCL programs. The default value is nil, i.e., no
5134 coding system is assigned.
5135
5136 o coding-category-binary
5137
5138 The category for a coding system not categorized in any of the
5139 above. Assigned the coding-system (Lisp symbol)
5140 `no-conversion' by default.
5141
5142 Each of them is a Lisp symbol and the value is an actual
5143 `coding-system's (this is also a Lisp symbol) assigned by a user.
5144 What Emacs does actually is to detect a category of coding system.
5145 Then, it uses a `coding-system' assigned to it. If Emacs can't
5146 decide only one possible category, it selects a category of the
5147 highest priority. Priorities of categories are also specified by a
5148 user in a Lisp variable `coding-category-list'.
5149
5150 */
5151
5152 #define EOL_SEEN_NONE 0
5153 #define EOL_SEEN_LF 1
5154 #define EOL_SEEN_CR 2
5155 #define EOL_SEEN_CRLF 4
5156
5157 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5158 SOURCE is encoded. If CATEGORY is one of
5159 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5160 two-byte, else they are encoded by one-byte.
5161
5162 Return one of EOL_SEEN_XXX. */
5163
5164 #define MAX_EOL_CHECK_COUNT 3
5165
5166 static int
5167 detect_eol (source, src_bytes, category)
5168 unsigned char *source;
5169 EMACS_INT src_bytes;
5170 enum coding_category category;
5171 {
5172 unsigned char *src = source, *src_end = src + src_bytes;
5173 unsigned char c;
5174 int total = 0;
5175 int eol_seen = EOL_SEEN_NONE;
5176
5177 if ((1 << category) & CATEGORY_MASK_UTF_16)
5178 {
5179 int msb, lsb;
5180
5181 msb = category == (coding_category_utf_16_le
5182 | coding_category_utf_16_le_nosig);
5183 lsb = 1 - msb;
5184
5185 while (src + 1 < src_end)
5186 {
5187 c = src[lsb];
5188 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5189 {
5190 int this_eol;
5191
5192 if (c == '\n')
5193 this_eol = EOL_SEEN_LF;
5194 else if (src + 3 >= src_end
5195 || src[msb + 2] != 0
5196 || src[lsb + 2] != '\n')
5197 this_eol = EOL_SEEN_CR;
5198 else
5199 this_eol = EOL_SEEN_CRLF;
5200
5201 if (eol_seen == EOL_SEEN_NONE)
5202 /* This is the first end-of-line. */
5203 eol_seen = this_eol;
5204 else if (eol_seen != this_eol)
5205 {
5206 /* The found type is different from what found before. */
5207 eol_seen = EOL_SEEN_LF;
5208 break;
5209 }
5210 if (++total == MAX_EOL_CHECK_COUNT)
5211 break;
5212 }
5213 src += 2;
5214 }
5215 }
5216 else
5217 {
5218 while (src < src_end)
5219 {
5220 c = *src++;
5221 if (c == '\n' || c == '\r')
5222 {
5223 int this_eol;
5224
5225 if (c == '\n')
5226 this_eol = EOL_SEEN_LF;
5227 else if (src >= src_end || *src != '\n')
5228 this_eol = EOL_SEEN_CR;
5229 else
5230 this_eol = EOL_SEEN_CRLF, src++;
5231
5232 if (eol_seen == EOL_SEEN_NONE)
5233 /* This is the first end-of-line. */
5234 eol_seen = this_eol;
5235 else if (eol_seen != this_eol)
5236 {
5237 /* The found type is different from what found before. */
5238 eol_seen = EOL_SEEN_LF;
5239 break;
5240 }
5241 if (++total == MAX_EOL_CHECK_COUNT)
5242 break;
5243 }
5244 }
5245 }
5246 return eol_seen;
5247 }
5248
5249
5250 static void
5251 adjust_coding_eol_type (coding, eol_seen)
5252 struct coding_system *coding;
5253 int eol_seen;
5254 {
5255 Lisp_Object eol_type;
5256
5257 eol_type = CODING_ID_EOL_TYPE (coding->id);
5258 if (eol_seen & EOL_SEEN_LF)
5259 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5260 else if (eol_seen & EOL_SEEN_CRLF)
5261 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5262 else if (eol_seen & EOL_SEEN_CR)
5263 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5264 }
5265
5266 /* Detect how a text specified in CODING is encoded. If a coding
5267 system is detected, update fields of CODING by the detected coding
5268 system. */
5269
5270 void
5271 detect_coding (coding)
5272 struct coding_system *coding;
5273 {
5274 unsigned char *src, *src_end;
5275 Lisp_Object attrs, coding_type;
5276
5277 coding->consumed = coding->consumed_char = 0;
5278 coding->produced = coding->produced_char = 0;
5279 coding_set_source (coding);
5280
5281 src_end = coding->source + coding->src_bytes;
5282
5283 /* If we have not yet decided the text encoding type, detect it
5284 now. */
5285 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5286 {
5287 int c, i;
5288
5289 for (src = coding->source; src < src_end; src++)
5290 {
5291 c = *src;
5292 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5293 || c == ISO_CODE_SI
5294 || c == ISO_CODE_SO)))
5295 break;
5296 }
5297 coding->head_ascii = src - (coding->source + coding->consumed);
5298
5299 if (coding->head_ascii < coding->src_bytes)
5300 {
5301 struct coding_detection_info detect_info;
5302 enum coding_category category;
5303 struct coding_system *this;
5304
5305 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5306 for (i = 0; i < coding_category_raw_text; i++)
5307 {
5308 category = coding_priorities[i];
5309 this = coding_categories + category;
5310 if (this->id < 0)
5311 {
5312 /* No coding system of this category is defined. */
5313 detect_info.rejected |= (1 << category);
5314 }
5315 else if (category >= coding_category_raw_text)
5316 continue;
5317 else if (detect_info.checked & (1 << category))
5318 {
5319 if (detect_info.found & (1 << category))
5320 break;
5321 }
5322 else if ((*(this->detector)) (coding, &detect_info)
5323 && detect_info.found & (1 << category))
5324 break;
5325 }
5326 if (i < coding_category_raw_text)
5327 setup_coding_system (CODING_ID_NAME (this->id), coding);
5328 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5329 setup_coding_system (Qraw_text, coding);
5330 else if (detect_info.rejected)
5331 for (i = 0; i < coding_category_raw_text; i++)
5332 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5333 {
5334 this = coding_categories + coding_priorities[i];
5335 setup_coding_system (CODING_ID_NAME (this->id), coding);
5336 break;
5337 }
5338 }
5339 }
5340 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5341 {
5342 Lisp_Object coding_systems;
5343 struct coding_detection_info detect_info;
5344
5345 coding_systems
5346 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5347 detect_info.found = detect_info.rejected = 0;
5348 if (CONSP (coding_systems)
5349 && detect_coding_utf_16 (coding, &detect_info)
5350 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5351 | CATEGORY_MASK_UTF_16_BE)))
5352 {
5353 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5354 setup_coding_system (XCAR (coding_systems), coding);
5355 else
5356 setup_coding_system (XCDR (coding_systems), coding);
5357 }
5358 }
5359
5360 attrs = CODING_ID_ATTRS (coding->id);
5361 coding_type = CODING_ATTR_TYPE (attrs);
5362
5363 /* If we have not yet decided the EOL type, detect it now. But, the
5364 detection is impossible for a CCL based coding system, in which
5365 case, we detct the EOL type after decoding. */
5366 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5367 && ! EQ (coding_type, Qccl))
5368 {
5369 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5370 XINT (CODING_ATTR_CATEGORY (attrs)));
5371
5372 if (eol_seen != EOL_SEEN_NONE)
5373 adjust_coding_eol_type (coding, eol_seen);
5374 }
5375 }
5376
5377
5378 static void
5379 decode_eol (coding)
5380 struct coding_system *coding;
5381 {
5382 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5383 {
5384 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5385 unsigned char *pend = p + coding->produced;
5386 int eol_seen = EOL_SEEN_NONE;
5387
5388 for (; p < pend; p++)
5389 {
5390 if (*p == '\n')
5391 eol_seen |= EOL_SEEN_LF;
5392 else if (*p == '\r')
5393 {
5394 if (p + 1 < pend && *(p + 1) == '\n')
5395 {
5396 eol_seen |= EOL_SEEN_CRLF;
5397 p++;
5398 }
5399 else
5400 eol_seen |= EOL_SEEN_CR;
5401 }
5402 }
5403 if (eol_seen != EOL_SEEN_NONE)
5404 adjust_coding_eol_type (coding, eol_seen);
5405 }
5406
5407 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5408 {
5409 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5410 unsigned char *pend = p + coding->produced;
5411
5412 for (; p < pend; p++)
5413 if (*p == '\r')
5414 *p = '\n';
5415 }
5416 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5417 {
5418 unsigned char *p, *pbeg, *pend;
5419 Lisp_Object undo_list;
5420
5421 move_gap_both (coding->dst_pos + coding->produced_char,
5422 coding->dst_pos_byte + coding->produced);
5423 undo_list = current_buffer->undo_list;
5424 current_buffer->undo_list = Qt;
5425 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5426 current_buffer->undo_list = undo_list;
5427 pbeg = GPT_ADDR;
5428 pend = pbeg + coding->produced;
5429
5430 for (p = pend - 1; p >= pbeg; p--)
5431 if (*p == '\r')
5432 {
5433 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5434 pend--;
5435 }
5436 coding->produced_char -= coding->produced - (pend - pbeg);
5437 coding->produced = pend - pbeg;
5438 insert_from_gap (coding->produced_char, coding->produced);
5439 }
5440 }
5441
5442 static void
5443 translate_chars (coding, table)
5444 struct coding_system *coding;
5445 Lisp_Object table;
5446 {
5447 int *charbuf = coding->charbuf;
5448 int *charbuf_end = charbuf + coding->charbuf_used;
5449 int c;
5450
5451 if (coding->chars_at_source)
5452 return;
5453
5454 while (charbuf < charbuf_end)
5455 {
5456 c = *charbuf;
5457 if (c < 0)
5458 charbuf += c;
5459 else
5460 *charbuf++ = translate_char (table, c);
5461 }
5462 }
5463
5464 static int
5465 produce_chars (coding)
5466 struct coding_system *coding;
5467 {
5468 unsigned char *dst = coding->destination + coding->produced;
5469 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5470 int produced;
5471 int produced_chars = 0;
5472
5473 if (! coding->chars_at_source)
5474 {
5475 /* Characters are in coding->charbuf. */
5476 int *buf = coding->charbuf;
5477 int *buf_end = buf + coding->charbuf_used;
5478 unsigned char *adjusted_dst_end;
5479
5480 if (BUFFERP (coding->src_object)
5481 && EQ (coding->src_object, coding->dst_object))
5482 dst_end = coding->source + coding->consumed;
5483 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5484
5485 while (buf < buf_end)
5486 {
5487 int c = *buf++;
5488
5489 if (dst >= adjusted_dst_end)
5490 {
5491 dst = alloc_destination (coding,
5492 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5493 dst);
5494 dst_end = coding->destination + coding->dst_bytes;
5495 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5496 }
5497 if (c >= 0)
5498 {
5499 if (coding->dst_multibyte
5500 || ! CHAR_BYTE8_P (c))
5501 CHAR_STRING_ADVANCE (c, dst);
5502 else
5503 *dst++ = CHAR_TO_BYTE8 (c);
5504 produced_chars++;
5505 }
5506 else
5507 /* This is an annotation datum. (-C) is the length of
5508 it. */
5509 buf += -c - 1;
5510 }
5511 }
5512 else
5513 {
5514 unsigned char *src = coding->source;
5515 unsigned char *src_end = src + coding->src_bytes;
5516 Lisp_Object eol_type;
5517
5518 eol_type = CODING_ID_EOL_TYPE (coding->id);
5519
5520 if (coding->src_multibyte != coding->dst_multibyte)
5521 {
5522 if (coding->src_multibyte)
5523 {
5524 int multibytep = 1;
5525 int consumed_chars;
5526
5527 while (1)
5528 {
5529 unsigned char *src_base = src;
5530 int c;
5531
5532 ONE_MORE_BYTE (c);
5533 if (c == '\r')
5534 {
5535 if (EQ (eol_type, Qdos))
5536 {
5537 if (src == src_end)
5538 {
5539 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5540 goto no_more_source;
5541 }
5542 if (*src == '\n')
5543 c = *src++;
5544 }
5545 else if (EQ (eol_type, Qmac))
5546 c = '\n';
5547 }
5548 if (dst == dst_end)
5549 {
5550 coding->consumed = src - coding->source;
5551
5552 if (EQ (coding->src_object, coding->dst_object))
5553 dst_end = src;
5554 if (dst == dst_end)
5555 {
5556 dst = alloc_destination (coding, src_end - src + 1,
5557 dst);
5558 dst_end = coding->destination + coding->dst_bytes;
5559 coding_set_source (coding);
5560 src = coding->source + coding->consumed;
5561 src_end = coding->source + coding->src_bytes;
5562 }
5563 }
5564 *dst++ = c;
5565 produced_chars++;
5566 }
5567 no_more_source:
5568 ;
5569 }
5570 else
5571 while (src < src_end)
5572 {
5573 int multibytep = 1;
5574 int c = *src++;
5575
5576 if (c == '\r')
5577 {
5578 if (EQ (eol_type, Qdos))
5579 {
5580 if (src < src_end
5581 && *src == '\n')
5582 c = *src++;
5583 }
5584 else if (EQ (eol_type, Qmac))
5585 c = '\n';
5586 }
5587 if (dst >= dst_end - 1)
5588 {
5589 coding->consumed = src - coding->source;
5590
5591 if (EQ (coding->src_object, coding->dst_object))
5592 dst_end = src;
5593 if (dst >= dst_end - 1)
5594 {
5595 dst = alloc_destination (coding, src_end - src + 2,
5596 dst);
5597 dst_end = coding->destination + coding->dst_bytes;
5598 coding_set_source (coding);
5599 src = coding->source + coding->consumed;
5600 src_end = coding->source + coding->src_bytes;
5601 }
5602 }
5603 EMIT_ONE_BYTE (c);
5604 }
5605 }
5606 else
5607 {
5608 if (!EQ (coding->src_object, coding->dst_object))
5609 {
5610 int require = coding->src_bytes - coding->dst_bytes;
5611
5612 if (require > 0)
5613 {
5614 EMACS_INT offset = src - coding->source;
5615
5616 dst = alloc_destination (coding, require, dst);
5617 coding_set_source (coding);
5618 src = coding->source + offset;
5619 src_end = coding->source + coding->src_bytes;
5620 }
5621 }
5622 produced_chars = coding->src_chars;
5623 while (src < src_end)
5624 {
5625 int c = *src++;
5626
5627 if (c == '\r')
5628 {
5629 if (EQ (eol_type, Qdos))
5630 {
5631 if (src < src_end
5632 && *src == '\n')
5633 c = *src++;
5634 produced_chars--;
5635 }
5636 else if (EQ (eol_type, Qmac))
5637 c = '\n';
5638 }
5639 *dst++ = c;
5640 }
5641 }
5642 coding->consumed = coding->src_bytes;
5643 coding->consumed_char = coding->src_chars;
5644 }
5645
5646 produced = dst - (coding->destination + coding->produced);
5647 if (BUFFERP (coding->dst_object))
5648 insert_from_gap (produced_chars, produced);
5649 coding->produced += produced;
5650 coding->produced_char += produced_chars;
5651 return produced_chars;
5652 }
5653
5654 /* Compose text in CODING->object according to the annotation data at
5655 CHARBUF. CHARBUF is an array:
5656 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5657 */
5658
5659 static INLINE void
5660 produce_composition (coding, charbuf)
5661 struct coding_system *coding;
5662 int *charbuf;
5663 {
5664 int len;
5665 EMACS_INT from, to;
5666 enum composition_method method;
5667 Lisp_Object components;
5668
5669 len = -charbuf[0];
5670 from = coding->dst_pos + charbuf[2];
5671 to = coding->dst_pos + charbuf[3];
5672 method = (enum composition_method) (charbuf[4]);
5673
5674 if (method == COMPOSITION_RELATIVE)
5675 components = Qnil;
5676 else
5677 {
5678 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5679 int i;
5680
5681 len -= 5;
5682 charbuf += 5;
5683 for (i = 0; i < len; i++)
5684 args[i] = make_number (charbuf[i]);
5685 components = (method == COMPOSITION_WITH_ALTCHARS
5686 ? Fstring (len, args) : Fvector (len, args));
5687 }
5688 compose_text (from, to, components, Qnil, coding->dst_object);
5689 }
5690
5691
5692 /* Put `charset' property on text in CODING->object according to
5693 the annotation data at CHARBUF. CHARBUF is an array:
5694 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5695 */
5696
5697 static INLINE void
5698 produce_charset (coding, charbuf)
5699 struct coding_system *coding;
5700 int *charbuf;
5701 {
5702 EMACS_INT from = coding->dst_pos + charbuf[2];
5703 EMACS_INT to = coding->dst_pos + charbuf[3];
5704 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5705
5706 Fput_text_property (make_number (from), make_number (to),
5707 Qcharset, CHARSET_NAME (charset),
5708 coding->dst_object);
5709 }
5710
5711
5712 #define CHARBUF_SIZE 0x4000
5713
5714 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5715 do { \
5716 int size = CHARBUF_SIZE;; \
5717 \
5718 coding->charbuf = NULL; \
5719 while (size > 1024) \
5720 { \
5721 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5722 if (coding->charbuf) \
5723 break; \
5724 size >>= 1; \
5725 } \
5726 if (! coding->charbuf) \
5727 { \
5728 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5729 return coding->result; \
5730 } \
5731 coding->charbuf_size = size; \
5732 } while (0)
5733
5734
5735 static void
5736 produce_annotation (coding)
5737 struct coding_system *coding;
5738 {
5739 int *charbuf = coding->charbuf;
5740 int *charbuf_end = charbuf + coding->charbuf_used;
5741
5742 if (NILP (coding->dst_object))
5743 return;
5744
5745 while (charbuf < charbuf_end)
5746 {
5747 if (*charbuf >= 0)
5748 charbuf++;
5749 else
5750 {
5751 int len = -*charbuf;
5752 switch (charbuf[1])
5753 {
5754 case CODING_ANNOTATE_COMPOSITION_MASK:
5755 produce_composition (coding, charbuf);
5756 break;
5757 case CODING_ANNOTATE_CHARSET_MASK:
5758 produce_charset (coding, charbuf);
5759 break;
5760 default:
5761 abort ();
5762 }
5763 charbuf += len;
5764 }
5765 }
5766 }
5767
5768 /* Decode the data at CODING->src_object into CODING->dst_object.
5769 CODING->src_object is a buffer, a string, or nil.
5770 CODING->dst_object is a buffer.
5771
5772 If CODING->src_object is a buffer, it must be the current buffer.
5773 In this case, if CODING->src_pos is positive, it is a position of
5774 the source text in the buffer, otherwise, the source text is in the
5775 gap area of the buffer, and CODING->src_pos specifies the offset of
5776 the text from GPT (which must be the same as PT). If this is the
5777 same buffer as CODING->dst_object, CODING->src_pos must be
5778 negative.
5779
5780 If CODING->src_object is a string, CODING->src_pos in an index to
5781 that string.
5782
5783 If CODING->src_object is nil, CODING->source must already point to
5784 the non-relocatable memory area. In this case, CODING->src_pos is
5785 an offset from CODING->source.
5786
5787 The decoded data is inserted at the current point of the buffer
5788 CODING->dst_object.
5789 */
5790
5791 static int
5792 decode_coding (coding)
5793 struct coding_system *coding;
5794 {
5795 Lisp_Object attrs;
5796
5797 if (BUFFERP (coding->src_object)
5798 && coding->src_pos > 0
5799 && coding->src_pos < GPT
5800 && coding->src_pos + coding->src_chars > GPT)
5801 move_gap_both (coding->src_pos, coding->src_pos_byte);
5802
5803 if (BUFFERP (coding->dst_object))
5804 {
5805 if (current_buffer != XBUFFER (coding->dst_object))
5806 set_buffer_internal (XBUFFER (coding->dst_object));
5807 if (GPT != PT)
5808 move_gap_both (PT, PT_BYTE);
5809 }
5810
5811 coding->consumed = coding->consumed_char = 0;
5812 coding->produced = coding->produced_char = 0;
5813 coding->chars_at_source = 0;
5814 coding->result = CODING_RESULT_SUCCESS;
5815 coding->errors = 0;
5816
5817 ALLOC_CONVERSION_WORK_AREA (coding);
5818
5819 attrs = CODING_ID_ATTRS (coding->id);
5820
5821 do
5822 {
5823 coding_set_source (coding);
5824 coding->annotated = 0;
5825 (*(coding->decoder)) (coding);
5826 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5827 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5828 else if (!NILP (Vstandard_translation_table_for_decode))
5829 translate_chars (coding, Vstandard_translation_table_for_decode);
5830 coding_set_destination (coding);
5831 produce_chars (coding);
5832 if (coding->annotated)
5833 produce_annotation (coding);
5834 }
5835 while (coding->consumed < coding->src_bytes
5836 && ! coding->result);
5837
5838 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5839 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5840 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5841 decode_eol (coding);
5842
5843 coding->carryover_bytes = 0;
5844 if (coding->consumed < coding->src_bytes)
5845 {
5846 int nbytes = coding->src_bytes - coding->consumed;
5847 unsigned char *src;
5848
5849 coding_set_source (coding);
5850 coding_set_destination (coding);
5851 src = coding->source + coding->consumed;
5852
5853 if (coding->mode & CODING_MODE_LAST_BLOCK)
5854 {
5855 /* Flush out unprocessed data as binary chars. We are sure
5856 that the number of data is less than the size of
5857 coding->charbuf. */
5858 while (nbytes-- > 0)
5859 {
5860 int c = *src++;
5861
5862 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5863 }
5864 produce_chars (coding);
5865 }
5866 else
5867 {
5868 /* Record unprocessed bytes in coding->carryover. We are
5869 sure that the number of data is less than the size of
5870 coding->carryover. */
5871 unsigned char *p = coding->carryover;
5872
5873 coding->carryover_bytes = nbytes;
5874 while (nbytes-- > 0)
5875 *p++ = *src++;
5876 }
5877 coding->consumed = coding->src_bytes;
5878 }
5879
5880 return coding->result;
5881 }
5882
5883
5884 /* Extract an annotation datum from a composition starting at POS and
5885 ending before LIMIT of CODING->src_object (buffer or string), store
5886 the data in BUF, set *STOP to a starting position of the next
5887 composition (if any) or to LIMIT, and return the address of the
5888 next element of BUF.
5889
5890 If such an annotation is not found, set *STOP to a starting
5891 position of a composition after POS (if any) or to LIMIT, and
5892 return BUF. */
5893
5894 static INLINE int *
5895 handle_composition_annotation (pos, limit, coding, buf, stop)
5896 EMACS_INT pos, limit;
5897 struct coding_system *coding;
5898 int *buf;
5899 EMACS_INT *stop;
5900 {
5901 EMACS_INT start, end;
5902 Lisp_Object prop;
5903
5904 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5905 || end > limit)
5906 *stop = limit;
5907 else if (start > pos)
5908 *stop = start;
5909 else
5910 {
5911 if (start == pos)
5912 {
5913 /* We found a composition. Store the corresponding
5914 annotation data in BUF. */
5915 int *head = buf;
5916 enum composition_method method = COMPOSITION_METHOD (prop);
5917 int nchars = COMPOSITION_LENGTH (prop);
5918
5919 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5920 if (method != COMPOSITION_RELATIVE)
5921 {
5922 Lisp_Object components;
5923 int len, i, i_byte;
5924
5925 components = COMPOSITION_COMPONENTS (prop);
5926 if (VECTORP (components))
5927 {
5928 len = XVECTOR (components)->size;
5929 for (i = 0; i < len; i++)
5930 *buf++ = XINT (AREF (components, i));
5931 }
5932 else if (STRINGP (components))
5933 {
5934 len = XSTRING (components)->size;
5935 i = i_byte = 0;
5936 while (i < len)
5937 {
5938 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5939 buf++;
5940 }
5941 }
5942 else if (INTEGERP (components))
5943 {
5944 len = 1;
5945 *buf++ = XINT (components);
5946 }
5947 else if (CONSP (components))
5948 {
5949 for (len = 0; CONSP (components);
5950 len++, components = XCDR (components))
5951 *buf++ = XINT (XCAR (components));
5952 }
5953 else
5954 abort ();
5955 *head -= len;
5956 }
5957 }
5958
5959 if (find_composition (end, limit, &start, &end, &prop,
5960 coding->src_object)
5961 && end <= limit)
5962 *stop = start;
5963 else
5964 *stop = limit;
5965 }
5966 return buf;
5967 }
5968
5969
5970 /* Extract an annotation datum from a text property `charset' at POS of
5971 CODING->src_object (buffer of string), store the data in BUF, set
5972 *STOP to the position where the value of `charset' property changes
5973 (limiting by LIMIT), and return the address of the next element of
5974 BUF.
5975
5976 If the property value is nil, set *STOP to the position where the
5977 property value is non-nil (limiting by LIMIT), and return BUF. */
5978
5979 static INLINE int *
5980 handle_charset_annotation (pos, limit, coding, buf, stop)
5981 EMACS_INT pos, limit;
5982 struct coding_system *coding;
5983 int *buf;
5984 EMACS_INT *stop;
5985 {
5986 Lisp_Object val, next;
5987 int id;
5988
5989 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5990 if (! NILP (val) && CHARSETP (val))
5991 id = XINT (CHARSET_SYMBOL_ID (val));
5992 else
5993 id = -1;
5994 ADD_CHARSET_DATA (buf, 0, 0, id);
5995 next = Fnext_single_property_change (make_number (pos), Qcharset,
5996 coding->src_object,
5997 make_number (limit));
5998 *stop = XINT (next);
5999 return buf;
6000 }
6001
6002
6003 static void
6004 consume_chars (coding)
6005 struct coding_system *coding;
6006 {
6007 int *buf = coding->charbuf;
6008 int *buf_end = coding->charbuf + coding->charbuf_size;
6009 const unsigned char *src = coding->source + coding->consumed;
6010 const unsigned char *src_end = coding->source + coding->src_bytes;
6011 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6012 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6013 int multibytep = coding->src_multibyte;
6014 Lisp_Object eol_type;
6015 int c;
6016 EMACS_INT stop, stop_composition, stop_charset;
6017
6018 eol_type = CODING_ID_EOL_TYPE (coding->id);
6019 if (VECTORP (eol_type))
6020 eol_type = Qunix;
6021
6022 /* Note: composition handling is not yet implemented. */
6023 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6024
6025 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6026 stop = stop_composition = pos;
6027 else
6028 stop = stop_composition = end_pos;
6029 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6030 stop = stop_charset = pos;
6031 else
6032 stop_charset = end_pos;
6033
6034 /* Compensate for CRLF and annotation. */
6035 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6036 while (buf < buf_end)
6037 {
6038 if (pos == stop)
6039 {
6040 if (pos == end_pos)
6041 break;
6042 if (pos == stop_composition)
6043 buf = handle_composition_annotation (pos, end_pos, coding,
6044 buf, &stop_composition);
6045 if (pos == stop_charset)
6046 buf = handle_charset_annotation (pos, end_pos, coding,
6047 buf, &stop_charset);
6048 stop = (stop_composition < stop_charset
6049 ? stop_composition : stop_charset);
6050 }
6051
6052 if (! multibytep)
6053 {
6054 EMACS_INT bytes;
6055
6056 if (! CODING_FOR_UNIBYTE (coding)
6057 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6058 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6059 else
6060 c = *src++, pos++;
6061 }
6062 else
6063 c = STRING_CHAR_ADVANCE (src), pos++;
6064 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6065 c = '\n';
6066 if (! EQ (eol_type, Qunix))
6067 {
6068 if (c == '\n')
6069 {
6070 if (EQ (eol_type, Qdos))
6071 *buf++ = '\r';
6072 else
6073 c = '\r';
6074 }
6075 }
6076 *buf++ = c;
6077 }
6078
6079 coding->consumed = src - coding->source;
6080 coding->consumed_char = pos - coding->src_pos;
6081 coding->charbuf_used = buf - coding->charbuf;
6082 coding->chars_at_source = 0;
6083 }
6084
6085
6086 /* Encode the text at CODING->src_object into CODING->dst_object.
6087 CODING->src_object is a buffer or a string.
6088 CODING->dst_object is a buffer or nil.
6089
6090 If CODING->src_object is a buffer, it must be the current buffer.
6091 In this case, if CODING->src_pos is positive, it is a position of
6092 the source text in the buffer, otherwise. the source text is in the
6093 gap area of the buffer, and coding->src_pos specifies the offset of
6094 the text from GPT (which must be the same as PT). If this is the
6095 same buffer as CODING->dst_object, CODING->src_pos must be
6096 negative and CODING should not have `pre-write-conversion'.
6097
6098 If CODING->src_object is a string, CODING should not have
6099 `pre-write-conversion'.
6100
6101 If CODING->dst_object is a buffer, the encoded data is inserted at
6102 the current point of that buffer.
6103
6104 If CODING->dst_object is nil, the encoded data is placed at the
6105 memory area specified by CODING->destination. */
6106
6107 static int
6108 encode_coding (coding)
6109 struct coding_system *coding;
6110 {
6111 Lisp_Object attrs;
6112
6113 attrs = CODING_ID_ATTRS (coding->id);
6114
6115 if (BUFFERP (coding->dst_object))
6116 {
6117 set_buffer_internal (XBUFFER (coding->dst_object));
6118 coding->dst_multibyte
6119 = ! NILP (current_buffer->enable_multibyte_characters);
6120 }
6121
6122 coding->consumed = coding->consumed_char = 0;
6123 coding->produced = coding->produced_char = 0;
6124 coding->result = CODING_RESULT_SUCCESS;
6125 coding->errors = 0;
6126
6127 ALLOC_CONVERSION_WORK_AREA (coding);
6128
6129 do {
6130 coding_set_source (coding);
6131 consume_chars (coding);
6132
6133 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6134 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6135 else if (!NILP (Vstandard_translation_table_for_encode))
6136 translate_chars (coding, Vstandard_translation_table_for_encode);
6137
6138 coding_set_destination (coding);
6139 (*(coding->encoder)) (coding);
6140 } while (coding->consumed_char < coding->src_chars);
6141
6142 if (BUFFERP (coding->dst_object))
6143 insert_from_gap (coding->produced_char, coding->produced);
6144
6145 return (coding->result);
6146 }
6147
6148
6149 /* Stack of working buffers used in code conversion. An nil element
6150 means that the code conversion of that level is not using a working
6151 buffer. */
6152 Lisp_Object Vcode_conversion_work_buf_list;
6153
6154 /* A working buffer used by the top level conversion. */
6155 Lisp_Object Vcode_conversion_reused_work_buf;
6156
6157
6158 /* Return a working buffer that can be freely used by the following
6159 code conversion. MULTIBYTEP specifies the multibyteness of the
6160 buffer. */
6161
6162 Lisp_Object
6163 make_conversion_work_buffer (multibytep, depth)
6164 int multibytep, depth;
6165 {
6166 struct buffer *current = current_buffer;
6167 Lisp_Object buf, name;
6168
6169 if (depth == 0)
6170 {
6171 if (NILP (Vcode_conversion_reused_work_buf))
6172 Vcode_conversion_reused_work_buf
6173 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6174 buf = Vcode_conversion_reused_work_buf;
6175 }
6176 else
6177 {
6178 if (depth < 0)
6179 {
6180 name = build_string (" *code-converting-work*");
6181 name = Fgenerate_new_buffer_name (name, Qnil);
6182 }
6183 else
6184 {
6185 char str[128];
6186
6187 sprintf (str, " *code-converting-work*<%d>", depth);
6188 name = build_string (str);
6189 }
6190 buf = Fget_buffer_create (name);
6191 }
6192 set_buffer_internal (XBUFFER (buf));
6193 current_buffer->undo_list = Qt;
6194 Ferase_buffer ();
6195 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6196 set_buffer_internal (current);
6197 return buf;
6198 }
6199
6200 static Lisp_Object
6201 code_conversion_restore (buffer)
6202 Lisp_Object buffer;
6203 {
6204 Lisp_Object workbuf;
6205
6206 workbuf = XCAR (Vcode_conversion_work_buf_list);
6207 if (! NILP (workbuf)
6208 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6209 && ! NILP (Fbuffer_live_p (workbuf)))
6210 Fkill_buffer (workbuf);
6211 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6212 set_buffer_internal (XBUFFER (buffer));
6213 return Qnil;
6214 }
6215
6216 static Lisp_Object
6217 code_conversion_save (buffer, with_work_buf, multibyte)
6218 Lisp_Object buffer;
6219 int with_work_buf, multibyte;
6220 {
6221 Lisp_Object workbuf;
6222
6223 if (with_work_buf)
6224 {
6225 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6226
6227 workbuf = make_conversion_work_buffer (multibyte, depth);
6228 }
6229 else
6230 workbuf = Qnil;
6231 Vcode_conversion_work_buf_list
6232 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6233 record_unwind_protect (code_conversion_restore, buffer);
6234 return workbuf;
6235 }
6236
6237 int
6238 decode_coding_gap (coding, chars, bytes)
6239 struct coding_system *coding;
6240 EMACS_INT chars, bytes;
6241 {
6242 int count = specpdl_ptr - specpdl;
6243 Lisp_Object attrs;
6244 Lisp_Object buffer;
6245
6246 buffer = Fcurrent_buffer ();
6247 code_conversion_save (buffer, 0, 0);
6248
6249 coding->src_object = buffer;
6250 coding->src_chars = chars;
6251 coding->src_bytes = bytes;
6252 coding->src_pos = -chars;
6253 coding->src_pos_byte = -bytes;
6254 coding->src_multibyte = chars < bytes;
6255 coding->dst_object = buffer;
6256 coding->dst_pos = PT;
6257 coding->dst_pos_byte = PT_BYTE;
6258 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6259 coding->mode |= CODING_MODE_LAST_BLOCK;
6260
6261 if (CODING_REQUIRE_DETECTION (coding))
6262 detect_coding (coding);
6263
6264 decode_coding (coding);
6265
6266 attrs = CODING_ID_ATTRS (coding->id);
6267 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6268 {
6269 struct gcpro gcpro1;
6270 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6271 Lisp_Object val;
6272
6273 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6274 GCPRO1 (buffer);
6275 val = call1 (CODING_ATTR_POST_READ (attrs),
6276 make_number (coding->produced_char));
6277 UNGCPRO;
6278 CHECK_NATNUM (val);
6279 coding->produced_char += Z - prev_Z;
6280 coding->produced += Z_BYTE - prev_Z_BYTE;
6281 }
6282
6283 unbind_to (count, Qnil);
6284 return coding->result;
6285 }
6286
6287 int
6288 encode_coding_gap (coding, chars, bytes)
6289 struct coding_system *coding;
6290 EMACS_INT chars, bytes;
6291 {
6292 int count = specpdl_ptr - specpdl;
6293 Lisp_Object buffer;
6294
6295 buffer = Fcurrent_buffer ();
6296 code_conversion_save (buffer, 0, 0);
6297
6298 coding->src_object = buffer;
6299 coding->src_chars = chars;
6300 coding->src_bytes = bytes;
6301 coding->src_pos = -chars;
6302 coding->src_pos_byte = -bytes;
6303 coding->src_multibyte = chars < bytes;
6304 coding->dst_object = coding->src_object;
6305 coding->dst_pos = PT;
6306 coding->dst_pos_byte = PT_BYTE;
6307
6308 encode_coding (coding);
6309
6310 unbind_to (count, Qnil);
6311 return coding->result;
6312 }
6313
6314
6315 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6316 SRC_OBJECT into DST_OBJECT by coding context CODING.
6317
6318 SRC_OBJECT is a buffer, a string, or Qnil.
6319
6320 If it is a buffer, the text is at point of the buffer. FROM and TO
6321 are positions in the buffer.
6322
6323 If it is a string, the text is at the beginning of the string.
6324 FROM and TO are indices to the string.
6325
6326 If it is nil, the text is at coding->source. FROM and TO are
6327 indices to coding->source.
6328
6329 DST_OBJECT is a buffer, Qt, or Qnil.
6330
6331 If it is a buffer, the decoded text is inserted at point of the
6332 buffer. If the buffer is the same as SRC_OBJECT, the source text
6333 is deleted.
6334
6335 If it is Qt, a string is made from the decoded text, and
6336 set in CODING->dst_object.
6337
6338 If it is Qnil, the decoded text is stored at CODING->destination.
6339 The caller must allocate CODING->dst_bytes bytes at
6340 CODING->destination by xmalloc. If the decoded text is longer than
6341 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6342 */
6343
6344 void
6345 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6346 dst_object)
6347 struct coding_system *coding;
6348 Lisp_Object src_object;
6349 EMACS_INT from, from_byte, to, to_byte;
6350 Lisp_Object dst_object;
6351 {
6352 int count = specpdl_ptr - specpdl;
6353 unsigned char *destination;
6354 EMACS_INT dst_bytes;
6355 EMACS_INT chars = to - from;
6356 EMACS_INT bytes = to_byte - from_byte;
6357 Lisp_Object attrs;
6358 Lisp_Object buffer;
6359 int saved_pt = -1, saved_pt_byte;
6360
6361 buffer = Fcurrent_buffer ();
6362
6363 if (NILP (dst_object))
6364 {
6365 destination = coding->destination;
6366 dst_bytes = coding->dst_bytes;
6367 }
6368
6369 coding->src_object = src_object;
6370 coding->src_chars = chars;
6371 coding->src_bytes = bytes;
6372 coding->src_multibyte = chars < bytes;
6373
6374 if (STRINGP (src_object))
6375 {
6376 coding->src_pos = from;
6377 coding->src_pos_byte = from_byte;
6378 }
6379 else if (BUFFERP (src_object))
6380 {
6381 set_buffer_internal (XBUFFER (src_object));
6382 if (from != GPT)
6383 move_gap_both (from, from_byte);
6384 if (EQ (src_object, dst_object))
6385 {
6386 saved_pt = PT, saved_pt_byte = PT_BYTE;
6387 TEMP_SET_PT_BOTH (from, from_byte);
6388 del_range_both (from, from_byte, to, to_byte, 1);
6389 coding->src_pos = -chars;
6390 coding->src_pos_byte = -bytes;
6391 }
6392 else
6393 {
6394 coding->src_pos = from;
6395 coding->src_pos_byte = from_byte;
6396 }
6397 }
6398
6399 if (CODING_REQUIRE_DETECTION (coding))
6400 detect_coding (coding);
6401 attrs = CODING_ID_ATTRS (coding->id);
6402
6403 if (EQ (dst_object, Qt)
6404 || (! NILP (CODING_ATTR_POST_READ (attrs))
6405 && NILP (dst_object)))
6406 {
6407 coding->dst_object = code_conversion_save (buffer, 1, 1);
6408 coding->dst_pos = BEG;
6409 coding->dst_pos_byte = BEG_BYTE;
6410 coding->dst_multibyte = 1;
6411 }
6412 else if (BUFFERP (dst_object))
6413 {
6414 code_conversion_save (buffer, 0, 0);
6415 coding->dst_object = dst_object;
6416 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6417 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6418 coding->dst_multibyte
6419 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6420 }
6421 else
6422 {
6423 code_conversion_save (buffer, 0, 0);
6424 coding->dst_object = Qnil;
6425 coding->dst_multibyte = 1;
6426 }
6427
6428 decode_coding (coding);
6429
6430 if (BUFFERP (coding->dst_object))
6431 set_buffer_internal (XBUFFER (coding->dst_object));
6432
6433 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6434 {
6435 struct gcpro gcpro1, gcpro2;
6436 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6437 Lisp_Object val;
6438
6439 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6440 GCPRO2 (coding->src_object, coding->dst_object);
6441 val = call1 (CODING_ATTR_POST_READ (attrs),
6442 make_number (coding->produced_char));
6443 UNGCPRO;
6444 CHECK_NATNUM (val);
6445 coding->produced_char += Z - prev_Z;
6446 coding->produced += Z_BYTE - prev_Z_BYTE;
6447 }
6448
6449 if (EQ (dst_object, Qt))
6450 {
6451 coding->dst_object = Fbuffer_string ();
6452 }
6453 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6454 {
6455 set_buffer_internal (XBUFFER (coding->dst_object));
6456 if (dst_bytes < coding->produced)
6457 {
6458 destination
6459 = (unsigned char *) xrealloc (destination, coding->produced);
6460 if (! destination)
6461 {
6462 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6463 unbind_to (count, Qnil);
6464 return;
6465 }
6466 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6467 move_gap_both (BEGV, BEGV_BYTE);
6468 bcopy (BEGV_ADDR, destination, coding->produced);
6469 coding->destination = destination;
6470 }
6471 }
6472
6473 if (saved_pt >= 0)
6474 {
6475 /* This is the case of:
6476 (BUFFERP (src_object) && EQ (src_object, dst_object))
6477 As we have moved PT while replacing the original buffer
6478 contents, we must recover it now. */
6479 set_buffer_internal (XBUFFER (src_object));
6480 if (saved_pt < from)
6481 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6482 else if (saved_pt < from + chars)
6483 TEMP_SET_PT_BOTH (from, from_byte);
6484 else if (! NILP (current_buffer->enable_multibyte_characters))
6485 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6486 saved_pt_byte + (coding->produced - bytes));
6487 else
6488 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6489 saved_pt_byte + (coding->produced - bytes));
6490 }
6491
6492 unbind_to (count, Qnil);
6493 }
6494
6495
6496 void
6497 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6498 dst_object)
6499 struct coding_system *coding;
6500 Lisp_Object src_object;
6501 EMACS_INT from, from_byte, to, to_byte;
6502 Lisp_Object dst_object;
6503 {
6504 int count = specpdl_ptr - specpdl;
6505 EMACS_INT chars = to - from;
6506 EMACS_INT bytes = to_byte - from_byte;
6507 Lisp_Object attrs;
6508 Lisp_Object buffer;
6509 int saved_pt = -1, saved_pt_byte;
6510
6511 buffer = Fcurrent_buffer ();
6512
6513 coding->src_object = src_object;
6514 coding->src_chars = chars;
6515 coding->src_bytes = bytes;
6516 coding->src_multibyte = chars < bytes;
6517
6518 attrs = CODING_ID_ATTRS (coding->id);
6519
6520 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6521 {
6522 coding->src_object = code_conversion_save (buffer, 1,
6523 coding->src_multibyte);
6524 set_buffer_internal (XBUFFER (coding->src_object));
6525 if (STRINGP (src_object))
6526 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6527 else if (BUFFERP (src_object))
6528 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6529 else
6530 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6531
6532 if (EQ (src_object, dst_object))
6533 {
6534 set_buffer_internal (XBUFFER (src_object));
6535 saved_pt = PT, saved_pt_byte = PT_BYTE;
6536 del_range_both (from, from_byte, to, to_byte, 1);
6537 set_buffer_internal (XBUFFER (coding->src_object));
6538 }
6539
6540 call2 (CODING_ATTR_PRE_WRITE (attrs),
6541 make_number (BEG), make_number (Z));
6542 coding->src_object = Fcurrent_buffer ();
6543 if (BEG != GPT)
6544 move_gap_both (BEG, BEG_BYTE);
6545 coding->src_chars = Z - BEG;
6546 coding->src_bytes = Z_BYTE - BEG_BYTE;
6547 coding->src_pos = BEG;
6548 coding->src_pos_byte = BEG_BYTE;
6549 coding->src_multibyte = Z < Z_BYTE;
6550 }
6551 else if (STRINGP (src_object))
6552 {
6553 code_conversion_save (buffer, 0, 0);
6554 coding->src_pos = from;
6555 coding->src_pos_byte = from_byte;
6556 }
6557 else if (BUFFERP (src_object))
6558 {
6559 code_conversion_save (buffer, 0, 0);
6560 set_buffer_internal (XBUFFER (src_object));
6561 if (EQ (src_object, dst_object))
6562 {
6563 saved_pt = PT, saved_pt_byte = PT_BYTE;
6564 coding->src_object = del_range_1 (from, to, 1, 1);
6565 coding->src_pos = 0;
6566 coding->src_pos_byte = 0;
6567 }
6568 else
6569 {
6570 if (from < GPT && to >= GPT)
6571 move_gap_both (from, from_byte);
6572 coding->src_pos = from;
6573 coding->src_pos_byte = from_byte;
6574 }
6575 }
6576 else
6577 code_conversion_save (buffer, 0, 0);
6578
6579 if (BUFFERP (dst_object))
6580 {
6581 coding->dst_object = dst_object;
6582 if (EQ (src_object, dst_object))
6583 {
6584 coding->dst_pos = from;
6585 coding->dst_pos_byte = from_byte;
6586 }
6587 else
6588 {
6589 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6590 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6591 }
6592 coding->dst_multibyte
6593 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6594 }
6595 else if (EQ (dst_object, Qt))
6596 {
6597 coding->dst_object = Qnil;
6598 coding->dst_bytes = coding->src_chars;
6599 if (coding->dst_bytes == 0)
6600 coding->dst_bytes = 1;
6601 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6602 coding->dst_multibyte = 0;
6603 }
6604 else
6605 {
6606 coding->dst_object = Qnil;
6607 coding->dst_multibyte = 0;
6608 }
6609
6610 encode_coding (coding);
6611
6612 if (EQ (dst_object, Qt))
6613 {
6614 if (BUFFERP (coding->dst_object))
6615 coding->dst_object = Fbuffer_string ();
6616 else
6617 {
6618 coding->dst_object
6619 = make_unibyte_string ((char *) coding->destination,
6620 coding->produced);
6621 xfree (coding->destination);
6622 }
6623 }
6624
6625 if (saved_pt >= 0)
6626 {
6627 /* This is the case of:
6628 (BUFFERP (src_object) && EQ (src_object, dst_object))
6629 As we have moved PT while replacing the original buffer
6630 contents, we must recover it now. */
6631 set_buffer_internal (XBUFFER (src_object));
6632 if (saved_pt < from)
6633 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6634 else if (saved_pt < from + chars)
6635 TEMP_SET_PT_BOTH (from, from_byte);
6636 else if (! NILP (current_buffer->enable_multibyte_characters))
6637 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6638 saved_pt_byte + (coding->produced - bytes));
6639 else
6640 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6641 saved_pt_byte + (coding->produced - bytes));
6642 }
6643
6644 unbind_to (count, Qnil);
6645 }
6646
6647
6648 Lisp_Object
6649 preferred_coding_system ()
6650 {
6651 int id = coding_categories[coding_priorities[0]].id;
6652
6653 return CODING_ID_NAME (id);
6654 }
6655
6656 \f
6657 #ifdef emacs
6658 /*** 8. Emacs Lisp library functions ***/
6659
6660 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6661 doc: /* Return t if OBJECT is nil or a coding-system.
6662 See the documentation of `define-coding-system' for information
6663 about coding-system objects. */)
6664 (obj)
6665 Lisp_Object obj;
6666 {
6667 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6668 }
6669
6670 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6671 Sread_non_nil_coding_system, 1, 1, 0,
6672 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6673 (prompt)
6674 Lisp_Object prompt;
6675 {
6676 Lisp_Object val;
6677 do
6678 {
6679 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6680 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6681 }
6682 while (XSTRING (val)->size == 0);
6683 return (Fintern (val, Qnil));
6684 }
6685
6686 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6687 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6688 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6689 (prompt, default_coding_system)
6690 Lisp_Object prompt, default_coding_system;
6691 {
6692 Lisp_Object val;
6693 if (SYMBOLP (default_coding_system))
6694 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6695 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6696 Qt, Qnil, Qcoding_system_history,
6697 default_coding_system, Qnil);
6698 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6699 }
6700
6701 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6702 1, 1, 0,
6703 doc: /* Check validity of CODING-SYSTEM.
6704 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6705 (coding_system)
6706 Lisp_Object coding_system;
6707 {
6708 CHECK_SYMBOL (coding_system);
6709 if (!NILP (Fcoding_system_p (coding_system)))
6710 return coding_system;
6711 while (1)
6712 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6713 }
6714
6715 \f
6716 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6717 HIGHEST is nonzero, return the coding system of the highest
6718 priority among the detected coding systems. Otherwize return a
6719 list of detected coding systems sorted by their priorities. If
6720 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6721 multibyte form but contains only ASCII and eight-bit chars.
6722 Otherwise, the bytes are raw bytes.
6723
6724 CODING-SYSTEM controls the detection as below:
6725
6726 If it is nil, detect both text-format and eol-format. If the
6727 text-format part of CODING-SYSTEM is already specified
6728 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6729 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6730 detect only text-format. */
6731
6732 Lisp_Object
6733 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6734 unsigned char *src;
6735 int src_bytes, highest;
6736 int multibytep;
6737 Lisp_Object coding_system;
6738 {
6739 unsigned char *src_end = src + src_bytes;
6740 Lisp_Object attrs, eol_type;
6741 Lisp_Object val;
6742 struct coding_system coding;
6743 int id;
6744 struct coding_detection_info detect_info;
6745
6746 if (NILP (coding_system))
6747 coding_system = Qundecided;
6748 setup_coding_system (coding_system, &coding);
6749 attrs = CODING_ID_ATTRS (coding.id);
6750 eol_type = CODING_ID_EOL_TYPE (coding.id);
6751 coding_system = CODING_ATTR_BASE_NAME (attrs);
6752
6753 coding.source = src;
6754 coding.src_bytes = src_bytes;
6755 coding.src_multibyte = multibytep;
6756 coding.consumed = 0;
6757 coding.mode |= CODING_MODE_LAST_BLOCK;
6758
6759 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6760
6761 /* At first, detect text-format if necessary. */
6762 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6763 {
6764 enum coding_category category;
6765 struct coding_system *this;
6766 int c, i;
6767
6768 for (; src < src_end; src++)
6769 {
6770 c = *src;
6771 if (c & 0x80
6772 || (c < 0x20 && (c == ISO_CODE_ESC
6773 || c == ISO_CODE_SI
6774 || c == ISO_CODE_SO)))
6775 break;
6776 }
6777 coding.head_ascii = src - coding.source;
6778
6779 if (src < src_end)
6780 for (i = 0; i < coding_category_raw_text; i++)
6781 {
6782 category = coding_priorities[i];
6783 this = coding_categories + category;
6784
6785 if (this->id < 0)
6786 {
6787 /* No coding system of this category is defined. */
6788 detect_info.rejected |= (1 << category);
6789 }
6790 else if (category >= coding_category_raw_text)
6791 continue;
6792 else if (detect_info.checked & (1 << category))
6793 {
6794 if (highest
6795 && (detect_info.found & (1 << category)))
6796 break;
6797 }
6798 else
6799 {
6800 if ((*(this->detector)) (&coding, &detect_info)
6801 && highest
6802 && (detect_info.found & (1 << category)))
6803 break;
6804 }
6805 }
6806
6807
6808 if (detect_info.rejected == CATEGORY_MASK_ANY)
6809 {
6810 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6811 id = coding_categories[coding_category_raw_text].id;
6812 val = Fcons (make_number (id), Qnil);
6813 }
6814 else if (! detect_info.rejected && ! detect_info.found)
6815 {
6816 detect_info.found = CATEGORY_MASK_ANY;
6817 id = coding_categories[coding_category_undecided].id;
6818 val = Fcons (make_number (id), Qnil);
6819 }
6820 else if (highest)
6821 {
6822 if (detect_info.found)
6823 {
6824 detect_info.found = 1 << category;
6825 val = Fcons (make_number (this->id), Qnil);
6826 }
6827 else
6828 for (i = 0; i < coding_category_raw_text; i++)
6829 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6830 {
6831 detect_info.found = 1 << coding_priorities[i];
6832 id = coding_categories[coding_priorities[i]].id;
6833 val = Fcons (make_number (id), Qnil);
6834 break;
6835 }
6836 }
6837 else
6838 {
6839 int mask = detect_info.rejected | detect_info.found;
6840 int found = 0;
6841 val = Qnil;
6842
6843 for (i = coding_category_raw_text - 1; i >= 0; i--)
6844 {
6845 category = coding_priorities[i];
6846 if (! (mask & (1 << category)))
6847 {
6848 found |= 1 << category;
6849 id = coding_categories[category].id;
6850 val = Fcons (make_number (id), val);
6851 }
6852 }
6853 for (i = coding_category_raw_text - 1; i >= 0; i--)
6854 {
6855 category = coding_priorities[i];
6856 if (detect_info.found & (1 << category))
6857 {
6858 id = coding_categories[category].id;
6859 val = Fcons (make_number (id), val);
6860 }
6861 }
6862 detect_info.found |= found;
6863 }
6864 }
6865 else
6866 {
6867 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6868 val = Fcons (make_number (coding.id), Qnil);
6869 }
6870
6871 /* Then, detect eol-format if necessary. */
6872 {
6873 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6874 Lisp_Object tail;
6875
6876 if (VECTORP (eol_type))
6877 {
6878 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6879 normal_eol = detect_eol (coding.source, src_bytes,
6880 coding_category_raw_text);
6881 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6882 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6883 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6884 coding_category_utf_16_be);
6885 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6886 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6887 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6888 coding_category_utf_16_le);
6889 }
6890 else
6891 {
6892 if (EQ (eol_type, Qunix))
6893 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6894 else if (EQ (eol_type, Qdos))
6895 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6896 else
6897 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6898 }
6899
6900 for (tail = val; CONSP (tail); tail = XCDR (tail))
6901 {
6902 enum coding_category category;
6903 int this_eol;
6904
6905 id = XINT (XCAR (tail));
6906 attrs = CODING_ID_ATTRS (id);
6907 category = XINT (CODING_ATTR_CATEGORY (attrs));
6908 eol_type = CODING_ID_EOL_TYPE (id);
6909 if (VECTORP (eol_type))
6910 {
6911 if (category == coding_category_utf_16_be
6912 || category == coding_category_utf_16_be_nosig)
6913 this_eol = utf_16_be_eol;
6914 else if (category == coding_category_utf_16_le
6915 || category == coding_category_utf_16_le_nosig)
6916 this_eol = utf_16_le_eol;
6917 else
6918 this_eol = normal_eol;
6919
6920 if (this_eol == EOL_SEEN_LF)
6921 XSETCAR (tail, AREF (eol_type, 0));
6922 else if (this_eol == EOL_SEEN_CRLF)
6923 XSETCAR (tail, AREF (eol_type, 1));
6924 else if (this_eol == EOL_SEEN_CR)
6925 XSETCAR (tail, AREF (eol_type, 2));
6926 else
6927 XSETCAR (tail, CODING_ID_NAME (id));
6928 }
6929 else
6930 XSETCAR (tail, CODING_ID_NAME (id));
6931 }
6932 }
6933
6934 return (highest ? XCAR (val) : val);
6935 }
6936
6937
6938 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6939 2, 3, 0,
6940 doc: /* Detect coding system of the text in the region between START and END.
6941 Return a list of possible coding systems ordered by priority.
6942
6943 If only ASCII characters are found, it returns a list of single element
6944 `undecided' or its subsidiary coding system according to a detected
6945 end-of-line format.
6946
6947 If optional argument HIGHEST is non-nil, return the coding system of
6948 highest priority. */)
6949 (start, end, highest)
6950 Lisp_Object start, end, highest;
6951 {
6952 int from, to;
6953 int from_byte, to_byte;
6954
6955 CHECK_NUMBER_COERCE_MARKER (start);
6956 CHECK_NUMBER_COERCE_MARKER (end);
6957
6958 validate_region (&start, &end);
6959 from = XINT (start), to = XINT (end);
6960 from_byte = CHAR_TO_BYTE (from);
6961 to_byte = CHAR_TO_BYTE (to);
6962
6963 if (from < GPT && to >= GPT)
6964 move_gap_both (to, to_byte);
6965
6966 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6967 to_byte - from_byte,
6968 !NILP (highest),
6969 !NILP (current_buffer
6970 ->enable_multibyte_characters),
6971 Qnil);
6972 }
6973
6974 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6975 1, 2, 0,
6976 doc: /* Detect coding system of the text in STRING.
6977 Return a list of possible coding systems ordered by priority.
6978
6979 If only ASCII characters are found, it returns a list of single element
6980 `undecided' or its subsidiary coding system according to a detected
6981 end-of-line format.
6982
6983 If optional argument HIGHEST is non-nil, return the coding system of
6984 highest priority. */)
6985 (string, highest)
6986 Lisp_Object string, highest;
6987 {
6988 CHECK_STRING (string);
6989
6990 return detect_coding_system (XSTRING (string)->data,
6991 STRING_BYTES (XSTRING (string)),
6992 !NILP (highest),
6993 STRING_MULTIBYTE (string),
6994 Qnil);
6995 }
6996
6997
6998 static INLINE int
6999 char_encodable_p (c, attrs)
7000 int c;
7001 Lisp_Object attrs;
7002 {
7003 Lisp_Object tail;
7004 struct charset *charset;
7005
7006 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7007 CONSP (tail); tail = XCDR (tail))
7008 {
7009 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7010 if (CHAR_CHARSET_P (c, charset))
7011 break;
7012 }
7013 return (! NILP (tail));
7014 }
7015
7016
7017 /* Return a list of coding systems that safely encode the text between
7018 START and END. If EXCLUDE is non-nil, it is a list of coding
7019 systems not to check. The returned list doesn't contain any such
7020 coding systems. In any case, if the text contains only ASCII or is
7021 unibyte, return t. */
7022
7023 DEFUN ("find-coding-systems-region-internal",
7024 Ffind_coding_systems_region_internal,
7025 Sfind_coding_systems_region_internal, 2, 3, 0,
7026 doc: /* Internal use only. */)
7027 (start, end, exclude)
7028 Lisp_Object start, end, exclude;
7029 {
7030 Lisp_Object coding_attrs_list, safe_codings;
7031 EMACS_INT start_byte, end_byte;
7032 const unsigned char *p, *pbeg, *pend;
7033 int c;
7034 Lisp_Object tail, elt;
7035
7036 if (STRINGP (start))
7037 {
7038 if (!STRING_MULTIBYTE (start)
7039 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
7040 return Qt;
7041 start_byte = 0;
7042 end_byte = STRING_BYTES (XSTRING (start));
7043 }
7044 else
7045 {
7046 CHECK_NUMBER_COERCE_MARKER (start);
7047 CHECK_NUMBER_COERCE_MARKER (end);
7048 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7049 args_out_of_range (start, end);
7050 if (NILP (current_buffer->enable_multibyte_characters))
7051 return Qt;
7052 start_byte = CHAR_TO_BYTE (XINT (start));
7053 end_byte = CHAR_TO_BYTE (XINT (end));
7054 if (XINT (end) - XINT (start) == end_byte - start_byte)
7055 return Qt;
7056
7057 if (XINT (start) < GPT && XINT (end) > GPT)
7058 {
7059 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7060 move_gap_both (XINT (start), start_byte);
7061 else
7062 move_gap_both (XINT (end), end_byte);
7063 }
7064 }
7065
7066 coding_attrs_list = Qnil;
7067 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7068 if (NILP (exclude)
7069 || NILP (Fmemq (XCAR (tail), exclude)))
7070 {
7071 Lisp_Object attrs;
7072
7073 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7074 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7075 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7076 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7077 }
7078
7079 if (STRINGP (start))
7080 p = pbeg = XSTRING (start)->data;
7081 else
7082 p = pbeg = BYTE_POS_ADDR (start_byte);
7083 pend = p + (end_byte - start_byte);
7084
7085 while (p < pend && ASCII_BYTE_P (*p)) p++;
7086 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7087
7088 while (p < pend)
7089 {
7090 if (ASCII_BYTE_P (*p))
7091 p++;
7092 else
7093 {
7094 c = STRING_CHAR_ADVANCE (p);
7095
7096 charset_map_loaded = 0;
7097 for (tail = coding_attrs_list; CONSP (tail);)
7098 {
7099 elt = XCAR (tail);
7100 if (NILP (elt))
7101 tail = XCDR (tail);
7102 else if (char_encodable_p (c, elt))
7103 tail = XCDR (tail);
7104 else if (CONSP (XCDR (tail)))
7105 {
7106 XSETCAR (tail, XCAR (XCDR (tail)));
7107 XSETCDR (tail, XCDR (XCDR (tail)));
7108 }
7109 else
7110 {
7111 XSETCAR (tail, Qnil);
7112 tail = XCDR (tail);
7113 }
7114 }
7115 if (charset_map_loaded)
7116 {
7117 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7118
7119 if (STRINGP (start))
7120 pbeg = XSTRING (start)->data;
7121 else
7122 pbeg = BYTE_POS_ADDR (start_byte);
7123 p = pbeg + p_offset;
7124 pend = pbeg + pend_offset;
7125 }
7126 }
7127 }
7128
7129 safe_codings = Qnil;
7130 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7131 if (! NILP (XCAR (tail)))
7132 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7133
7134 return safe_codings;
7135 }
7136
7137
7138 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7139 Scheck_coding_systems_region, 3, 3, 0,
7140 doc: /* Check if the region is encodable by coding systems.
7141
7142 START and END are buffer positions specifying the region.
7143 CODING-SYSTEM-LIST is a list of coding systems to check.
7144
7145 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7146 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7147 whole region, POS0, POS1, ... are buffer positions where non-encodable
7148 characters are found.
7149
7150 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7151 value is nil.
7152
7153 START may be a string. In that case, check if the string is
7154 encodable, and the value contains indices to the string instead of
7155 buffer positions. END is ignored. */)
7156 (start, end, coding_system_list)
7157 Lisp_Object start, end, coding_system_list;
7158 {
7159 Lisp_Object list;
7160 EMACS_INT start_byte, end_byte;
7161 int pos;
7162 const unsigned char *p, *pbeg, *pend;
7163 int c;
7164 Lisp_Object tail, elt;
7165
7166 if (STRINGP (start))
7167 {
7168 if (!STRING_MULTIBYTE (start)
7169 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7170 return Qnil;
7171 start_byte = 0;
7172 end_byte = STRING_BYTES (XSTRING (start));
7173 pos = 0;
7174 }
7175 else
7176 {
7177 CHECK_NUMBER_COERCE_MARKER (start);
7178 CHECK_NUMBER_COERCE_MARKER (end);
7179 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7180 args_out_of_range (start, end);
7181 if (NILP (current_buffer->enable_multibyte_characters))
7182 return Qnil;
7183 start_byte = CHAR_TO_BYTE (XINT (start));
7184 end_byte = CHAR_TO_BYTE (XINT (end));
7185 if (XINT (end) - XINT (start) == end_byte - start_byte)
7186 return Qt;
7187
7188 if (XINT (start) < GPT && XINT (end) > GPT)
7189 {
7190 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7191 move_gap_both (XINT (start), start_byte);
7192 else
7193 move_gap_both (XINT (end), end_byte);
7194 }
7195 pos = XINT (start);
7196 }
7197
7198 list = Qnil;
7199 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7200 {
7201 elt = XCAR (tail);
7202 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7203 Qnil)),
7204 list);
7205 }
7206
7207 if (STRINGP (start))
7208 p = pbeg = XSTRING (start)->data;
7209 else
7210 p = pbeg = BYTE_POS_ADDR (start_byte);
7211 pend = p + (end_byte - start_byte);
7212
7213 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7214 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7215
7216 while (p < pend)
7217 {
7218 if (ASCII_BYTE_P (*p))
7219 p++;
7220 else
7221 {
7222 c = STRING_CHAR_ADVANCE (p);
7223
7224 charset_map_loaded = 0;
7225 for (tail = list; CONSP (tail); tail = XCDR (tail))
7226 {
7227 elt = XCDR (XCAR (tail));
7228 if (! char_encodable_p (c, XCAR (elt)))
7229 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7230 }
7231 if (charset_map_loaded)
7232 {
7233 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7234
7235 if (STRINGP (start))
7236 pbeg = XSTRING (start)->data;
7237 else
7238 pbeg = BYTE_POS_ADDR (start_byte);
7239 p = pbeg + p_offset;
7240 pend = pbeg + pend_offset;
7241 }
7242 }
7243 pos++;
7244 }
7245
7246 tail = list;
7247 list = Qnil;
7248 for (; CONSP (tail); tail = XCDR (tail))
7249 {
7250 elt = XCAR (tail);
7251 if (CONSP (XCDR (XCDR (elt))))
7252 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7253 list);
7254 }
7255
7256 return list;
7257 }
7258
7259
7260
7261 Lisp_Object
7262 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7263 Lisp_Object start, end, coding_system, dst_object;
7264 int encodep, norecord;
7265 {
7266 struct coding_system coding;
7267 EMACS_INT from, from_byte, to, to_byte;
7268 Lisp_Object src_object;
7269
7270 CHECK_NUMBER_COERCE_MARKER (start);
7271 CHECK_NUMBER_COERCE_MARKER (end);
7272 if (NILP (coding_system))
7273 coding_system = Qno_conversion;
7274 else
7275 CHECK_CODING_SYSTEM (coding_system);
7276 src_object = Fcurrent_buffer ();
7277 if (NILP (dst_object))
7278 dst_object = src_object;
7279 else if (! EQ (dst_object, Qt))
7280 CHECK_BUFFER (dst_object);
7281
7282 validate_region (&start, &end);
7283 from = XFASTINT (start);
7284 from_byte = CHAR_TO_BYTE (from);
7285 to = XFASTINT (end);
7286 to_byte = CHAR_TO_BYTE (to);
7287
7288 setup_coding_system (coding_system, &coding);
7289 coding.mode |= CODING_MODE_LAST_BLOCK;
7290
7291 if (encodep)
7292 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7293 dst_object);
7294 else
7295 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7296 dst_object);
7297 if (! norecord)
7298 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7299
7300 if (coding.result != CODING_RESULT_SUCCESS)
7301 error ("Code conversion error: %d", coding.result);
7302
7303 return (BUFFERP (dst_object)
7304 ? make_number (coding.produced_char)
7305 : coding.dst_object);
7306 }
7307
7308
7309 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7310 3, 4, "r\nzCoding system: ",
7311 doc: /* Decode the current region from the specified coding system.
7312 When called from a program, takes four arguments:
7313 START, END, CODING-SYSTEM, and DESTINATION.
7314 START and END are buffer positions.
7315
7316 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7317 If nil, the region between START and END is replace by the decoded text.
7318 If buffer, the decoded text is inserted in the buffer.
7319 If t, the decoded text is returned.
7320
7321 This function sets `last-coding-system-used' to the precise coding system
7322 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7323 not fully specified.)
7324 It returns the length of the decoded text. */)
7325 (start, end, coding_system, destination)
7326 Lisp_Object start, end, coding_system, destination;
7327 {
7328 return code_convert_region (start, end, coding_system, destination, 0, 0);
7329 }
7330
7331 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7332 3, 4, "r\nzCoding system: ",
7333 doc: /* Encode the current region by specified coding system.
7334 When called from a program, takes three arguments:
7335 START, END, and CODING-SYSTEM. START and END are buffer positions.
7336
7337 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7338 If nil, the region between START and END is replace by the encoded text.
7339 If buffer, the encoded text is inserted in the buffer.
7340 If t, the encoded text is returned.
7341
7342 This function sets `last-coding-system-used' to the precise coding system
7343 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7344 not fully specified.)
7345 It returns the length of the encoded text. */)
7346 (start, end, coding_system, destination)
7347 Lisp_Object start, end, coding_system, destination;
7348 {
7349 return code_convert_region (start, end, coding_system, destination, 1, 0);
7350 }
7351
7352 Lisp_Object
7353 code_convert_string (string, coding_system, dst_object,
7354 encodep, nocopy, norecord)
7355 Lisp_Object string, coding_system, dst_object;
7356 int encodep, nocopy, norecord;
7357 {
7358 struct coding_system coding;
7359 EMACS_INT chars, bytes;
7360
7361 CHECK_STRING (string);
7362 if (NILP (coding_system))
7363 {
7364 if (! norecord)
7365 Vlast_coding_system_used = Qno_conversion;
7366 if (NILP (dst_object))
7367 return (nocopy ? Fcopy_sequence (string) : string);
7368 }
7369
7370 if (NILP (coding_system))
7371 coding_system = Qno_conversion;
7372 else
7373 CHECK_CODING_SYSTEM (coding_system);
7374 if (NILP (dst_object))
7375 dst_object = Qt;
7376 else if (! EQ (dst_object, Qt))
7377 CHECK_BUFFER (dst_object);
7378
7379 setup_coding_system (coding_system, &coding);
7380 coding.mode |= CODING_MODE_LAST_BLOCK;
7381 chars = XSTRING (string)->size;
7382 bytes = STRING_BYTES (XSTRING (string));
7383 if (encodep)
7384 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7385 else
7386 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7387 if (! norecord)
7388 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7389
7390 if (coding.result != CODING_RESULT_SUCCESS)
7391 error ("Code conversion error: %d", coding.result);
7392
7393 return (BUFFERP (dst_object)
7394 ? make_number (coding.produced_char)
7395 : coding.dst_object);
7396 }
7397
7398
7399 /* Encode or decode STRING according to CODING_SYSTEM.
7400 Do not set Vlast_coding_system_used.
7401
7402 This function is called only from macros DECODE_FILE and
7403 ENCODE_FILE, thus we ignore character composition. */
7404
7405 Lisp_Object
7406 code_convert_string_norecord (string, coding_system, encodep)
7407 Lisp_Object string, coding_system;
7408 int encodep;
7409 {
7410 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7411 }
7412
7413
7414 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7415 2, 4, 0,
7416 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7417
7418 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7419 if the decoding operation is trivial.
7420
7421 Optional fourth arg BUFFER non-nil meant that the decoded text is
7422 inserted in BUFFER instead of returned as a string. In this case,
7423 the return value is BUFFER.
7424
7425 This function sets `last-coding-system-used' to the precise coding system
7426 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7427 not fully specified. */)
7428 (string, coding_system, nocopy, buffer)
7429 Lisp_Object string, coding_system, nocopy, buffer;
7430 {
7431 return code_convert_string (string, coding_system, buffer,
7432 0, ! NILP (nocopy), 0);
7433 }
7434
7435 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7436 2, 4, 0,
7437 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7438
7439 Optional third arg NOCOPY non-nil means it is OK to return STRING
7440 itself if the encoding operation is trivial.
7441
7442 Optional fourth arg BUFFER non-nil meant that the encoded text is
7443 inserted in BUFFER instead of returned as a string. In this case,
7444 the return value is BUFFER.
7445
7446 This function sets `last-coding-system-used' to the precise coding system
7447 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7448 not fully specified.) */)
7449 (string, coding_system, nocopy, buffer)
7450 Lisp_Object string, coding_system, nocopy, buffer;
7451 {
7452 return code_convert_string (string, coding_system, buffer,
7453 1, ! NILP (nocopy), 1);
7454 }
7455
7456 \f
7457 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7458 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7459 Return the corresponding character. */)
7460 (code)
7461 Lisp_Object code;
7462 {
7463 Lisp_Object spec, attrs, val;
7464 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7465 int c;
7466
7467 CHECK_NATNUM (code);
7468 c = XFASTINT (code);
7469 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7470 attrs = AREF (spec, 0);
7471
7472 if (ASCII_BYTE_P (c)
7473 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7474 return code;
7475
7476 val = CODING_ATTR_CHARSET_LIST (attrs);
7477 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7478 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7479 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7480
7481 if (c <= 0x7F)
7482 charset = charset_roman;
7483 else if (c >= 0xA0 && c < 0xDF)
7484 {
7485 charset = charset_kana;
7486 c -= 0x80;
7487 }
7488 else
7489 {
7490 int s1 = c >> 8, s2 = c & 0xFF;
7491
7492 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7493 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7494 error ("Invalid code: %d", code);
7495 SJIS_TO_JIS (c);
7496 charset = charset_kanji;
7497 }
7498 c = DECODE_CHAR (charset, c);
7499 if (c < 0)
7500 error ("Invalid code: %d", code);
7501 return make_number (c);
7502 }
7503
7504
7505 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7506 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7507 Return the corresponding code in SJIS. */)
7508 (ch)
7509 Lisp_Object ch;
7510 {
7511 Lisp_Object spec, attrs, charset_list;
7512 int c;
7513 struct charset *charset;
7514 unsigned code;
7515
7516 CHECK_CHARACTER (ch);
7517 c = XFASTINT (ch);
7518 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7519 attrs = AREF (spec, 0);
7520
7521 if (ASCII_CHAR_P (c)
7522 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7523 return ch;
7524
7525 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7526 charset = char_charset (c, charset_list, &code);
7527 if (code == CHARSET_INVALID_CODE (charset))
7528 error ("Can't encode by shift_jis encoding: %d", c);
7529 JIS_TO_SJIS (code);
7530
7531 return make_number (code);
7532 }
7533
7534 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7535 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7536 Return the corresponding character. */)
7537 (code)
7538 Lisp_Object code;
7539 {
7540 Lisp_Object spec, attrs, val;
7541 struct charset *charset_roman, *charset_big5, *charset;
7542 int c;
7543
7544 CHECK_NATNUM (code);
7545 c = XFASTINT (code);
7546 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7547 attrs = AREF (spec, 0);
7548
7549 if (ASCII_BYTE_P (c)
7550 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7551 return code;
7552
7553 val = CODING_ATTR_CHARSET_LIST (attrs);
7554 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7555 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7556
7557 if (c <= 0x7F)
7558 charset = charset_roman;
7559 else
7560 {
7561 int b1 = c >> 8, b2 = c & 0x7F;
7562 if (b1 < 0xA1 || b1 > 0xFE
7563 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7564 error ("Invalid code: %d", code);
7565 charset = charset_big5;
7566 }
7567 c = DECODE_CHAR (charset, (unsigned )c);
7568 if (c < 0)
7569 error ("Invalid code: %d", code);
7570 return make_number (c);
7571 }
7572
7573 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7574 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7575 Return the corresponding character code in Big5. */)
7576 (ch)
7577 Lisp_Object ch;
7578 {
7579 Lisp_Object spec, attrs, charset_list;
7580 struct charset *charset;
7581 int c;
7582 unsigned code;
7583
7584 CHECK_CHARACTER (ch);
7585 c = XFASTINT (ch);
7586 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7587 attrs = AREF (spec, 0);
7588 if (ASCII_CHAR_P (c)
7589 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7590 return ch;
7591
7592 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7593 charset = char_charset (c, charset_list, &code);
7594 if (code == CHARSET_INVALID_CODE (charset))
7595 error ("Can't encode by Big5 encoding: %d", c);
7596
7597 return make_number (code);
7598 }
7599
7600 \f
7601 DEFUN ("set-terminal-coding-system-internal",
7602 Fset_terminal_coding_system_internal,
7603 Sset_terminal_coding_system_internal, 1, 1, 0,
7604 doc: /* Internal use only. */)
7605 (coding_system)
7606 Lisp_Object coding_system;
7607 {
7608 CHECK_SYMBOL (coding_system);
7609 setup_coding_system (Fcheck_coding_system (coding_system),
7610 &terminal_coding);
7611
7612 /* We had better not send unsafe characters to terminal. */
7613 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7614 /* Characer composition should be disabled. */
7615 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7616 terminal_coding.src_multibyte = 1;
7617 terminal_coding.dst_multibyte = 0;
7618 return Qnil;
7619 }
7620
7621 DEFUN ("set-safe-terminal-coding-system-internal",
7622 Fset_safe_terminal_coding_system_internal,
7623 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7624 doc: /* Internal use only. */)
7625 (coding_system)
7626 Lisp_Object coding_system;
7627 {
7628 CHECK_SYMBOL (coding_system);
7629 setup_coding_system (Fcheck_coding_system (coding_system),
7630 &safe_terminal_coding);
7631 /* Characer composition should be disabled. */
7632 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7633 safe_terminal_coding.src_multibyte = 1;
7634 safe_terminal_coding.dst_multibyte = 0;
7635 return Qnil;
7636 }
7637
7638 DEFUN ("terminal-coding-system",
7639 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7640 doc: /* Return coding system specified for terminal output. */)
7641 ()
7642 {
7643 return CODING_ID_NAME (terminal_coding.id);
7644 }
7645
7646 DEFUN ("set-keyboard-coding-system-internal",
7647 Fset_keyboard_coding_system_internal,
7648 Sset_keyboard_coding_system_internal, 1, 1, 0,
7649 doc: /* Internal use only. */)
7650 (coding_system)
7651 Lisp_Object coding_system;
7652 {
7653 CHECK_SYMBOL (coding_system);
7654 setup_coding_system (Fcheck_coding_system (coding_system),
7655 &keyboard_coding);
7656 /* Characer composition should be disabled. */
7657 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7658 return Qnil;
7659 }
7660
7661 DEFUN ("keyboard-coding-system",
7662 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7663 doc: /* Return coding system specified for decoding keyboard input. */)
7664 ()
7665 {
7666 return CODING_ID_NAME (keyboard_coding.id);
7667 }
7668
7669 \f
7670 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7671 Sfind_operation_coding_system, 1, MANY, 0,
7672 doc: /* Choose a coding system for an operation based on the target name.
7673 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7674 DECODING-SYSTEM is the coding system to use for decoding
7675 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7676 for encoding (in case OPERATION does encoding).
7677
7678 The first argument OPERATION specifies an I/O primitive:
7679 For file I/O, `insert-file-contents' or `write-region'.
7680 For process I/O, `call-process', `call-process-region', or `start-process'.
7681 For network I/O, `open-network-stream'.
7682
7683 The remaining arguments should be the same arguments that were passed
7684 to the primitive. Depending on which primitive, one of those arguments
7685 is selected as the TARGET. For example, if OPERATION does file I/O,
7686 whichever argument specifies the file name is TARGET.
7687
7688 TARGET has a meaning which depends on OPERATION:
7689 For file I/O, TARGET is a file name.
7690 For process I/O, TARGET is a process name.
7691 For network I/O, TARGET is a service name or a port number
7692
7693 This function looks up what specified for TARGET in,
7694 `file-coding-system-alist', `process-coding-system-alist',
7695 or `network-coding-system-alist' depending on OPERATION.
7696 They may specify a coding system, a cons of coding systems,
7697 or a function symbol to call.
7698 In the last case, we call the function with one argument,
7699 which is a list of all the arguments given to this function.
7700
7701 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7702 (nargs, args)
7703 int nargs;
7704 Lisp_Object *args;
7705 {
7706 Lisp_Object operation, target_idx, target, val;
7707 register Lisp_Object chain;
7708
7709 if (nargs < 2)
7710 error ("Too few arguments");
7711 operation = args[0];
7712 if (!SYMBOLP (operation)
7713 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7714 error ("Invalid first arguement");
7715 if (nargs < 1 + XINT (target_idx))
7716 error ("Too few arguments for operation: %s",
7717 XSYMBOL (operation)->name->data);
7718 target = args[XINT (target_idx) + 1];
7719 if (!(STRINGP (target)
7720 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7721 error ("Invalid %dth argument", XINT (target_idx) + 1);
7722
7723 chain = ((EQ (operation, Qinsert_file_contents)
7724 || EQ (operation, Qwrite_region))
7725 ? Vfile_coding_system_alist
7726 : (EQ (operation, Qopen_network_stream)
7727 ? Vnetwork_coding_system_alist
7728 : Vprocess_coding_system_alist));
7729 if (NILP (chain))
7730 return Qnil;
7731
7732 for (; CONSP (chain); chain = XCDR (chain))
7733 {
7734 Lisp_Object elt;
7735
7736 elt = XCAR (chain);
7737 if (CONSP (elt)
7738 && ((STRINGP (target)
7739 && STRINGP (XCAR (elt))
7740 && fast_string_match (XCAR (elt), target) >= 0)
7741 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7742 {
7743 val = XCDR (elt);
7744 /* Here, if VAL is both a valid coding system and a valid
7745 function symbol, we return VAL as a coding system. */
7746 if (CONSP (val))
7747 return val;
7748 if (! SYMBOLP (val))
7749 return Qnil;
7750 if (! NILP (Fcoding_system_p (val)))
7751 return Fcons (val, val);
7752 if (! NILP (Ffboundp (val)))
7753 {
7754 val = call1 (val, Flist (nargs, args));
7755 if (CONSP (val))
7756 return val;
7757 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7758 return Fcons (val, val);
7759 }
7760 return Qnil;
7761 }
7762 }
7763 return Qnil;
7764 }
7765
7766 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7767 Sset_coding_system_priority, 0, MANY, 0,
7768 doc: /* Assign higher priority to the coding systems given as arguments.
7769 If multiple coding systems belongs to the same category,
7770 all but the first one are ignored. */)
7771 (nargs, args)
7772 int nargs;
7773 Lisp_Object *args;
7774 {
7775 int i, j;
7776 int changed[coding_category_max];
7777 enum coding_category priorities[coding_category_max];
7778
7779 bzero (changed, sizeof changed);
7780
7781 for (i = j = 0; i < nargs; i++)
7782 {
7783 enum coding_category category;
7784 Lisp_Object spec, attrs;
7785
7786 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7787 attrs = AREF (spec, 0);
7788 category = XINT (CODING_ATTR_CATEGORY (attrs));
7789 if (changed[category])
7790 /* Ignore this coding system because a coding system of the
7791 same category already had a higher priority. */
7792 continue;
7793 changed[category] = 1;
7794 priorities[j++] = category;
7795 if (coding_categories[category].id >= 0
7796 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7797 setup_coding_system (args[i], &coding_categories[category]);
7798 Fset (AREF (Vcoding_category_table, category), args[i]);
7799 }
7800
7801 /* Now we have decided top J priorities. Reflect the order of the
7802 original priorities to the remaining priorities. */
7803
7804 for (i = j, j = 0; i < coding_category_max; i++, j++)
7805 {
7806 while (j < coding_category_max
7807 && changed[coding_priorities[j]])
7808 j++;
7809 if (j == coding_category_max)
7810 abort ();
7811 priorities[i] = coding_priorities[j];
7812 }
7813
7814 bcopy (priorities, coding_priorities, sizeof priorities);
7815
7816 /* Update `coding-category-list'. */
7817 Vcoding_category_list = Qnil;
7818 for (i = coding_category_max - 1; i >= 0; i--)
7819 Vcoding_category_list
7820 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7821 Vcoding_category_list);
7822
7823 return Qnil;
7824 }
7825
7826 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7827 Scoding_system_priority_list, 0, 1, 0,
7828 doc: /* Return a list of coding systems ordered by their priorities.
7829 HIGHESTP non-nil means just return the highest priority one. */)
7830 (highestp)
7831 Lisp_Object highestp;
7832 {
7833 int i;
7834 Lisp_Object val;
7835
7836 for (i = 0, val = Qnil; i < coding_category_max; i++)
7837 {
7838 enum coding_category category = coding_priorities[i];
7839 int id = coding_categories[category].id;
7840 Lisp_Object attrs;
7841
7842 if (id < 0)
7843 continue;
7844 attrs = CODING_ID_ATTRS (id);
7845 if (! NILP (highestp))
7846 return CODING_ATTR_BASE_NAME (attrs);
7847 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7848 }
7849 return Fnreverse (val);
7850 }
7851
7852 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7853
7854 static Lisp_Object
7855 make_subsidiaries (base)
7856 Lisp_Object base;
7857 {
7858 Lisp_Object subsidiaries;
7859 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7860 char *buf = (char *) alloca (base_name_len + 6);
7861 int i;
7862
7863 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7864 subsidiaries = Fmake_vector (make_number (3), Qnil);
7865 for (i = 0; i < 3; i++)
7866 {
7867 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7868 ASET (subsidiaries, i, intern (buf));
7869 }
7870 return subsidiaries;
7871 }
7872
7873
7874 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7875 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7876 doc: /* For internal use only.
7877 usage: (define-coding-system-internal ...) */)
7878 (nargs, args)
7879 int nargs;
7880 Lisp_Object *args;
7881 {
7882 Lisp_Object name;
7883 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7884 Lisp_Object attrs; /* Vector of attributes. */
7885 Lisp_Object eol_type;
7886 Lisp_Object aliases;
7887 Lisp_Object coding_type, charset_list, safe_charsets;
7888 enum coding_category category;
7889 Lisp_Object tail, val;
7890 int max_charset_id = 0;
7891 int i;
7892
7893 if (nargs < coding_arg_max)
7894 goto short_args;
7895
7896 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7897
7898 name = args[coding_arg_name];
7899 CHECK_SYMBOL (name);
7900 CODING_ATTR_BASE_NAME (attrs) = name;
7901
7902 val = args[coding_arg_mnemonic];
7903 if (! STRINGP (val))
7904 CHECK_CHARACTER (val);
7905 CODING_ATTR_MNEMONIC (attrs) = val;
7906
7907 coding_type = args[coding_arg_coding_type];
7908 CHECK_SYMBOL (coding_type);
7909 CODING_ATTR_TYPE (attrs) = coding_type;
7910
7911 charset_list = args[coding_arg_charset_list];
7912 if (SYMBOLP (charset_list))
7913 {
7914 if (EQ (charset_list, Qiso_2022))
7915 {
7916 if (! EQ (coding_type, Qiso_2022))
7917 error ("Invalid charset-list");
7918 charset_list = Viso_2022_charset_list;
7919 }
7920 else if (EQ (charset_list, Qemacs_mule))
7921 {
7922 if (! EQ (coding_type, Qemacs_mule))
7923 error ("Invalid charset-list");
7924 charset_list = Vemacs_mule_charset_list;
7925 }
7926 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7927 if (max_charset_id < XFASTINT (XCAR (tail)))
7928 max_charset_id = XFASTINT (XCAR (tail));
7929 }
7930 else
7931 {
7932 charset_list = Fcopy_sequence (charset_list);
7933 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7934 {
7935 struct charset *charset;
7936
7937 val = Fcar (tail);
7938 CHECK_CHARSET_GET_CHARSET (val, charset);
7939 if (EQ (coding_type, Qiso_2022)
7940 ? CHARSET_ISO_FINAL (charset) < 0
7941 : EQ (coding_type, Qemacs_mule)
7942 ? CHARSET_EMACS_MULE_ID (charset) < 0
7943 : 0)
7944 error ("Can't handle charset `%s'",
7945 XSYMBOL (CHARSET_NAME (charset))->name->data);
7946
7947 XCAR (tail) = make_number (charset->id);
7948 if (max_charset_id < charset->id)
7949 max_charset_id = charset->id;
7950 }
7951 }
7952 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7953
7954 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7955 make_number (255));
7956 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7957 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7958 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7959
7960 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7961
7962 val = args[coding_arg_decode_translation_table];
7963 if (! NILP (val))
7964 CHECK_CHAR_TABLE (val);
7965 CODING_ATTR_DECODE_TBL (attrs) = val;
7966
7967 val = args[coding_arg_encode_translation_table];
7968 if (! NILP (val))
7969 CHECK_CHAR_TABLE (val);
7970 CODING_ATTR_ENCODE_TBL (attrs) = val;
7971
7972 val = args[coding_arg_post_read_conversion];
7973 CHECK_SYMBOL (val);
7974 CODING_ATTR_POST_READ (attrs) = val;
7975
7976 val = args[coding_arg_pre_write_conversion];
7977 CHECK_SYMBOL (val);
7978 CODING_ATTR_PRE_WRITE (attrs) = val;
7979
7980 val = args[coding_arg_default_char];
7981 if (NILP (val))
7982 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7983 else
7984 {
7985 CHECK_CHARACTER (val);
7986 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7987 }
7988
7989 val = args[coding_arg_plist];
7990 CHECK_LIST (val);
7991 CODING_ATTR_PLIST (attrs) = val;
7992
7993 if (EQ (coding_type, Qcharset))
7994 {
7995 Lisp_Object list;
7996 /* Generate a lisp vector of 256 elements. Each element is nil,
7997 integer, or a list of charset IDs.
7998
7999 If Nth element is nil, the byte code N is invalid in this
8000 coding system.
8001
8002 If Nth element is a number NUM, N is the first byte of a
8003 charset whose ID is NUM.
8004
8005 If Nth element is a list of charset IDs, N is the first byte
8006 of one of them. The list is sorted by dimensions of the
8007 charsets. A charset of smaller dimension comes firtst.
8008 */
8009 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8010 {
8011 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8012
8013 if (charset->method == CHARSET_METHOD_SUPERSET)
8014 {
8015 val = CHARSET_SUPERSET (charset);
8016 for (; CONSP (val); val = XCDR (val))
8017 list = Fcons (XCAR (XCAR (val)), list);
8018 }
8019 else
8020 list = Fcons (XCAR (tail), list);
8021 }
8022
8023 val = Fmake_vector (make_number (256), Qnil);
8024
8025 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8026 {
8027 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8028 int dim = CHARSET_DIMENSION (charset);
8029 int idx = (dim - 1) * 4;
8030
8031 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8032 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8033
8034 for (i = charset->code_space[idx];
8035 i <= charset->code_space[idx + 1]; i++)
8036 {
8037 Lisp_Object tmp, tmp2;
8038 int dim2;
8039
8040 tmp = AREF (val, i);
8041 if (NILP (tmp))
8042 tmp = XCAR (tail);
8043 else if (NUMBERP (tmp))
8044 {
8045 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8046 if (dim < dim2)
8047 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8048 else
8049 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8050 }
8051 else
8052 {
8053 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8054 {
8055 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8056 if (dim < dim2)
8057 break;
8058 }
8059 if (NILP (tmp2))
8060 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8061 else
8062 {
8063 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8064 XSETCAR (tmp2, XCAR (tail));
8065 }
8066 }
8067 ASET (val, i, tmp);
8068 }
8069 }
8070 ASET (attrs, coding_attr_charset_valids, val);
8071 category = coding_category_charset;
8072 }
8073 else if (EQ (coding_type, Qccl))
8074 {
8075 Lisp_Object valids;
8076
8077 if (nargs < coding_arg_ccl_max)
8078 goto short_args;
8079
8080 val = args[coding_arg_ccl_decoder];
8081 CHECK_CCL_PROGRAM (val);
8082 if (VECTORP (val))
8083 val = Fcopy_sequence (val);
8084 ASET (attrs, coding_attr_ccl_decoder, val);
8085
8086 val = args[coding_arg_ccl_encoder];
8087 CHECK_CCL_PROGRAM (val);
8088 if (VECTORP (val))
8089 val = Fcopy_sequence (val);
8090 ASET (attrs, coding_attr_ccl_encoder, val);
8091
8092 val = args[coding_arg_ccl_valids];
8093 valids = Fmake_string (make_number (256), make_number (0));
8094 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8095 {
8096 int from, to;
8097
8098 val = Fcar (tail);
8099 if (INTEGERP (val))
8100 {
8101 from = to = XINT (val);
8102 if (from < 0 || from > 255)
8103 args_out_of_range_3 (val, make_number (0), make_number (255));
8104 }
8105 else
8106 {
8107 CHECK_CONS (val);
8108 CHECK_NUMBER (XCAR (val));
8109 CHECK_NUMBER (XCDR (val));
8110 from = XINT (XCAR (val));
8111 if (from < 0 || from > 255)
8112 args_out_of_range_3 (XCAR (val),
8113 make_number (0), make_number (255));
8114 to = XINT (XCDR (val));
8115 if (to < from || to > 255)
8116 args_out_of_range_3 (XCDR (val),
8117 XCAR (val), make_number (255));
8118 }
8119 for (i = from; i <= to; i++)
8120 XSTRING (valids)->data[i] = 1;
8121 }
8122 ASET (attrs, coding_attr_ccl_valids, valids);
8123
8124 category = coding_category_ccl;
8125 }
8126 else if (EQ (coding_type, Qutf_16))
8127 {
8128 Lisp_Object bom, endian;
8129
8130 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8131
8132 if (nargs < coding_arg_utf16_max)
8133 goto short_args;
8134
8135 bom = args[coding_arg_utf16_bom];
8136 if (! NILP (bom) && ! EQ (bom, Qt))
8137 {
8138 CHECK_CONS (bom);
8139 CHECK_CODING_SYSTEM (XCAR (bom));
8140 CHECK_CODING_SYSTEM (XCDR (bom));
8141 }
8142 ASET (attrs, coding_attr_utf_16_bom, bom);
8143
8144 endian = args[coding_arg_utf16_endian];
8145 CHECK_SYMBOL (endian);
8146 if (NILP (endian))
8147 endian = Qbig;
8148 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8149 error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
8150 ASET (attrs, coding_attr_utf_16_endian, endian);
8151
8152 category = (CONSP (bom)
8153 ? coding_category_utf_16_auto
8154 : NILP (bom)
8155 ? (EQ (endian, Qbig)
8156 ? coding_category_utf_16_be_nosig
8157 : coding_category_utf_16_le_nosig)
8158 : (EQ (endian, Qbig)
8159 ? coding_category_utf_16_be
8160 : coding_category_utf_16_le));
8161 }
8162 else if (EQ (coding_type, Qiso_2022))
8163 {
8164 Lisp_Object initial, reg_usage, request, flags;
8165 int i;
8166
8167 if (nargs < coding_arg_iso2022_max)
8168 goto short_args;
8169
8170 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8171 CHECK_VECTOR (initial);
8172 for (i = 0; i < 4; i++)
8173 {
8174 val = Faref (initial, make_number (i));
8175 if (! NILP (val))
8176 {
8177 struct charset *charset;
8178
8179 CHECK_CHARSET_GET_CHARSET (val, charset);
8180 ASET (initial, i, make_number (CHARSET_ID (charset)));
8181 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8182 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8183 }
8184 else
8185 ASET (initial, i, make_number (-1));
8186 }
8187
8188 reg_usage = args[coding_arg_iso2022_reg_usage];
8189 CHECK_CONS (reg_usage);
8190 CHECK_NATNUM (XCAR (reg_usage));
8191 CHECK_NATNUM (XCDR (reg_usage));
8192
8193 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8194 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8195 {
8196 int id;
8197
8198 val = Fcar (tail);
8199 CHECK_CONS (val);
8200 CHECK_CHARSET_GET_ID (XCAR (val), id);
8201 CHECK_NATNUM (XCDR (val));
8202 if (XINT (XCDR (val)) >= 4)
8203 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8204 XCAR (val) = make_number (id);
8205 }
8206
8207 flags = args[coding_arg_iso2022_flags];
8208 CHECK_NATNUM (flags);
8209 i = XINT (flags);
8210 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8211 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8212
8213 ASET (attrs, coding_attr_iso_initial, initial);
8214 ASET (attrs, coding_attr_iso_usage, reg_usage);
8215 ASET (attrs, coding_attr_iso_request, request);
8216 ASET (attrs, coding_attr_iso_flags, flags);
8217 setup_iso_safe_charsets (attrs);
8218
8219 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8220 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8221 | CODING_ISO_FLAG_SINGLE_SHIFT))
8222 ? coding_category_iso_7_else
8223 : EQ (args[coding_arg_charset_list], Qiso_2022)
8224 ? coding_category_iso_7
8225 : coding_category_iso_7_tight);
8226 else
8227 {
8228 int id = XINT (AREF (initial, 1));
8229
8230 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8231 || EQ (args[coding_arg_charset_list], Qiso_2022)
8232 || id < 0)
8233 ? coding_category_iso_8_else
8234 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8235 ? coding_category_iso_8_1
8236 : coding_category_iso_8_2);
8237 }
8238 if (category != coding_category_iso_8_1
8239 && category != coding_category_iso_8_2)
8240 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8241 }
8242 else if (EQ (coding_type, Qemacs_mule))
8243 {
8244 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8245 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8246 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8247 category = coding_category_emacs_mule;
8248 }
8249 else if (EQ (coding_type, Qshift_jis))
8250 {
8251
8252 struct charset *charset;
8253
8254 if (XINT (Flength (charset_list)) != 3)
8255 error ("There should be just three charsets");
8256
8257 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8258 if (CHARSET_DIMENSION (charset) != 1)
8259 error ("Dimension of charset %s is not one",
8260 XSYMBOL (CHARSET_NAME (charset))->name->data);
8261 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8262 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8263
8264 charset_list = XCDR (charset_list);
8265 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8266 if (CHARSET_DIMENSION (charset) != 1)
8267 error ("Dimension of charset %s is not one",
8268 XSYMBOL (CHARSET_NAME (charset))->name->data);
8269
8270 charset_list = XCDR (charset_list);
8271 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8272 if (CHARSET_DIMENSION (charset) != 2)
8273 error ("Dimension of charset %s is not two",
8274 XSYMBOL (CHARSET_NAME (charset))->name->data);
8275
8276 category = coding_category_sjis;
8277 Vsjis_coding_system = name;
8278 }
8279 else if (EQ (coding_type, Qbig5))
8280 {
8281 struct charset *charset;
8282
8283 if (XINT (Flength (charset_list)) != 2)
8284 error ("There should be just two charsets");
8285
8286 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8287 if (CHARSET_DIMENSION (charset) != 1)
8288 error ("Dimension of charset %s is not one",
8289 XSYMBOL (CHARSET_NAME (charset))->name->data);
8290 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8291 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8292
8293 charset_list = XCDR (charset_list);
8294 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8295 if (CHARSET_DIMENSION (charset) != 2)
8296 error ("Dimension of charset %s is not two",
8297 XSYMBOL (CHARSET_NAME (charset))->name->data);
8298
8299 category = coding_category_big5;
8300 Vbig5_coding_system = name;
8301 }
8302 else if (EQ (coding_type, Qraw_text))
8303 {
8304 category = coding_category_raw_text;
8305 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8306 }
8307 else if (EQ (coding_type, Qutf_8))
8308 {
8309 category = coding_category_utf_8;
8310 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8311 }
8312 else if (EQ (coding_type, Qundecided))
8313 category = coding_category_undecided;
8314 else
8315 error ("Invalid coding system type: %s",
8316 XSYMBOL (coding_type)->name->data);
8317
8318 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8319 CODING_ATTR_PLIST (attrs)
8320 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8321 CODING_ATTR_PLIST (attrs)));
8322
8323 eol_type = args[coding_arg_eol_type];
8324 if (! NILP (eol_type)
8325 && ! EQ (eol_type, Qunix)
8326 && ! EQ (eol_type, Qdos)
8327 && ! EQ (eol_type, Qmac))
8328 error ("Invalid eol-type");
8329
8330 aliases = Fcons (name, Qnil);
8331
8332 if (NILP (eol_type))
8333 {
8334 eol_type = make_subsidiaries (name);
8335 for (i = 0; i < 3; i++)
8336 {
8337 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8338
8339 this_name = AREF (eol_type, i);
8340 this_aliases = Fcons (this_name, Qnil);
8341 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8342 this_spec = Fmake_vector (make_number (3), attrs);
8343 ASET (this_spec, 1, this_aliases);
8344 ASET (this_spec, 2, this_eol_type);
8345 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8346 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8347 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8348 Vcoding_system_alist);
8349 }
8350 }
8351
8352 spec_vec = Fmake_vector (make_number (3), attrs);
8353 ASET (spec_vec, 1, aliases);
8354 ASET (spec_vec, 2, eol_type);
8355
8356 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8357 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8358 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8359 Vcoding_system_alist);
8360
8361 {
8362 int id = coding_categories[category].id;
8363
8364 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8365 setup_coding_system (name, &coding_categories[category]);
8366 }
8367
8368 return Qnil;
8369
8370 short_args:
8371 return Fsignal (Qwrong_number_of_arguments,
8372 Fcons (intern ("define-coding-system-internal"),
8373 make_number (nargs)));
8374 }
8375
8376 /* Fixme: should this record the alias relationships for
8377 diagnostics? Should it update coding-system-list? */
8378 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8379 Sdefine_coding_system_alias, 2, 2, 0,
8380 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8381 (alias, coding_system)
8382 Lisp_Object alias, coding_system;
8383 {
8384 Lisp_Object spec, aliases, eol_type;
8385
8386 CHECK_SYMBOL (alias);
8387 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8388 aliases = AREF (spec, 1);
8389 while (!NILP (XCDR (aliases)))
8390 aliases = XCDR (aliases);
8391 XCDR (aliases) = Fcons (alias, Qnil);
8392
8393 eol_type = AREF (spec, 2);
8394 if (VECTORP (eol_type))
8395 {
8396 Lisp_Object subsidiaries;
8397 int i;
8398
8399 subsidiaries = make_subsidiaries (alias);
8400 for (i = 0; i < 3; i++)
8401 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8402 AREF (eol_type, i));
8403
8404 ASET (spec, 2, subsidiaries);
8405 }
8406
8407 Fputhash (alias, spec, Vcoding_system_hash_table);
8408 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8409 Vcoding_system_alist);
8410
8411 return Qnil;
8412 }
8413
8414 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8415 1, 1, 0,
8416 doc: /* Return the base of CODING-SYSTEM.
8417 Any alias or subsidiary coding system is not a base coding system. */)
8418 (coding_system)
8419 Lisp_Object coding_system;
8420 {
8421 Lisp_Object spec, attrs;
8422
8423 if (NILP (coding_system))
8424 return (Qno_conversion);
8425 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8426 attrs = AREF (spec, 0);
8427 return CODING_ATTR_BASE_NAME (attrs);
8428 }
8429
8430 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8431 1, 1, 0,
8432 doc: "Return the property list of CODING-SYSTEM.")
8433 (coding_system)
8434 Lisp_Object coding_system;
8435 {
8436 Lisp_Object spec, attrs;
8437
8438 if (NILP (coding_system))
8439 coding_system = Qno_conversion;
8440 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8441 attrs = AREF (spec, 0);
8442 return CODING_ATTR_PLIST (attrs);
8443 }
8444
8445
8446 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8447 1, 1, 0,
8448 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8449 (coding_system)
8450 Lisp_Object coding_system;
8451 {
8452 Lisp_Object spec;
8453
8454 if (NILP (coding_system))
8455 coding_system = Qno_conversion;
8456 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8457 return AREF (spec, 1);
8458 }
8459
8460 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8461 Scoding_system_eol_type, 1, 1, 0,
8462 doc: /* Return eol-type of CODING-SYSTEM.
8463 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8464
8465 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8466 and CR respectively.
8467
8468 A vector value indicates that a format of end-of-line should be
8469 detected automatically. Nth element of the vector is the subsidiary
8470 coding system whose eol-type is N. */)
8471 (coding_system)
8472 Lisp_Object coding_system;
8473 {
8474 Lisp_Object spec, eol_type;
8475 int n;
8476
8477 if (NILP (coding_system))
8478 coding_system = Qno_conversion;
8479 if (! CODING_SYSTEM_P (coding_system))
8480 return Qnil;
8481 spec = CODING_SYSTEM_SPEC (coding_system);
8482 eol_type = AREF (spec, 2);
8483 if (VECTORP (eol_type))
8484 return Fcopy_sequence (eol_type);
8485 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8486 return make_number (n);
8487 }
8488
8489 #endif /* emacs */
8490
8491 \f
8492 /*** 9. Post-amble ***/
8493
8494 void
8495 init_coding_once ()
8496 {
8497 int i;
8498
8499 for (i = 0; i < coding_category_max; i++)
8500 {
8501 coding_categories[i].id = -1;
8502 coding_priorities[i] = i;
8503 }
8504
8505 /* ISO2022 specific initialize routine. */
8506 for (i = 0; i < 0x20; i++)
8507 iso_code_class[i] = ISO_control_0;
8508 for (i = 0x21; i < 0x7F; i++)
8509 iso_code_class[i] = ISO_graphic_plane_0;
8510 for (i = 0x80; i < 0xA0; i++)
8511 iso_code_class[i] = ISO_control_1;
8512 for (i = 0xA1; i < 0xFF; i++)
8513 iso_code_class[i] = ISO_graphic_plane_1;
8514 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8515 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8516 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8517 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8518 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8519 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8520 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8521 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8522 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8523 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8524
8525 inhibit_pre_post_conversion = 0;
8526
8527 for (i = 0; i < 256; i++)
8528 {
8529 emacs_mule_bytes[i] = 1;
8530 }
8531 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8532 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8533 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8534 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8535 }
8536
8537 #ifdef emacs
8538
8539 void
8540 syms_of_coding ()
8541 {
8542 staticpro (&Vcoding_system_hash_table);
8543 Vcoding_system_hash_table = Fmakehash (Qeq);
8544
8545 staticpro (&Vsjis_coding_system);
8546 Vsjis_coding_system = Qnil;
8547
8548 staticpro (&Vbig5_coding_system);
8549 Vbig5_coding_system = Qnil;
8550
8551 staticpro (&Vcode_conversion_work_buf_list);
8552 Vcode_conversion_work_buf_list = Qnil;
8553
8554 staticpro (&Vcode_conversion_reused_work_buf);
8555 Vcode_conversion_reused_work_buf = Qnil;
8556
8557 DEFSYM (Qcharset, "charset");
8558 DEFSYM (Qtarget_idx, "target-idx");
8559 DEFSYM (Qcoding_system_history, "coding-system-history");
8560 Fset (Qcoding_system_history, Qnil);
8561
8562 /* Target FILENAME is the first argument. */
8563 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8564 /* Target FILENAME is the third argument. */
8565 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8566
8567 DEFSYM (Qcall_process, "call-process");
8568 /* Target PROGRAM is the first argument. */
8569 Fput (Qcall_process, Qtarget_idx, make_number (0));
8570
8571 DEFSYM (Qcall_process_region, "call-process-region");
8572 /* Target PROGRAM is the third argument. */
8573 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8574
8575 DEFSYM (Qstart_process, "start-process");
8576 /* Target PROGRAM is the third argument. */
8577 Fput (Qstart_process, Qtarget_idx, make_number (2));
8578
8579 DEFSYM (Qopen_network_stream, "open-network-stream");
8580 /* Target SERVICE is the fourth argument. */
8581 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8582
8583 DEFSYM (Qcoding_system, "coding-system");
8584 DEFSYM (Qcoding_aliases, "coding-aliases");
8585
8586 DEFSYM (Qeol_type, "eol-type");
8587 DEFSYM (Qunix, "unix");
8588 DEFSYM (Qdos, "dos");
8589
8590 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8591 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8592 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8593 DEFSYM (Qdefault_char, "default-char");
8594 DEFSYM (Qundecided, "undecided");
8595 DEFSYM (Qno_conversion, "no-conversion");
8596 DEFSYM (Qraw_text, "raw-text");
8597
8598 DEFSYM (Qiso_2022, "iso-2022");
8599
8600 DEFSYM (Qutf_8, "utf-8");
8601
8602 DEFSYM (Qutf_16, "utf-16");
8603 DEFSYM (Qbig, "big");
8604 DEFSYM (Qlittle, "little");
8605
8606 DEFSYM (Qshift_jis, "shift-jis");
8607 DEFSYM (Qbig5, "big5");
8608
8609 DEFSYM (Qcoding_system_p, "coding-system-p");
8610
8611 DEFSYM (Qcoding_system_error, "coding-system-error");
8612 Fput (Qcoding_system_error, Qerror_conditions,
8613 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8614 Fput (Qcoding_system_error, Qerror_message,
8615 build_string ("Invalid coding system"));
8616
8617 /* Intern this now in case it isn't already done.
8618 Setting this variable twice is harmless.
8619 But don't staticpro it here--that is done in alloc.c. */
8620 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8621
8622 DEFSYM (Qtranslation_table, "translation-table");
8623 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8624 DEFSYM (Qtranslation_table_id, "translation-table-id");
8625 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8626 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8627
8628 DEFSYM (Qvalid_codes, "valid-codes");
8629
8630 DEFSYM (Qemacs_mule, "emacs-mule");
8631
8632 DEFSYM (QCcategory, ":category");
8633
8634 Vcoding_category_table
8635 = Fmake_vector (make_number (coding_category_max), Qnil);
8636 staticpro (&Vcoding_category_table);
8637 /* Followings are target of code detection. */
8638 ASET (Vcoding_category_table, coding_category_iso_7,
8639 intern ("coding-category-iso-7"));
8640 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8641 intern ("coding-category-iso-7-tight"));
8642 ASET (Vcoding_category_table, coding_category_iso_8_1,
8643 intern ("coding-category-iso-8-1"));
8644 ASET (Vcoding_category_table, coding_category_iso_8_2,
8645 intern ("coding-category-iso-8-2"));
8646 ASET (Vcoding_category_table, coding_category_iso_7_else,
8647 intern ("coding-category-iso-7-else"));
8648 ASET (Vcoding_category_table, coding_category_iso_8_else,
8649 intern ("coding-category-iso-8-else"));
8650 ASET (Vcoding_category_table, coding_category_utf_8,
8651 intern ("coding-category-utf-8"));
8652 ASET (Vcoding_category_table, coding_category_utf_16_be,
8653 intern ("coding-category-utf-16-be"));
8654 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8655 intern ("coding-category-utf-16-auto"));
8656 ASET (Vcoding_category_table, coding_category_utf_16_le,
8657 intern ("coding-category-utf-16-le"));
8658 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8659 intern ("coding-category-utf-16-be-nosig"));
8660 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8661 intern ("coding-category-utf-16-le-nosig"));
8662 ASET (Vcoding_category_table, coding_category_charset,
8663 intern ("coding-category-charset"));
8664 ASET (Vcoding_category_table, coding_category_sjis,
8665 intern ("coding-category-sjis"));
8666 ASET (Vcoding_category_table, coding_category_big5,
8667 intern ("coding-category-big5"));
8668 ASET (Vcoding_category_table, coding_category_ccl,
8669 intern ("coding-category-ccl"));
8670 ASET (Vcoding_category_table, coding_category_emacs_mule,
8671 intern ("coding-category-emacs-mule"));
8672 /* Followings are NOT target of code detection. */
8673 ASET (Vcoding_category_table, coding_category_raw_text,
8674 intern ("coding-category-raw-text"));
8675 ASET (Vcoding_category_table, coding_category_undecided,
8676 intern ("coding-category-undecided"));
8677
8678 defsubr (&Scoding_system_p);
8679 defsubr (&Sread_coding_system);
8680 defsubr (&Sread_non_nil_coding_system);
8681 defsubr (&Scheck_coding_system);
8682 defsubr (&Sdetect_coding_region);
8683 defsubr (&Sdetect_coding_string);
8684 defsubr (&Sfind_coding_systems_region_internal);
8685 defsubr (&Scheck_coding_systems_region);
8686 defsubr (&Sdecode_coding_region);
8687 defsubr (&Sencode_coding_region);
8688 defsubr (&Sdecode_coding_string);
8689 defsubr (&Sencode_coding_string);
8690 defsubr (&Sdecode_sjis_char);
8691 defsubr (&Sencode_sjis_char);
8692 defsubr (&Sdecode_big5_char);
8693 defsubr (&Sencode_big5_char);
8694 defsubr (&Sset_terminal_coding_system_internal);
8695 defsubr (&Sset_safe_terminal_coding_system_internal);
8696 defsubr (&Sterminal_coding_system);
8697 defsubr (&Sset_keyboard_coding_system_internal);
8698 defsubr (&Skeyboard_coding_system);
8699 defsubr (&Sfind_operation_coding_system);
8700 defsubr (&Sset_coding_system_priority);
8701 defsubr (&Sdefine_coding_system_internal);
8702 defsubr (&Sdefine_coding_system_alias);
8703 defsubr (&Scoding_system_base);
8704 defsubr (&Scoding_system_plist);
8705 defsubr (&Scoding_system_aliases);
8706 defsubr (&Scoding_system_eol_type);
8707 defsubr (&Scoding_system_priority_list);
8708
8709 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8710 doc: /* List of coding systems.
8711
8712 Do not alter the value of this variable manually. This variable should be
8713 updated by the functions `define-coding-system' and
8714 `define-coding-system-alias'. */);
8715 Vcoding_system_list = Qnil;
8716
8717 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8718 doc: /* Alist of coding system names.
8719 Each element is one element list of coding system name.
8720 This variable is given to `completing-read' as TABLE argument.
8721
8722 Do not alter the value of this variable manually. This variable should be
8723 updated by the functions `make-coding-system' and
8724 `define-coding-system-alias'. */);
8725 Vcoding_system_alist = Qnil;
8726
8727 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8728 doc: /* List of coding-categories (symbols) ordered by priority.
8729
8730 On detecting a coding system, Emacs tries code detection algorithms
8731 associated with each coding-category one by one in this order. When
8732 one algorithm agrees with a byte sequence of source text, the coding
8733 system bound to the corresponding coding-category is selected. */);
8734 {
8735 int i;
8736
8737 Vcoding_category_list = Qnil;
8738 for (i = coding_category_max - 1; i >= 0; i--)
8739 Vcoding_category_list
8740 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8741 Vcoding_category_list);
8742 }
8743
8744 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8745 doc: /* Specify the coding system for read operations.
8746 It is useful to bind this variable with `let', but do not set it globally.
8747 If the value is a coding system, it is used for decoding on read operation.
8748 If not, an appropriate element is used from one of the coding system alists:
8749 There are three such tables, `file-coding-system-alist',
8750 `process-coding-system-alist', and `network-coding-system-alist'. */);
8751 Vcoding_system_for_read = Qnil;
8752
8753 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8754 doc: /* Specify the coding system for write operations.
8755 Programs bind this variable with `let', but you should not set it globally.
8756 If the value is a coding system, it is used for encoding of output,
8757 when writing it to a file and when sending it to a file or subprocess.
8758
8759 If this does not specify a coding system, an appropriate element
8760 is used from one of the coding system alists:
8761 There are three such tables, `file-coding-system-alist',
8762 `process-coding-system-alist', and `network-coding-system-alist'.
8763 For output to files, if the above procedure does not specify a coding system,
8764 the value of `buffer-file-coding-system' is used. */);
8765 Vcoding_system_for_write = Qnil;
8766
8767 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8768 doc: /*
8769 Coding system used in the latest file or process I/O. */);
8770 Vlast_coding_system_used = Qnil;
8771
8772 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8773 doc: /*
8774 *Non-nil means always inhibit code conversion of end-of-line format.
8775 See info node `Coding Systems' and info node `Text and Binary' concerning
8776 such conversion. */);
8777 inhibit_eol_conversion = 0;
8778
8779 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8780 doc: /*
8781 Non-nil means process buffer inherits coding system of process output.
8782 Bind it to t if the process output is to be treated as if it were a file
8783 read from some filesystem. */);
8784 inherit_process_coding_system = 0;
8785
8786 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8787 doc: /*
8788 Alist to decide a coding system to use for a file I/O operation.
8789 The format is ((PATTERN . VAL) ...),
8790 where PATTERN is a regular expression matching a file name,
8791 VAL is a coding system, a cons of coding systems, or a function symbol.
8792 If VAL is a coding system, it is used for both decoding and encoding
8793 the file contents.
8794 If VAL is a cons of coding systems, the car part is used for decoding,
8795 and the cdr part is used for encoding.
8796 If VAL is a function symbol, the function must return a coding system
8797 or a cons of coding systems which are used as above. The function gets
8798 the arguments with which `find-operation-coding-systems' was called.
8799
8800 See also the function `find-operation-coding-system'
8801 and the variable `auto-coding-alist'. */);
8802 Vfile_coding_system_alist = Qnil;
8803
8804 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8805 doc: /*
8806 Alist to decide a coding system to use for a process I/O operation.
8807 The format is ((PATTERN . VAL) ...),
8808 where PATTERN is a regular expression matching a program name,
8809 VAL is a coding system, a cons of coding systems, or a function symbol.
8810 If VAL is a coding system, it is used for both decoding what received
8811 from the program and encoding what sent to the program.
8812 If VAL is a cons of coding systems, the car part is used for decoding,
8813 and the cdr part is used for encoding.
8814 If VAL is a function symbol, the function must return a coding system
8815 or a cons of coding systems which are used as above.
8816
8817 See also the function `find-operation-coding-system'. */);
8818 Vprocess_coding_system_alist = Qnil;
8819
8820 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8821 doc: /*
8822 Alist to decide a coding system to use for a network I/O operation.
8823 The format is ((PATTERN . VAL) ...),
8824 where PATTERN is a regular expression matching a network service name
8825 or is a port number to connect to,
8826 VAL is a coding system, a cons of coding systems, or a function symbol.
8827 If VAL is a coding system, it is used for both decoding what received
8828 from the network stream and encoding what sent to the network stream.
8829 If VAL is a cons of coding systems, the car part is used for decoding,
8830 and the cdr part is used for encoding.
8831 If VAL is a function symbol, the function must return a coding system
8832 or a cons of coding systems which are used as above.
8833
8834 See also the function `find-operation-coding-system'. */);
8835 Vnetwork_coding_system_alist = Qnil;
8836
8837 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8838 doc: /* Coding system to use with system messages.
8839 Also used for decoding keyboard input on X Window system. */);
8840 Vlocale_coding_system = Qnil;
8841
8842 /* The eol mnemonics are reset in startup.el system-dependently. */
8843 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8844 doc: /*
8845 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8846 eol_mnemonic_unix = build_string (":");
8847
8848 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8849 doc: /*
8850 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8851 eol_mnemonic_dos = build_string ("\\");
8852
8853 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8854 doc: /*
8855 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8856 eol_mnemonic_mac = build_string ("/");
8857
8858 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8859 doc: /*
8860 *String displayed in mode line when end-of-line format is not yet determined. */);
8861 eol_mnemonic_undecided = build_string (":");
8862
8863 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8864 doc: /*
8865 *Non-nil enables character translation while encoding and decoding. */);
8866 Venable_character_translation = Qt;
8867
8868 DEFVAR_LISP ("standard-translation-table-for-decode",
8869 &Vstandard_translation_table_for_decode,
8870 doc: /* Table for translating characters while decoding. */);
8871 Vstandard_translation_table_for_decode = Qnil;
8872
8873 DEFVAR_LISP ("standard-translation-table-for-encode",
8874 &Vstandard_translation_table_for_encode,
8875 doc: /* Table for translating characters while encoding. */);
8876 Vstandard_translation_table_for_encode = Qnil;
8877
8878 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8879 doc: /* Alist of charsets vs revision numbers.
8880 While encoding, if a charset (car part of an element) is found,
8881 designate it with the escape sequence identifying revision (cdr part
8882 of the element). */);
8883 Vcharset_revision_table = Qnil;
8884
8885 DEFVAR_LISP ("default-process-coding-system",
8886 &Vdefault_process_coding_system,
8887 doc: /* Cons of coding systems used for process I/O by default.
8888 The car part is used for decoding a process output,
8889 the cdr part is used for encoding a text to be sent to a process. */);
8890 Vdefault_process_coding_system = Qnil;
8891
8892 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8893 doc: /*
8894 Table of extra Latin codes in the range 128..159 (inclusive).
8895 This is a vector of length 256.
8896 If Nth element is non-nil, the existence of code N in a file
8897 \(or output of subprocess) doesn't prevent it to be detected as
8898 a coding system of ISO 2022 variant which has a flag
8899 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8900 or reading output of a subprocess.
8901 Only 128th through 159th elements has a meaning. */);
8902 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8903
8904 DEFVAR_LISP ("select-safe-coding-system-function",
8905 &Vselect_safe_coding_system_function,
8906 doc: /*
8907 Function to call to select safe coding system for encoding a text.
8908
8909 If set, this function is called to force a user to select a proper
8910 coding system which can encode the text in the case that a default
8911 coding system used in each operation can't encode the text.
8912
8913 The default value is `select-safe-coding-system' (which see). */);
8914 Vselect_safe_coding_system_function = Qnil;
8915
8916 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8917 &inhibit_iso_escape_detection,
8918 doc: /*
8919 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8920
8921 By default, on reading a file, Emacs tries to detect how the text is
8922 encoded. This code detection is sensitive to escape sequences. If
8923 the sequence is valid as ISO2022, the code is determined as one of
8924 the ISO2022 encodings, and the file is decoded by the corresponding
8925 coding system (e.g. `iso-2022-7bit').
8926
8927 However, there may be a case that you want to read escape sequences in
8928 a file as is. In such a case, you can set this variable to non-nil.
8929 Then, as the code detection ignores any escape sequences, no file is
8930 detected as encoded in some ISO2022 encoding. The result is that all
8931 escape sequences become visible in a buffer.
8932
8933 The default value is nil, and it is strongly recommended not to change
8934 it. That is because many Emacs Lisp source files that contain
8935 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8936 in Emacs's distribution, and they won't be decoded correctly on
8937 reading if you suppress escape sequence detection.
8938
8939 The other way to read escape sequences in a file without decoding is
8940 to explicitly specify some coding system that doesn't use ISO2022's
8941 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8942 inhibit_iso_escape_detection = 0;
8943
8944 {
8945 Lisp_Object args[coding_arg_max];
8946 Lisp_Object plist[14];
8947 int i;
8948
8949 for (i = 0; i < coding_arg_max; i++)
8950 args[i] = Qnil;
8951
8952 plist[0] = intern (":name");
8953 plist[1] = args[coding_arg_name] = Qno_conversion;
8954 plist[2] = intern (":mnemonic");
8955 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8956 plist[4] = intern (":coding-type");
8957 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8958 plist[6] = intern (":ascii-compatible-p");
8959 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8960 plist[8] = intern (":default-char");
8961 plist[9] = args[coding_arg_default_char] = make_number (0);
8962 plist[10] = intern (":docstring");
8963 plist[11] = build_string ("Do no conversion.\n\
8964 \n\
8965 When you visit a file with this coding, the file is read into a\n\
8966 unibyte buffer as is, thus each byte of a file is treated as a\n\
8967 character.");
8968 plist[12] = intern (":eol-type");
8969 plist[13] = args[coding_arg_eol_type] = Qunix;
8970 args[coding_arg_plist] = Flist (14, plist);
8971 Fdefine_coding_system_internal (coding_arg_max, args);
8972 }
8973
8974 setup_coding_system (Qno_conversion, &keyboard_coding);
8975 setup_coding_system (Qno_conversion, &terminal_coding);
8976 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8977
8978 {
8979 int i;
8980
8981 for (i = 0; i < coding_category_max; i++)
8982 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
8983 }
8984 }
8985
8986 char *
8987 emacs_strerror (error_number)
8988 int error_number;
8989 {
8990 char *str;
8991
8992 synchronize_system_messages_locale ();
8993 str = strerror (error_number);
8994
8995 if (! NILP (Vlocale_coding_system))
8996 {
8997 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8998 Vlocale_coding_system,
8999 0);
9000 str = (char *) XSTRING (dec)->data;
9001 }
9002
9003 return str;
9004 }
9005
9006 #endif /* emacs */