]> code.delx.au - gnu-emacs/blob - src/coding.c
New directory
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /*** TABLE OF CONTENTS ***
24
25 0. General comments
26 1. Preamble
27 2. Emacs' internal format (emacs-mule) handlers
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
35
36 */
37
38 /*** 0. General comments ***/
39
40
41 /*** GENERAL NOTE on CODING SYSTEMS ***
42
43 A coding system is an encoding mechanism for one or more character
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
46 Emacs' internal format (emacs-mule), and when we say "encode",
47 it means converting the coding system emacs-mule to some other
48 coding system.
49
50 0. Emacs' internal format (emacs-mule)
51
52 Emacs itself holds a multi-lingual character in buffers and strings
53 in a special format. Details are described in section 2.
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
66 section 4.
67
68 3. BIG5
69
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
75
76 4. Raw text
77
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
80
81 5. Other
82
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
90 information about it is set in a structure of type `struct
91 coding_system' for rapid processing. See section 6 for more details.
92
93 */
94
95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
99 whereas DOS's format is two-byte sequence of `carriage-return' and
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
102
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
107
108 */
109
110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
114 which appropriate flag bits for the category XXX are set. The flag
115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
117 of the range 0x80..0x9F are in multibyte form. */
118 #if 0
119 int
120 detect_coding_emacs_mule (src, src_end, multibytep)
121 unsigned char *src, *src_end;
122 int multibytep;
123 {
124 ...
125 }
126 #endif
127
128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
134
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
140
141 DST_BYTES zero means that the source area and destination area are
142 overlapped, which means that we can produce a decoded text until it
143 reaches the head of the not-yet-decoded source text.
144
145 Below is a template for these functions. */
146 #if 0
147 static void
148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 {
153 ...
154 }
155 #endif
156
157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
163
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
169
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
173
174 Below is a template for these functions. */
175 #if 0
176 static void
177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
181 {
182 ...
183 }
184 #endif
185
186 /*** COMMONLY USED MACROS ***/
187
/* ONE_MORE_BYTE and TWO_MORE_BYTES safely fetch one and two bytes,
   respectively, from the source text.  When the source does not hold
   enough bytes, they set coding->result to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   The caller must have set the variables `coding', `src' and
   `src_end' to appropriate pointers in advance.  These macros are
   called from the decoding routines `decode_coding_XXX', so the
   source text is assumed to be unibyte.  */

#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
205
/* Fetch the next two source bytes into C1 and C2.  If fewer than two
   bytes remain, nothing is consumed: coding->result is set to
   CODING_FINISH_INSUFFICIENT_SRC and control jumps to
   `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = src[0];                                                \
    c2 = src[1];                                                \
    src += 2;                                                   \
  } while (0)
216
217
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero: a LEADING_CODE_8_BIT_CONTROL byte
   followed by a second byte is fetched as one code, namely the second
   byte minus 0x20.

   If the source ends right after the leading code, the trailing byte
   is not read; instead coding->result is set to
   CODING_FINISH_INSUFFICIENT_SRC and control jumps to
   `label_end_of_loop', exactly as when no byte is available at all.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
      {                                                         \
        /* Never read the byte at SRC_END.  */                  \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
232
/* Fetch the next character of the source text at `src' into C and
   advance `src' past it.  If no source bytes are left, set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop'.  The caller must have set up the variables
   `coding', `src', `src_end' and `translation_table' beforehand; when
   `translation_table' is non-nil, the character is passed through
   translate_char before it is stored in C.

   This macro serves the encoding routines `encode_coding_XXX', so the
   source text is taken to be in multibyte form except for 8-bit
   characters.  Those are in multibyte form when coding->src_multibyte
   is nonzero, and are otherwise represented by a single byte.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int avail = src_end - src;                                  \
    int nbytes;                                                 \
                                                                \
    if (avail <= 0)                                             \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, avail, nbytes))   \
      {                                                         \
        /* A lone byte stands for itself.  */                   \
        c = *src;                                               \
        nbytes = 1;                                             \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, avail, nbytes);          \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += nbytes;                                              \
  } while (0)
261
262
/* Write the multibyte form of character C at `dst' and count it in
   coding->produced_char.  If `dst' lacks room for it, set
   coding->result to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.

   While a composition sequence is being decoded, the character may
   be an ALTCHAR of the current composition.  It is written to `dst'
   only when the composition is COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE, and, unless the composition is
   COMPOSITION_RELATIVE, it is recorded as a component in
   coding->cmp_data->data.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int char_len = CHAR_BYTES (c);                                  \
                                                                        \
        if ((dst + char_len) <= (dst_bytes ? dst_end : src))            \
          {                                                             \
            dst += CHAR_STRING (c, dst);                                \
            coding->produced_char++;                                    \
          }                                                             \
        else                                                            \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        if (coding->composing != COMPOSITION_RELATIVE)                  \
          {                                                             \
            /* Adding the component may end the composition, so       \
               test coding->composing only afterwards.  */            \
            CODING_ADD_COMPOSITION_COMPONENT (coding, c);               \
            coding->composition_rule_follows                            \
              = coding->composing != COMPOSITION_WITH_ALTCHARS;         \
          }                                                             \
      }                                                                 \
  } while (0)
297
298
/* Store the single byte C at `dst' and advance `dst'.  The write
   limit is `dst_end' when `dst_bytes' is nonzero, else `src'.  If
   `dst' has reached the limit, set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
308
/* Store the bytes C1 and C2, in that order, at `dst' and advance
   `dst' by two.  The write limit is `dst_end' when `dst_bytes' is
   nonzero, else `src'.  If two bytes do not fit, store nothing, set
   coding->result to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = c1;                                            \
        *dst++ = c2;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
318
/* Copy the bytes in the range [FROM, TO) to `dst', advancing both
   FROM and `dst'.  The write limit is `dst_end' when `dst_bytes' is
   nonzero, else `src'.  If the whole range does not fit, copy
   nothing, set coding->result to CODING_FINISH_INSUFFICIENT_DST and
   jump to `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) <= (dst_bytes ? dst_end : src))       \
      {                                                         \
        for (; from < to; from++)                               \
          *dst++ = *from;                                       \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
329
330 \f
331 /*** 1. Preamble ***/
332
333 #ifdef emacs
334 #include <config.h>
335 #endif
336
337 #include <stdio.h>
338
339 #ifdef emacs
340
341 #include "lisp.h"
342 #include "buffer.h"
343 #include "charset.h"
344 #include "composite.h"
345 #include "ccl.h"
346 #include "coding.h"
347 #include "window.h"
348 #include "intervals.h"
349
350 #else /* not emacs */
351
352 #include "mulelib.h"
353
354 #endif /* not emacs */
355
356 Lisp_Object Qcoding_system, Qeol_type;
357 Lisp_Object Qbuffer_file_coding_system;
358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
359 Lisp_Object Qno_conversion, Qundecided;
360 Lisp_Object Qcoding_system_history;
361 Lisp_Object Qsafe_chars;
362 Lisp_Object Qvalid_codes;
363
364 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
366 Lisp_Object Qstart_process, Qopen_network_stream;
367 Lisp_Object Qtarget_idx;
368
369 Lisp_Object Vselect_safe_coding_system_function;
370
371 int coding_system_require_warning;
372
373 /* Mnemonic string for each format of end-of-line. */
374 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
375 /* Mnemonic string to indicate format of end-of-line is not yet
376 decided. */
377 Lisp_Object eol_mnemonic_undecided;
378
379 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
380 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
381 int system_eol_type;
382
#ifdef emacs

/* Which coding systems are safe for which characters.  The value has
   the form (GENERIC-LIST . NON-GENERIC-ALIST), where

   GENERIC-LIST is a list of generic coding systems, i.e. those that
   can encode any character, and

   NON-GENERIC-ALIST is an alist mapping each non-generic coding
   system to the char table that contains its safe characters.  */
Lisp_Object Vcoding_system_safe_chars;

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text convert only the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

Lisp_Object Qutf_8;

/* Emacs Lisp programs and the C internal routines hand coding
   systems to each other through the following three variables.  */

/* Coding system for reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 holding information about special Latin
   codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Nonzero inhibits code conversion of the end-of-line format.  */
int inhibit_eol_conversion;

/* Nonzero inhibits detection of ISO2022 escape sequences.  */
int inhibit_iso_escape_detection;

/* Nonzero makes buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
447
448 Lisp_Object Qcoding_category, Qcoding_category_index;
449
450 /* List of symbols `coding-category-xxx' ordered by priority. */
451 Lisp_Object Vcoding_category_list;
452
453 /* Table of coding categories (Lisp symbols). */
454 Lisp_Object Vcoding_category_table;
455
456 /* Table of names of symbol for each coding-category. */
457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
458 "coding-category-emacs-mule",
459 "coding-category-sjis",
460 "coding-category-iso-7",
461 "coding-category-iso-7-tight",
462 "coding-category-iso-8-1",
463 "coding-category-iso-8-2",
464 "coding-category-iso-7-else",
465 "coding-category-iso-8-else",
466 "coding-category-ccl",
467 "coding-category-big5",
468 "coding-category-utf-8",
469 "coding-category-utf-16-be",
470 "coding-category-utf-16-le",
471 "coding-category-raw-text",
472 "coding-category-binary"
473 };
474
475 /* Table of pointers to coding systems corresponding to each coding
476 categories. */
477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
478
479 /* Table of coding category masks. Nth element is a mask for a coding
480 category of which priority is Nth. */
481 static
482 int coding_priorities[CODING_CATEGORY_IDX_MAX];
483
484 /* Flag to tell if we look up translation table on character code
485 conversion. */
486 Lisp_Object Venable_character_translation;
487 /* Standard translation table to look up on decoding (reading). */
488 Lisp_Object Vstandard_translation_table_for_decode;
489 /* Standard translation table to look up on encoding (writing). */
490 Lisp_Object Vstandard_translation_table_for_encode;
491
492 Lisp_Object Qtranslation_table;
493 Lisp_Object Qtranslation_table_id;
494 Lisp_Object Qtranslation_table_for_decode;
495 Lisp_Object Qtranslation_table_for_encode;
496
497 /* Alist of charsets vs revision number. */
498 Lisp_Object Vcharset_revision_alist;
499
500 /* Default coding systems used for process I/O. */
501 Lisp_Object Vdefault_process_coding_system;
502
503 /* Char table for translating Quail and self-inserting input. */
504 Lisp_Object Vtranslation_table_for_input;
505
506 /* Global flag to tell that we can't call post-read-conversion and
507 pre-write-conversion functions. Usually the value is zero, but it
508 is set to 1 temporarily while such functions are running. This is
509 to avoid infinite recursive call. */
510 static int inhibit_pre_post_conversion;
511
512 Lisp_Object Qchar_coding_system;
513
514 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
515 its validity. */
516
517 Lisp_Object
518 coding_safe_chars (coding_system)
519 Lisp_Object coding_system;
520 {
521 Lisp_Object coding_spec, plist, safe_chars;
522
523 coding_spec = Fget (coding_system, Qcoding_system);
524 plist = XVECTOR (coding_spec)->contents[3];
525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
527 }
528
/* Nonzero if character C is safe according to SAFE_CHARS: either
   SAFE_CHARS is Qt, or its entry for C is non-nil.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) ? 1 : !NILP (CHAR_TABLE_REF (safe_chars, c)))
531
532 \f
533 /*** 2. Emacs internal format (emacs-mule) handlers ***/
534
535 /* Emacs' internal format for representation of multiple character
536 sets is a kind of multi-byte encoding, i.e. characters are
537 represented by variable-length sequences of one-byte codes.
538
539 ASCII characters and control characters (e.g. `tab', `newline') are
540 represented by one-byte sequences which are their ASCII codes, in
541 the range 0x00 through 0x7F.
542
543 8-bit characters of the range 0x80..0x9F are represented by
544 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
545 code + 0x20).
546
547 8-bit characters of the range 0xA0..0xFF are represented by
548 one-byte sequences which are their 8-bit code.
549
550 The other characters are represented by a sequence of `base
551 leading-code', optional `extended leading-code', and one or two
552 `position-code's. The length of the sequence is determined by the
553 base leading-code. Leading-code takes the range 0x81 through 0x9D,
554 whereas extended leading-code and position-code take the range 0xA0
555 through 0xFF. See `charset.h' for more details about leading-code
556 and position-code.
557
558 --- CODE RANGE of Emacs' internal format ---
559 character set range
560 ------------- -----
561 ascii 0x00..0x7F
562 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
563 eight-bit-graphic 0xA0..0xFF
564 ELSE 0x81..0x9D + [0xA0..0xFF]+
565 ---------------------------------------------
566
567 As this is the internal character representation, the format is
568 usually not used externally (i.e. in a file or in a data sent to a
569 process). But, it is possible to have a text externally in this
570 format (i.e. by encoding by the coding system `emacs-mule').
571
572 In that case, a sequence of one-byte codes has a slightly different
573 form.
574
575 Firstly, all characters in eight-bit-control are represented by
576 one-byte sequences which are their 8-bit code.
577
578 Next, character composition data are represented by the byte
579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
580 where,
581 METHOD is 0xF0 plus one of composition method (enum
582 composition_method),
583
584 BYTES is 0xA0 plus the byte length of these composition data,
585
586 CHARS is 0xA0 plus the number of characters composed by these
587 data,
588
589 COMPONENTs are characters of multibyte form or composition
590 rules encoded by two-byte of ASCII codes.
591
592 In addition, for backward compatibility, the following formats are
593 also recognized as composition data on decoding.
594
595 0x80 MSEQ ...
596 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
597
598 Here,
599 MSEQ is a multibyte form but in this special format:
600 ASCII: 0xA0 ASCII_CODE+0x80,
601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
602 RULE is a one byte code of the range 0xA0..0xF0 that
603 represents a composition rule.
604 */
605
/* Table of 256 code classes.  NOTE(review): presumably indexed by
   byte value and filled in by initialization code elsewhere in this
   file -- confirm before relying on it.  */
enum emacs_code_class_type emacs_code_class[256];
607
608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
609 Check if a text is encoded in Emacs' internal format. If it is,
610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
611
612 static int
613 detect_coding_emacs_mule (src, src_end, multibytep)
614 unsigned char *src, *src_end;
615 int multibytep;
616 {
617 unsigned char c;
618 int composing = 0;
619 /* Dummy for ONE_MORE_BYTE. */
620 struct coding_system dummy_coding;
621 struct coding_system *coding = &dummy_coding;
622
623 while (1)
624 {
625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
626
627 if (composing)
628 {
629 if (c < 0xA0)
630 composing = 0;
631 else if (c == 0xA0)
632 {
633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
634 c &= 0x7F;
635 }
636 else
637 c -= 0x20;
638 }
639
640 if (c < 0x20)
641 {
642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
643 return 0;
644 }
645 else if (c >= 0x80 && c < 0xA0)
646 {
647 if (c == 0x80)
648 /* Old leading code for a composite character. */
649 composing = 1;
650 else
651 {
652 unsigned char *src_base = src - 1;
653 int bytes;
654
655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
656 bytes))
657 return 0;
658 src = src_base + bytes;
659 }
660 }
661 }
662 label_end_of_loop:
663 return CODING_CATEGORY_MASK_EMACS_MULE;
664 }
665
666
/* Open a new composition record in CODING->cmp_data: remember where
   it begins in CODING->cmp_data_start, mark its length slot with -1,
   and store the starting position START (offset by
   cmp_data->char_offset) and the METHOD.  Slot 2 is left alone; four
   slots are reserved in all.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *slot = cmp_data->data + cmp_data->used;                        \
                                                                        \
    coding->cmp_data_start = cmp_data->used;                            \
    slot[0] = -1;                                                       \
    slot[1] = cmp_data->char_offset + start;                            \
    slot[3] = (int) method;                                             \
    cmp_data->used += 4;                                                \
  } while (0)
679
/* Close the current composition record of CODING: store the number
   of slots it occupies in its slot 0, and the ending position END
   (offset by cmp_data->char_offset) in its slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *slot = cmp_data->data + coding->cmp_data_start;                \
                                                                        \
    slot[2] = cmp_data->char_offset + end;                              \
    slot[0] = cmp_data->used - coding->cmp_data_start;                  \
  } while (0)
689
/* Append one COMPONENT (alternate character or composition rule) to
   the current composition record of CODING.  When the record thereby
   reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, close it at
   CODING->produced_char and reset CODING->composing to
   COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used] = component;         \
    coding->cmp_data->used++;                                           \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
702
703
/* Get one byte from the data pointed to by SRC and increment SRC.
   If SRC is not less than SRC_END, yield -1 without incrementing
   SRC.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
708
709
/* Decode a character represented as a component of a composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form at P, and advance P past it.  If no valid
   character is found, set C to -1.

   A component takes one of two forms:
     ASCII: 0xA0 followed by ASCII_CODE + 0x80.  The second byte must
            be at least 0xA0; the character is that byte minus 0x80.
     other: LEADING_CODE + 0x20 followed by the remaining bytes of
            the multibyte form.

   The caller must provide `src', `src_end' and `coding'.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
  do {                                                          \
    int bytes;                                                  \
                                                                \
    c = SAFE_ONE_MORE_BYTE ();                                  \
    if (c < 0)                                                  \
      break;                                                    \
    if (CHAR_HEAD_P (c))                                        \
      c = -1;                                                   \
    else if (c == 0xA0)                                         \
      {                                                         \
        c = SAFE_ONE_MORE_BYTE ();                              \
        if (c < 0xA0)                                           \
          c = -1;                                               \
        else                                                    \
          {                                                     \
            /* The byte is ASCII_CODE + 0x80, so strip 0x80    \
               (not 0xA0) to recover the ASCII character.  */  \
            c -= 0x80;                                          \
            *p++ = c;                                           \
          }                                                     \
      }                                                         \
    else if (BASE_LEADING_CODE_P (c - 0x20))                    \
      {                                                         \
        unsigned char *p0 = p;                                  \
                                                                \
        c -= 0x20;                                              \
        *p++ = c;                                               \
        bytes = BYTES_BY_CHAR_HEAD (c);                         \
        while (--bytes)                                         \
          {                                                     \
            c = SAFE_ONE_MORE_BYTE ();                          \
            if (c < 0)                                          \
              break;                                            \
            *p++ = c;                                           \
          }                                                     \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
            || (coding->flags /* We are recovering a file.  */  \
                && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
                && ! CHAR_HEAD_P (p0[1])))                      \
          c = STRING_CHAR (p0, bytes);                          \
        else                                                    \
          c = -1;                                               \
      }                                                         \
    else                                                        \
      c = -1;                                                   \
  } while (0)
760
761
/* Decode a composition rule represented as a component of a
   composition sequence of Emacs 20 style at SRC.  The rule is one
   byte whose value minus 0xA0 lies in 0..80.  That value is split
   into `gref' (quotient by 9) and `nref' (remainder), and C is set
   to COMPOSITION_ENCODE_RULE (gref, nref).  If no valid rule is
   found, set C to -1.  The caller must provide `src', `src_end',
   `gref' and `nref'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE ();                          \
    c -= 0xA0;                                          \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
778
779
780 /* Decode composition sequence encoded by `emacs-mule' at the source
781 pointed by SRC. SRC_END is the end of source. Store information
782 of the composition in CODING->cmp_data.
783
784 For backward compatibility, decode also a composition sequence of
785 Emacs 20 style. In that case, the composition sequence contains
786 characters that should be extracted into a buffer or string. Store
787 those characters at *DESTINATION in multibyte form.
788
789 If we encounter an invalid byte sequence, return 0.
790 If we encounter an insufficient source or destination, or
791 insufficient space in CODING->cmp_data, return 1.
792 Otherwise, return consumed bytes in the source.
793
794 */
795 static INLINE int
796 decode_composition_emacs_mule (coding, src, src_end,
797 destination, dst_end, dst_bytes)
798 struct coding_system *coding;
799 unsigned char *src, *src_end, **destination, *dst_end;
800 int dst_bytes;
801 {
802 unsigned char *dst = *destination;
803 int method, data_len, nchars;
804 unsigned char *src_base = src++;
805 /* Store components of composition. */
806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
807 int ncomponent;
808 /* Store multibyte form of characters to be composed. This is for
809 Emacs 20 style composition sequence. */
810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
811 unsigned char *bufp = buf;
812 int c, i, gref, nref;
813
814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
815 >= COMPOSITION_DATA_SIZE)
816 {
817 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
818 return -1;
819 }
820
821 ONE_MORE_BYTE (c);
822 if (c - 0xF0 >= COMPOSITION_RELATIVE
823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
824 {
825 int with_rule;
826
827 method = c - 0xF0;
828 with_rule = (method == COMPOSITION_WITH_RULE
829 || method == COMPOSITION_WITH_RULE_ALTCHARS);
830 ONE_MORE_BYTE (c);
831 data_len = c - 0xA0;
832 if (data_len < 4
833 || src_base + data_len > src_end)
834 return 0;
835 ONE_MORE_BYTE (c);
836 nchars = c - 0xA0;
837 if (c < 1)
838 return 0;
839 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
840 {
841 /* If it is longer than this, it can't be valid. */
842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
843 return 0;
844
845 if (ncomponent % 2 && with_rule)
846 {
847 ONE_MORE_BYTE (gref);
848 gref -= 32;
849 ONE_MORE_BYTE (nref);
850 nref -= 32;
851 c = COMPOSITION_ENCODE_RULE (gref, nref);
852 }
853 else
854 {
855 int bytes;
856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
857 || (coding->flags /* We are recovering a file. */
858 && src[0] == LEADING_CODE_8_BIT_CONTROL
859 && ! CHAR_HEAD_P (src[1])))
860 c = STRING_CHAR (src, bytes);
861 else
862 c = *src, bytes = 1;
863 src += bytes;
864 }
865 component[ncomponent] = c;
866 }
867 }
868 else
869 {
870 /* This may be an old Emacs 20 style format. See the comment at
871 the section 2 of this file. */
872 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
873 if (src == src_end
874 && !(coding->mode & CODING_MODE_LAST_BLOCK))
875 goto label_end_of_loop;
876
877 src_end = src;
878 src = src_base + 1;
879 if (c < 0xC0)
880 {
881 method = COMPOSITION_RELATIVE;
882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
883 {
884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
885 if (c < 0)
886 break;
887 component[ncomponent++] = c;
888 }
889 if (ncomponent < 2)
890 return 0;
891 nchars = ncomponent;
892 }
893 else if (c == 0xFF)
894 {
895 method = COMPOSITION_WITH_RULE;
896 src++;
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 return 0;
900 component[0] = c;
901 for (ncomponent = 1;
902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
903 {
904 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
905 if (c < 0)
906 break;
907 component[ncomponent++] = c;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 break;
911 component[ncomponent++] = c;
912 }
913 if (ncomponent < 3)
914 return 0;
915 nchars = (ncomponent + 1) / 2;
916 }
917 else
918 return 0;
919 }
920
921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
922 {
923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
924 for (i = 0; i < ncomponent; i++)
925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
927 if (buf < bufp)
928 {
929 unsigned char *p = buf;
930 EMIT_BYTES (p, bufp);
931 *destination += bufp - buf;
932 coding->produced_char += nchars;
933 }
934 return (src - src_base);
935 }
936 label_end_of_loop:
937 return -1;
938 }
939
940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
941
/* Decode SRC_BYTES bytes of emacs-mule text at SOURCE into DESTINATION.
   A CR or CR LF is turned into a newline as CODING->eol_type dictates,
   a 0x80 byte is passed to decode_composition_emacs_mule when
   CODING->cmp_data is set, and any other byte sequence is either
   copied unchanged or re-encoded with CHAR_STRING.  When DST_BYTES is
   zero, output is limited by the read position SRC rather than by
   DST_END.  On return CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char are updated;
   CODING->result is set when decoding stops on an inconsistent
   end-of-line or on lack of destination space.  */
942 static void
943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
944 struct coding_system *coding;
945 unsigned char *source, *destination;
946 int src_bytes, dst_bytes;
947 {
948 unsigned char *src = source;
949 unsigned char *src_end = source + src_bytes;
950 unsigned char *dst = destination;
951 unsigned char *dst_end = destination + dst_bytes;
952 /* SRC_BASE remembers the start position in source in each loop.
953 The loop will be exited when there's not enough source code, or
954 when there's not enough destination area to produce a
955 character. */
956 unsigned char *src_base;
957
958 coding->produced_char = 0;
959 while ((src_base = src) < src_end)
960 {
961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
962 int bytes;
963
/* NOTE(review): the stores through DST in this branch and in the
   '\n' branch below are not preceded by the destination-space check
   made at the bottom of the loop -- presumably the caller guarantees
   room; confirm.  */
964 if (*src == '\r')
965 {
966 int c = *src++;
967
968 if (coding->eol_type == CODING_EOL_CR)
969 c = '\n';
970 else if (coding->eol_type == CODING_EOL_CRLF)
971 {
/* Presumably jumps to label_end_of_loop when no byte is left, so
   that a CR at the very end of SOURCE is not consumed -- confirm
   against the definition of ONE_MORE_BYTE.  */
972 ONE_MORE_BYTE (c);
/* A CR that is not followed by LF is kept as a plain CR.  */
973 if (c != '\n')
974 {
975 src--;
976 c = '\r';
977 }
978 }
979 *dst++ = c;
980 coding->produced_char++;
981 continue;
982 }
983 else if (*src == '\n')
984 {
985 if ((coding->eol_type == CODING_EOL_CR
986 || coding->eol_type == CODING_EOL_CRLF)
987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
988 {
989 coding->result = CODING_FINISH_INCONSISTENT_EOL;
990 goto label_end_of_loop;
991 }
992 *dst++ = *src++;
993 coding->produced_char++;
994 continue;
995 }
996 else if (*src == 0x80 && coding->cmp_data)
997 {
998 /* Start of composition data. */
999 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1000 &dst, dst_end,
1001 dst_bytes);
/* A negative value stops decoding, a positive value is the number
   of source bytes to skip, and zero means the 0x80 byte is handled
   below like any other lone byte.  */
1002 if (consumed < 0)
1003 goto label_end_of_loop;
1004 else if (consumed > 0)
1005 {
1006 src += consumed;
1007 continue;
1008 }
1009 bytes = CHAR_STRING (*src, tmp);
1010 p = tmp;
1011 src++;
1012 }
/* BYTES is not assigned inside the branch below, so it is presumably
   set by UNIBYTE_STR_AS_MULTIBYTE_P even when that test fails --
   confirm.  NOTE(review): src[1] is read without being compared
   against SRC_END.  */
1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014 || (coding->flags /* We are recovering a file. */
1015 && src[0] == LEADING_CODE_8_BIT_CONTROL
1016 && ! CHAR_HEAD_P (src[1])))
1017 {
1018 p = src;
1019 src += bytes;
1020 }
1021 else
1022 {
1023 bytes = CHAR_STRING (*src, tmp);
1024 p = tmp;
1025 src++;
1026 }
/* Not enough room: stop before this character.  SRC_BASE still
   points at its first byte, so it is not counted as consumed.  */
1027 if (dst + bytes >= (dst_bytes ? dst_end : src))
1028 {
1029 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1030 break;
1031 }
1032 while (bytes--) *dst++ = *p++;
1033 coding->produced_char++;
1034 }
1035 label_end_of_loop:
/* Both counters receive the number of source bytes consumed.  */
1036 coding->consumed = coding->consumed_char = src_base - source;
1037 coding->produced = dst - destination;
1038 }
1039
1040
1041 /* Encode the composition data stored at DATA into a special byte
   sequence starting with 0x80 and append it at DST.  DATA[0] is the
   length of the record, DATA[3] the composition method and
   DATA[2] - DATA[1] the number of composed characters; the components
   start at DATA[4] and, for the two rule-based methods, alternate
   with rules from DATA[5] on.  The emitted header is 0x80,
   0xF0 + METHOD, 0xA0 + BYTES, 0xA0 + CHARS, where BYTES is the
   length of the whole sequence including the four header bytes.
   Update CODING->cmp_data_start and maybe CODING->cmp_data for the
   next call.  If the sequence does not fit, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   The macro relies on DST, DST_END, DST_BYTES, SRC and
   label_end_of_loop being defined in the expanding function.
   NOTE(review): BUF is a fixed 1024-byte array filled without a
   visible bound check; presumably the record length is bounded
   elsewhere -- confirm.  */
1044
1045 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1046 do { \
1047 unsigned char buf[1024], *p0 = buf, *p; \
1048 int len = data[0]; \
1049 int i; \
1050 \
1051 buf[0] = 0x80; \
1052 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1053 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1054 p = buf + 4; \
1055 if (data[3] == COMPOSITION_WITH_RULE \
1056 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1057 { \
1058 p += CHAR_STRING (data[4], p); \
1059 for (i = 5; i < len; i += 2) \
1060 { \
1061 int gref, nref; \
1062 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1063 *p++ = 0x20 + gref; \
1064 *p++ = 0x20 + nref; \
1065 p += CHAR_STRING (data[i + 1], p); \
1066 } \
1067 } \
1068 else \
1069 { \
1070 for (i = 4; i < len; i++) \
1071 p += CHAR_STRING (data[i], p); \
1072 } \
1073 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES, header included */ \
1074 \
1075 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1076 { \
1077 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1078 goto label_end_of_loop; \
1079 } \
1080 while (p0 < p) \
1081 *dst++ = *p0++; \
1082 coding->cmp_data_start += data[0]; \
1083 if (coding->cmp_data_start == coding->cmp_data->used \
1084 && coding->cmp_data->next) \
1085 { \
1086 coding->cmp_data = coding->cmp_data->next; \
1087 coding->cmp_data_start = 0; \
1088 } \
1089 } while (0)
1090
1091
1092 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1093 unsigned char *, int, int));
1094
1095 static void
1096 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1097 struct coding_system *coding;
1098 unsigned char *source, *destination;
1099 int src_bytes, dst_bytes;
1100 {
1101 unsigned char *src = source;
1102 unsigned char *src_end = source + src_bytes;
1103 unsigned char *dst = destination;
1104 unsigned char *dst_end = destination + dst_bytes;
1105 unsigned char *src_base;
1106 int c;
1107 int char_offset;
1108 int *data;
1109
1110 Lisp_Object translation_table;
1111
1112 translation_table = Qnil;
1113
1114 /* Optimization for the case that there's no composition. */
1115 if (!coding->cmp_data || coding->cmp_data->used == 0)
1116 {
1117 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1118 return;
1119 }
1120
1121 char_offset = coding->cmp_data->char_offset;
1122 data = coding->cmp_data->data + coding->cmp_data_start;
1123 while (1)
1124 {
1125 src_base = src;
1126
1127 /* If SRC starts a composition, encode the information about the
1128 composition in advance. */
1129 if (coding->cmp_data_start < coding->cmp_data->used
1130 && char_offset + coding->consumed_char == data[1])
1131 {
1132 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1133 char_offset = coding->cmp_data->char_offset;
1134 data = coding->cmp_data->data + coding->cmp_data_start;
1135 }
1136
1137 ONE_MORE_CHAR (c);
1138 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1139 || coding->eol_type == CODING_EOL_CR))
1140 {
1141 if (coding->eol_type == CODING_EOL_CRLF)
1142 EMIT_TWO_BYTES ('\r', c);
1143 else
1144 EMIT_ONE_BYTE ('\r');
1145 }
1146 else if (SINGLE_BYTE_CHAR_P (c))
1147 {
1148 if (coding->flags && ! ASCII_BYTE_P (c))
1149 {
1150 /* As we are auto saving, retain the multibyte form for
1151 8-bit chars. */
1152 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1153 int bytes = CHAR_STRING (c, buf);
1154
1155 if (bytes == 1)
1156 EMIT_ONE_BYTE (buf[0]);
1157 else
1158 EMIT_TWO_BYTES (buf[0], buf[1]);
1159 }
1160 else
1161 EMIT_ONE_BYTE (c);
1162 }
1163 else
1164 EMIT_BYTES (src_base, src);
1165 coding->consumed_char++;
1166 }
1167 label_end_of_loop:
1168 coding->consumed = src_base - source;
1169 coding->produced = coding->produced_char = dst - destination;
1170 return;
1171 }
1172
1173 \f
1174 /*** 3. ISO2022 handlers ***/
1175
1176 /* The following note describes the coding system ISO2022 briefly.
1177 Since the intention of this note is to help understand the
1178 functions in this file, some parts are NOT ACCURATE or are OVERLY
1179 SIMPLIFIED. For thorough understanding, please refer to the
1180 original document of ISO2022. This is equivalent to the standard
1181 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1182
1183 ISO2022 provides many mechanisms to encode several character sets
1184 in 7-bit and 8-bit environments. For 7-bit environments, all text
1185 is encoded using bytes less than 128. This may make the encoded
1186 text a little bit longer, but the text passes more easily through
1187 several types of gateway, some of which strip off the MSB (Most
1188 Significant Bit).
1189
1190 There are two kinds of character sets: control character sets and
1191 graphic character sets. The former contain control characters such
1192 as `newline' and `escape' to provide control functions (control
1193 functions are also provided by escape sequences). The latter
1194 contain graphic characters such as 'A' and '-'. Emacs recognizes
1195 two control character sets and many graphic character sets.
1196
1197 Graphic character sets are classified into one of the following
1198 four classes, according to the number of bytes (DIMENSION) and
1199 number of characters in one dimension (CHARS) of the set:
1200 - DIMENSION1_CHARS94
1201 - DIMENSION1_CHARS96
1202 - DIMENSION2_CHARS94
1203 - DIMENSION2_CHARS96
1204
1205 In addition, each character set is assigned an identification tag,
1206 unique for each set, called the "final character" (denoted as <F>
1207 hereafter). The <F> of each character set is decided by ECMA(*)
1208 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1209 (0x30..0x3F are for private use only).
1210
1211 Note (*): ECMA = European Computer Manufacturers Association
1212
1213 Here are examples of graphic character sets [NAME(<F>)]:
1214 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1215 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1216 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1217 o DIMENSION2_CHARS96 -- none for the moment
1218
1219 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1220 C0 [0x00..0x1F] -- control character plane 0
1221 GL [0x20..0x7F] -- graphic character plane 0
1222 C1 [0x80..0x9F] -- control character plane 1
1223 GR [0xA0..0xFF] -- graphic character plane 1
1224
1225 A control character set is directly designated and invoked to C0 or
1226 C1 by an escape sequence. The most common case is that:
1227 - ISO646's control character set is designated/invoked to C0, and
1228 - ISO6429's control character set is designated/invoked to C1,
1229 and usually these designations/invocations are omitted in encoded
1230 text. In a 7-bit environment, only C0 can be used, and a control
1231 character for C1 is encoded by an appropriate escape sequence to
1232 fit into the environment. All control characters for C1 are
1233 defined to have corresponding escape sequences.
1234
1235 A graphic character set is at first designated to one of four
1236 graphic registers (G0 through G3), then these graphic registers are
1237 invoked to GL or GR. These designations and invocations can be
1238 done independently. The most common case is that G0 is invoked to
1239 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1240 these invocations and designations are omitted in encoded text.
1241 In a 7-bit environment, only GL can be used.
1242
1243 When a graphic character set of CHARS94 is invoked to GL, codes
1244 0x20 and 0x7F of the GL area work as control characters SPACE and
1245 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1246 be used.
1247
1248 There are two ways of invocation: locking-shift and single-shift.
1249 With locking-shift, the invocation lasts until the next different
1250 invocation, whereas with single-shift, the invocation affects the
1251 following character only and doesn't affect the locking-shift
1252 state. Invocations are done by the following control characters or
1253 escape sequences:
1254
1255 ----------------------------------------------------------------------
1256 abbrev function cntrl escape seq description
1257 ----------------------------------------------------------------------
1258 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1259 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1260 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1261 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1262 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1263 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1264 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
1265 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1266 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1267 ----------------------------------------------------------------------
1268 (*) These are not used by any known coding system.
1269
1270 Control characters for these functions are defined by macros
1271 ISO_CODE_XXX in `coding.h'.
1272
1273 Designations are done by the following escape sequences:
1274 ----------------------------------------------------------------------
1275 escape sequence description
1276 ----------------------------------------------------------------------
1277 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1278 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1279 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1280 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1281 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1282 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1283 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1284 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1285 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1286 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1287 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1288 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1289 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1290 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1291 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1292 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1293 ----------------------------------------------------------------------
1294
1295 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1296 of dimension 1, chars 94, and final character <F>, etc...
1297
1298 Note (*): Although these designations are not allowed in ISO2022,
1299 Emacs accepts them on decoding, and produces them on encoding
1300 CHARS96 character sets in a coding system which is characterized as
1301 7-bit environment, non-locking-shift, and non-single-shift.
1302
1303 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1304 '(' can be omitted. We refer to this as "short-form" hereafter.
1305
1306 Now you may notice that there are a lot of ways of encoding the
1307 same multilingual text in ISO2022. Actually, there exist many
1308 coding systems such as Compound Text (used in X11's inter-client
1309 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1310 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1311 localized platforms), and all of these are variants of ISO2022.
1312
1313 In addition to the above, Emacs handles two more kinds of escape
1314 sequences: ISO6429's direction specification and Emacs' private
1315 sequence for specifying character composition.
1316
1317 ISO6429's direction specification takes the following form:
1318 o CSI ']' -- end of the current direction
1319 o CSI '0' ']' -- end of the current direction
1320 o CSI '1' ']' -- start of left-to-right text
1321 o CSI '2' ']' -- start of right-to-left text
1322 The control character CSI (0x9B: control sequence introducer) is
1323 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1324
1325 Character composition specification takes the following form:
1326 o ESC '0' -- start relative composition
1327 o ESC '1' -- end composition
1328 o ESC '2' -- start rule-base composition (*)
1329 o ESC '3' -- start relative composition with alternate chars (**)
1330 o ESC '4' -- start rule-base composition with alternate chars (**)
1331 Since these are not standard escape sequences of any ISO standard,
1332 the use of them with these meanings is restricted to Emacs only.
1333
1334 (*) This form is used only in Emacs 20.5 and older versions,
1335 but the newer versions can safely decode it.
1336 (**) This form is used only in Emacs 21.1 and newer versions,
1337 and the older versions can't decode it.
1338
1339 Here's a list of example usages of these composition escape
1340 sequences (categorized by `enum composition_method').
1341
1342 COMPOSITION_RELATIVE:
1343 ESC 0 CHAR [ CHAR ] ESC 1
1344 COMPOSITION_WITH_RULE:
1345 ESC 2 CHAR [ RULE CHAR ] ESC 1
1346 COMPOSITION_WITH_ALTCHARS:
1347 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1348 COMPOSITION_WITH_RULE_ALTCHARS:
1349 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1350
/* Class of each of the 256 possible byte values.  decode_coding_iso2022
   dispatches on iso_code_class[C1] for the first byte of each unit it
   reads.  */
1351 enum iso_code_class_type iso_code_class[256];
1352
/* Non-zero if category IDX has a coding system that can handle
   character C of CHARSET: CHARSET is ASCII or C is one of the coding
   system's safe characters, and the coding system requests some
   designation for CHARSET.  Yields zero when no coding system is
   registered for IDX.

   The expansion assigns to a variable named `safe_chars', which must
   be declared in the expanding function.  CHARSET is parenthesized
   where it is compared, so an expression argument is compared as a
   whole.  */
#define CHARSET_OK(idx, charset, c)					\
  (coding_system_table[idx]						\
   && ((charset) == CHARSET_ASCII					\
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
	   CODING_SAFE_CHAR_P (safe_chars, c)))				\
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx],	\
					      charset)			\
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1361
/* Non-zero if category IDX has a coding system whose initial
   designation for graphic register 1 is set (>= 0).  Yields zero
   when no coding system is registered for IDX, so the table entry is
   never dereferenced while it is a null pointer.  */
#define SHIFT_OUT_OK(idx)						\
  (coding_system_table[idx]						\
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
1364
/* Non-zero if category IDX has a coding system for which composition
   is not disabled.  Yields zero when no coding system is registered
   for IDX, so the table entry is never dereferenced while it is a
   null pointer.  */
#define COMPOSITION_OK(idx)						\
  (coding_system_table[idx]						\
   && coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1367
1368 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if the text from SRC up to SRC_END is encoded in ISO2022.
   If it is, return an integer in which the appropriate flag bits,
   any of:
   CODING_CATEGORY_MASK_ISO_7
   CODING_CATEGORY_MASK_ISO_7_TIGHT
   CODING_CATEGORY_MASK_ISO_8_1
   CODING_CATEGORY_MASK_ISO_8_2
   CODING_CATEGORY_MASK_ISO_7_ELSE
   CODING_CATEGORY_MASK_ISO_8_ELSE
   are set.  The value is MASK & MASK_FOUND: MASK starts as
   CODING_CATEGORY_MASK_ISO and loses the categories that the text
   rules out, while MASK_FOUND collects the categories for which
   evidence was seen.  If a code which should never appear in ISO2022
   is found (a byte in 0x80..0x9F that is neither CSI, SS2, SS3 nor
   marked in Vlatin_extra_code_table), return 0.  MULTIBYTEP is
   passed on to ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
1379
1380 static int
1381 detect_coding_iso2022 (src, src_end, multibytep)
1382 unsigned char *src, *src_end;
1383 int multibytep;
1384 {
1385 int mask = CODING_CATEGORY_MASK_ISO;
1386 int mask_found = 0;
1387 int reg[4], shift_out = 0, single_shifting = 0;
/* NOTE(review): no statement visible in this function assigns
   SHIFT_OUT after this initialization, so the `shift_out == 1' test
   in the ISO_CODE_SI case looks unreachable -- confirm.  */
1388 int c, c1, charset;
1389 /* Dummy for ONE_MORE_BYTE. */
1390 struct coding_system dummy_coding;
1391 struct coding_system *coding = &dummy_coding;
1392 Lisp_Object safe_chars;
1393
/* Only ASCII is designated (to G0) at the start.  */
1394 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1395 while (mask && src < src_end)
1396 {
1397 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1398 retry:
1399 switch (c)
1400 {
1401 case ISO_CODE_ESC:
1402 if (inhibit_iso_escape_detection)
1403 break;
1404 single_shifting = 0;
1405 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1406 if (c >= '(' && c <= '/')
1407 {
1408 /* Designation sequence for a charset of dimension 1. */
1409 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1410 if (c1 < ' ' || c1 >= 0x80
1411 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1412 /* Invalid designation sequence. Just ignore. */
1413 break;
1414 reg[(c - '(') % 4] = charset;
1415 }
1416 else if (c == '$')
1417 {
1418 /* Designation sequence for a charset of dimension 2. */
1419 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1420 if (c >= '@' && c <= 'B')
1421 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1422 reg[0] = charset = iso_charset_table[1][0][c];
1423 else if (c >= '(' && c <= '/')
1424 {
1425 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1426 if (c1 < ' ' || c1 >= 0x80
1427 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1428 /* Invalid designation sequence. Just ignore. */
1429 break;
1430 reg[(c - '(') % 4] = charset;
1431 }
1432 else
1433 /* Invalid designation sequence. Just ignore. */
1434 break;
1435 }
1436 else if (c == 'N' || c == 'O')
1437 {
1438 /* ESC <Fe> for SS2 or SS3. */
1439 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1440 break;
1441 }
1442 else if (c >= '0' && c <= '4')
1443 {
1444 /* ESC <Fp> for start/end composition. */
1445 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1446 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1447 else
1448 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1449 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1450 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1451 else
1452 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1453 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1454 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1455 else
1456 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1457 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1458 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1459 else
1460 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1461 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1462 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463 else
1464 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1465 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1466 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467 else
1468 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1469 break;
1470 }
1471 else
1472 /* Invalid escape sequence. Just ignore. */
1473 break;
1474
1475 /* We found a valid designation sequence for CHARSET. */
1476 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1477 c = MAKE_CHAR (charset, 0, 0);
1478 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1490 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1494 break;
1495
1496 case ISO_CODE_SO:
1497 if (inhibit_iso_escape_detection)
1498 break;
1499 single_shifting = 0;
1500 if (shift_out == 0
1501 && (reg[1] >= 0
1502 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1503 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1504 {
1505 /* Locking shift out. */
1506 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1507 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1508 }
1509 break;
1510
1511 case ISO_CODE_SI:
1512 if (inhibit_iso_escape_detection)
1513 break;
1514 single_shifting = 0;
1515 if (shift_out == 1)
1516 {
1517 /* Locking shift in. */
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1519 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1520 }
1521 break;
1522
1523 case ISO_CODE_CSI:
1524 single_shifting = 0;
/* Fall through: CSI shares the handling below with SS2 and SS3.  */
1525 case ISO_CODE_SS2:
1526 case ISO_CODE_SS3:
1527 {
1528 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1529
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 if (c != ISO_CODE_CSI)
1533 {
/* NOTE(review): coding_system_table entries are dereferenced here
   and below without the null check that CHARSET_OK applies to the
   same table -- confirm they cannot be null at this point.  */
1534 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1535 & CODING_FLAG_ISO_SINGLE_SHIFT)
1536 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1537 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1538 & CODING_FLAG_ISO_SINGLE_SHIFT)
1539 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1540 single_shifting = 1;
1541 }
1542 if (VECTORP (Vlatin_extra_code_table)
1543 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1544 {
1545 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1546 & CODING_FLAG_ISO_LATIN_EXTRA)
1547 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1548 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1549 & CODING_FLAG_ISO_LATIN_EXTRA)
1550 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1551 }
1552 mask &= newmask;
1553 mask_found |= newmask;
1554 }
1555 break;
1556
1557 default:
1558 if (c < 0x80)
1559 {
1560 single_shifting = 0;
1561 break;
1562 }
1563 else if (c < 0xA0)
1564 {
1565 single_shifting = 0;
1566 if (VECTORP (Vlatin_extra_code_table)
1567 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1568 {
1569 int newmask = 0;
1570
1571 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572 & CODING_FLAG_ISO_LATIN_EXTRA)
1573 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577 mask &= newmask;
1578 mask_found |= newmask;
1579 }
1580 else
1581 return 0;
1582 }
1583 else
1584 {
1585 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1586 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1587 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1588 /* Check the length of succeeding codes of the range
1589 0xA0..0xFF. If the byte length is odd, we exclude
1590 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1591 when we are not single shifting. */
1592 if (!single_shifting
1593 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1594 {
/* I counts the bytes of this run, starting with the byte already
   held in C.  */
1595 int i = 1;
1596
1597 c = -1;
1598 while (src < src_end)
1599 {
1600 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1601 if (c < 0xA0)
1602 break;
1603 i++;
1604 }
1605
1606 if (i & 1 && src < src_end)
1607 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1608 else
1609 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1610 if (c >= 0)
1611 /* This means that we have read one extra byte. */
1612 goto retry;
1613 }
1614 }
1615 break;
1616 }
1617 }
/* No goto to this label is visible in the function; presumably it is
   the jump target of ONE_MORE_BYTE_CHECK_MULTIBYTE when the source
   runs out -- confirm.  */
1618 label_end_of_loop:
1619 return (mask & mask_found);
1620 }
1621
/* Yield the character of CHARSET whose first and second position
   codes are C1 and C2.  When the variable `translation_table' of the
   expanding function is nil the value comes straight from MAKE_CHAR;
   otherwise it is whatever translate_char returns for that table and
   character.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)				\
  (! NILP (translation_table)						\
   ? translate_char (translation_table, -1, charset, c1, c2)		\
   : MAKE_CHAR (charset, c1, c2))
1631
1632 /* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.
   FINAL_CHAR must be in the range '0'..127.  The designation is
   accepted when the charset is known (>= 0) and either CODING
   requests it for REG or its first character is one of SAFE_CHARS;
   if CODING->mode has CODING_MODE_DIRECTION set and the charset has a
   reverse charset, the reverse charset is designated instead.  In
   every other case, and for the special ASCII-to-G0 case described
   below, control jumps to label_invalid_code after
   CODING->spec.iso2022.last_invalid_designation_register has been
   updated.  The macro relies on CODING, SAFE_CHARS and
   label_invalid_code being defined in the expanding function.
   NOTE(review): MAKE_CHAR is evaluated before CHARSET is checked for
   being non-negative; presumably harmless -- confirm.  */
1633 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1634 do { \
1635 int charset, c; \
1636 \
1637 if (final_char < '0' || final_char >= 128) \
1638 goto label_invalid_code; \
1639 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1640 make_number (chars), \
1641 make_number (final_char)); \
1642 c = MAKE_CHAR (charset, 0, 0); \
1643 if (charset >= 0 \
1644 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1645 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1646 { \
1647 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1648 && reg == 0 \
1649 && charset == CHARSET_ASCII) \
1650 { \
1651 /* We should insert this designation sequence as is so \
1652 that it is surely written back to a file. */ \
1653 coding->spec.iso2022.last_invalid_designation_register = -1; \
1654 goto label_invalid_code; \
1655 } \
1656 coding->spec.iso2022.last_invalid_designation_register = -1; \
1657 if ((coding->mode & CODING_MODE_DIRECTION) \
1658 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1659 charset = CHARSET_REVERSE_CHARSET (charset); \
1660 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1661 } \
1662 else \
1663 { \
1664 coding->spec.iso2022.last_invalid_designation_register = reg; \
1665 goto label_invalid_code; \
1666 } \
1667 } while (0)
1668
1669 /* Allocate a memory block for storing information about compositions.
1670 The block is chained to the already allocated blocks. */
1671
1672 void
1673 coding_allocate_composition_data (coding, char_offset)
1674 struct coding_system *coding;
1675 int char_offset;
1676 {
1677 struct composition_data *cmp_data
1678 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1679
1680 cmp_data->char_offset = char_offset;
1681 cmp_data->used = 0;
1682 cmp_data->prev = coding->cmp_data;
1683 cmp_data->next = NULL;
1684 if (coding->cmp_data)
1685 coding->cmp_data->next = cmp_data;
1686 coding->cmp_data = cmp_data;
1687 coding->cmp_data_start = 0;
1688 }
1689
1690 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
   C1 is the byte following ESC.  Three cases are distinguished:
   - composition is disabled for CODING: ESC and C1 & 0x7f are stored
     at DST as ordinary characters;
   - no composition is in progress: a new composition is recorded with
     the method selected by C1 (any C1 other than '0', '2' and '3'
     selects COMPOSITION_WITH_RULE_ALTCHARS), or, if CODING->cmp_data
     lacks room, CODING->result is set to
     CODING_FINISH_INSUFFICIENT_CMP and control jumps to
     label_end_of_loop;
   - a composition is already in progress: for the two ALTCHARS
     methods the method is switched to COMPOSITION_RELATIVE.
   The macro relies on CODING, DST and label_end_of_loop being defined
   in the expanding function.
   NOTE(review): the two stores through DST in the first case are not
   preceded by a visible destination-space check -- confirm.  */
1696
1697 #define DECODE_COMPOSITION_START(c1) \
1698 do { \
1699 if (coding->composing == COMPOSITION_DISABLED) \
1700 { \
1701 *dst++ = ISO_CODE_ESC; \
1702 *dst++ = c1 & 0x7f; \
1703 coding->produced_char += 2; \
1704 } \
1705 else if (!COMPOSING_P (coding)) \
1706 { \
1707 /* This is surely the start of a composition. We must be sure \
1708 that coding->cmp_data has enough space to store the \
1709 information about the composition. If not, terminate the \
1710 current decoding loop, allocate one more memory block for \
1711 coding->cmp_data in the caller, then start the decoding \
1712 loop again. We can't allocate memory here directly because \
1713 it may cause buffer/string relocation. */ \
1714 if (!coding->cmp_data \
1715 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1716 >= COMPOSITION_DATA_SIZE)) \
1717 { \
1718 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1719 goto label_end_of_loop; \
1720 } \
1721 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1722 : c1 == '2' ? COMPOSITION_WITH_RULE \
1723 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1724 : COMPOSITION_WITH_RULE_ALTCHARS); \
1725 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1726 coding->composing); \
1727 coding->composition_rule_follows = 0; \
1728 } \
1729 else \
1730 { \
1731 /* We are already handling a composition. If the method is \
1732 the following two, the codes following the current escape \
1733 sequence are actual characters stored in a buffer. */ \
1734 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1735 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1736 { \
1737 coding->composing = COMPOSITION_RELATIVE; \
1738 coding->composition_rule_follows = 0; \
1739 } \
1740 } \
1741 } while (0)
1742
/* Handle composition end sequence ESC 1.  If a composition is in
   progress, close it at the current character count and return CODING
   to the non-composing state.  Otherwise the sequence is not a real
   terminator, so ESC and C1 are stored at DST as two ordinary
   characters.  Relies on CODING and DST of the expanding function.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (COMPOSING_P (coding))						\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = (c1);							\
	coding->produced_char += 2;					\
      }									\
  } while (0)
1759
1760 /* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.  C1 must be a modifiable variable: 32 is
   subtracted from it in place.  A resulting value below 81 is the old
   one-byte format, in which the global and new reference points are
   C1 / 9 and C1 % 9, with 4 mapped to 10.  A value from 81 to 92 is
   the new two-byte format; the second byte is read with ONE_MORE_BYTE
   into the variable `c2' of the expanding function.  Any other value
   stores rule 0.  Afterwards coding->composition_rule_follows is
   cleared.  The macro relies on CODING and C2, plus whatever
   ONE_MORE_BYTE needs, being defined in the expanding function.  */
1763
1764 #define DECODE_COMPOSITION_RULE(c1) \
1765 do { \
1766 int rule = 0; \
1767 (c1) -= 32; \
1768 if (c1 < 81) /* old format (before ver.21) */ \
1769 { \
1770 int gref = (c1) / 9; \
1771 int nref = (c1) % 9; \
1772 if (gref == 4) gref = 10; \
1773 if (nref == 4) nref = 10; \
1774 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1775 } \
1776 else if (c1 < 93) /* new format (after ver.21) */ \
1777 { \
1778 ONE_MORE_BYTE (c2); \
1779 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1780 } \
1781 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1782 coding->composition_rule_follows = 0; \
1783 } while (0)
1784
1785
1786 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1787
1788 static void
1789 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1790 struct coding_system *coding;
1791 unsigned char *source, *destination;
1792 int src_bytes, dst_bytes;
1793 {
1794 unsigned char *src = source;
1795 unsigned char *src_end = source + src_bytes;
1796 unsigned char *dst = destination;
1797 unsigned char *dst_end = destination + dst_bytes;
1798 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1799 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1800 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1801 /* SRC_BASE remembers the start position in source in each loop.
1802 The loop will be exited when there's not enough source code
1803 (within macro ONE_MORE_BYTE), or when there's not enough
1804 destination area to produce a character (within macro
1805 EMIT_CHAR). */
1806 unsigned char *src_base;
1807 int c, charset;
1808 Lisp_Object translation_table;
1809 Lisp_Object safe_chars;
1810
1811 safe_chars = coding_safe_chars (coding->symbol);
1812
1813 if (NILP (Venable_character_translation))
1814 translation_table = Qnil;
1815 else
1816 {
1817 translation_table = coding->translation_table_for_decode;
1818 if (NILP (translation_table))
1819 translation_table = Vstandard_translation_table_for_decode;
1820 }
1821
1822 coding->result = CODING_FINISH_NORMAL;
1823
1824 while (1)
1825 {
1826 int c1, c2;
1827
1828 src_base = src;
1829 ONE_MORE_BYTE (c1);
1830
1831 /* We produce no character or one character. */
1832 switch (iso_code_class [c1])
1833 {
1834 case ISO_0x20_or_0x7F:
1835 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1836 {
1837 DECODE_COMPOSITION_RULE (c1);
1838 continue;
1839 }
1840 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1841 {
1842 /* This is SPACE or DEL. */
1843 charset = CHARSET_ASCII;
1844 break;
1845 }
1846 /* This is a graphic character, we fall down ... */
1847
1848 case ISO_graphic_plane_0:
1849 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1850 {
1851 DECODE_COMPOSITION_RULE (c1);
1852 continue;
1853 }
1854 charset = charset0;
1855 break;
1856
1857 case ISO_0xA0_or_0xFF:
1858 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1859 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1860 goto label_invalid_code;
1861 /* This is a graphic character, we fall down ... */
1862
1863 case ISO_graphic_plane_1:
1864 if (charset1 < 0)
1865 goto label_invalid_code;
1866 charset = charset1;
1867 break;
1868
1869 case ISO_control_0:
1870 if (COMPOSING_P (coding))
1871 DECODE_COMPOSITION_END ('1');
1872
1873 /* All ISO2022 control characters in this class have the
1874 same representation in Emacs internal format. */
1875 if (c1 == '\n'
1876 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1877 && (coding->eol_type == CODING_EOL_CR
1878 || coding->eol_type == CODING_EOL_CRLF))
1879 {
1880 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1881 goto label_end_of_loop;
1882 }
1883 charset = CHARSET_ASCII;
1884 break;
1885
1886 case ISO_control_1:
1887 if (COMPOSING_P (coding))
1888 DECODE_COMPOSITION_END ('1');
1889 goto label_invalid_code;
1890
1891 case ISO_carriage_return:
1892 if (COMPOSING_P (coding))
1893 DECODE_COMPOSITION_END ('1');
1894
1895 if (coding->eol_type == CODING_EOL_CR)
1896 c1 = '\n';
1897 else if (coding->eol_type == CODING_EOL_CRLF)
1898 {
1899 ONE_MORE_BYTE (c1);
1900 if (c1 != ISO_CODE_LF)
1901 {
1902 src--;
1903 c1 = '\r';
1904 }
1905 }
1906 charset = CHARSET_ASCII;
1907 break;
1908
1909 case ISO_shift_out:
1910 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1911 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1912 goto label_invalid_code;
1913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1915 continue;
1916
1917 case ISO_shift_in:
1918 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1919 goto label_invalid_code;
1920 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1921 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1922 continue;
1923
1924 case ISO_single_shift_2_7:
1925 case ISO_single_shift_2:
1926 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1927 goto label_invalid_code;
1928 /* SS2 is handled as an escape sequence of ESC 'N' */
1929 c1 = 'N';
1930 goto label_escape_sequence;
1931
1932 case ISO_single_shift_3:
1933 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1934 goto label_invalid_code;
1935 /* SS3 is handled as an escape sequence of ESC 'O' */
1936 c1 = 'O';
1937 goto label_escape_sequence;
1938
1939 case ISO_control_sequence_introducer:
1940 /* CSI is handled as an escape sequence of ESC '[' ... */
1941 c1 = '[';
1942 goto label_escape_sequence;
1943
1944 case ISO_escape:
1945 ONE_MORE_BYTE (c1);
1946 label_escape_sequence:
1947 /* Escape sequences handled by Emacs are invocation,
1948 designation, direction specification, and character
1949 composition specification. */
1950 switch (c1)
1951 {
1952 case '&': /* revision of following character set */
1953 ONE_MORE_BYTE (c1);
1954 if (!(c1 >= '@' && c1 <= '~'))
1955 goto label_invalid_code;
1956 ONE_MORE_BYTE (c1);
1957 if (c1 != ISO_CODE_ESC)
1958 goto label_invalid_code;
1959 ONE_MORE_BYTE (c1);
1960 goto label_escape_sequence;
1961
1962 case '$': /* designation of 2-byte character set */
1963 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1964 goto label_invalid_code;
1965 ONE_MORE_BYTE (c1);
1966 if (c1 >= '@' && c1 <= 'B')
1967 { /* designation of JISX0208.1978, GB2312.1980,
1968 or JISX0208.1980 */
1969 DECODE_DESIGNATION (0, 2, 94, c1);
1970 }
1971 else if (c1 >= 0x28 && c1 <= 0x2B)
1972 { /* designation of DIMENSION2_CHARS94 character set */
1973 ONE_MORE_BYTE (c2);
1974 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1975 }
1976 else if (c1 >= 0x2C && c1 <= 0x2F)
1977 { /* designation of DIMENSION2_CHARS96 character set */
1978 ONE_MORE_BYTE (c2);
1979 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1980 }
1981 else
1982 goto label_invalid_code;
1983 /* We must update these variables now. */
1984 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1985 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1986 continue;
1987
1988 case 'n': /* invocation of locking-shift-2 */
1989 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1990 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1991 goto label_invalid_code;
1992 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1993 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1994 continue;
1995
1996 case 'o': /* invocation of locking-shift-3 */
1997 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1998 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1999 goto label_invalid_code;
2000 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2001 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2002 continue;
2003
2004 case 'N': /* invocation of single-shift-2 */
2005 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2006 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2007 goto label_invalid_code;
2008 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2009 ONE_MORE_BYTE (c1);
2010 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2011 goto label_invalid_code;
2012 break;
2013
2014 case 'O': /* invocation of single-shift-3 */
2015 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2017 goto label_invalid_code;
2018 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2019 ONE_MORE_BYTE (c1);
2020 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2021 goto label_invalid_code;
2022 break;
2023
2024 case '0': case '2': case '3': case '4': /* start composition */
2025 DECODE_COMPOSITION_START (c1);
2026 continue;
2027
2028 case '1': /* end composition */
2029 DECODE_COMPOSITION_END (c1);
2030 continue;
2031
2032 case '[': /* specification of direction */
2033 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2034 goto label_invalid_code;
2035 /* For the moment, nested direction is not supported.
2036 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2037 left-to-right, and nonzero means right-to-left. */
2038 ONE_MORE_BYTE (c1);
2039 switch (c1)
2040 {
2041 case ']': /* end of the current direction */
2042 coding->mode &= ~CODING_MODE_DIRECTION;
2043
2044 case '0': /* end of the current direction */
2045 case '1': /* start of left-to-right direction */
2046 ONE_MORE_BYTE (c1);
2047 if (c1 == ']')
2048 coding->mode &= ~CODING_MODE_DIRECTION;
2049 else
2050 goto label_invalid_code;
2051 break;
2052
2053 case '2': /* start of right-to-left direction */
2054 ONE_MORE_BYTE (c1);
2055 if (c1 == ']')
2056 coding->mode |= CODING_MODE_DIRECTION;
2057 else
2058 goto label_invalid_code;
2059 break;
2060
2061 default:
2062 goto label_invalid_code;
2063 }
2064 continue;
2065
2066 case '%':
2067 if (COMPOSING_P (coding))
2068 DECODE_COMPOSITION_END ('1');
2069 ONE_MORE_BYTE (c1);
2070 if (c1 == '/')
2071 {
2072 /* CTEXT extended segment:
2073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2074 We keep these bytes as is for the moment.
2075 They may be decoded by post-read-conversion. */
2076 int dim, M, L;
2077 int size, required;
2078 int produced_chars;
2079
2080 ONE_MORE_BYTE (dim);
2081 ONE_MORE_BYTE (M);
2082 ONE_MORE_BYTE (L);
2083 size = ((M - 128) * 128) + (L - 128);
2084 required = 8 + size * 2;
2085 if (dst + required > (dst_bytes ? dst_end : src))
2086 goto label_end_of_loop;
2087 *dst++ = ISO_CODE_ESC;
2088 *dst++ = '%';
2089 *dst++ = '/';
2090 *dst++ = dim;
2091 produced_chars = 4;
2092 dst += CHAR_STRING (M, dst), produced_chars++;
2093 dst += CHAR_STRING (L, dst), produced_chars++;
2094 while (size-- > 0)
2095 {
2096 ONE_MORE_BYTE (c1);
2097 dst += CHAR_STRING (c1, dst), produced_chars++;
2098 }
2099 coding->produced_char += produced_chars;
2100 }
2101 else if (c1 == 'G')
2102 {
2103 unsigned char *d = dst;
2104 int produced_chars;
2105
2106 /* XFree86 extension for embedding UTF-8 in CTEXT:
2107 ESC % G --UTF-8-BYTES-- ESC % @
2108 We keep these bytes as is for the moment.
2109 They may be decoded by post-read-conversion. */
2110 if (d + 6 > (dst_bytes ? dst_end : src))
2111 goto label_end_of_loop;
2112 *d++ = ISO_CODE_ESC;
2113 *d++ = '%';
2114 *d++ = 'G';
2115 produced_chars = 3;
2116 while (d + 1 < (dst_bytes ? dst_end : src))
2117 {
2118 ONE_MORE_BYTE (c1);
2119 if (c1 == ISO_CODE_ESC
2120 && src + 1 < src_end
2121 && src[0] == '%'
2122 && src[1] == '@')
2123 break;
2124 d += CHAR_STRING (c1, d), produced_chars++;
2125 }
2126 if (d + 3 > (dst_bytes ? dst_end : src))
2127 goto label_end_of_loop;
2128 *d++ = ISO_CODE_ESC;
2129 *d++ = '%';
2130 *d++ = '@';
2131 dst = d;
2132 coding->produced_char += produced_chars + 3;
2133 }
2134 else
2135 goto label_invalid_code;
2136 continue;
2137
2138 default:
2139 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2140 goto label_invalid_code;
2141 if (c1 >= 0x28 && c1 <= 0x2B)
2142 { /* designation of DIMENSION1_CHARS94 character set */
2143 ONE_MORE_BYTE (c2);
2144 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2145 }
2146 else if (c1 >= 0x2C && c1 <= 0x2F)
2147 { /* designation of DIMENSION1_CHARS96 character set */
2148 ONE_MORE_BYTE (c2);
2149 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2150 }
2151 else
2152 goto label_invalid_code;
2153 /* We must update these variables now. */
2154 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2155 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2156 continue;
2157 }
2158 }
2159
2160 /* Now we know CHARSET and 1st position code C1 of a character.
2161 Produce a multibyte sequence for that character while getting
2162 2nd position code C2 if necessary. */
2163 if (CHARSET_DIMENSION (charset) == 2)
2164 {
2165 ONE_MORE_BYTE (c2);
2166 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2167 /* C2 is not in a valid range. */
2168 goto label_invalid_code;
2169 }
2170 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2171 EMIT_CHAR (c);
2172 continue;
2173
2174 label_invalid_code:
2175 coding->errors++;
2176 if (COMPOSING_P (coding))
2177 DECODE_COMPOSITION_END ('1');
2178 src = src_base;
2179 c = *src++;
2180 EMIT_CHAR (c);
2181 }
2182
2183 label_end_of_loop:
2184 coding->consumed = coding->consumed_char = src_base - source;
2185 coding->produced = dst - destination;
2186 return;
2187 }
2188
2189
2190 /* ISO2022 encoding stuff. */
2191
2192 /*
2193 It is not enough to say just "ISO2022" on encoding, we have to
2194 specify more details. In Emacs, each ISO2022 coding system
2195 variant has the following specifications:
2196 1. Initial designation to G0 through G3.
2197 2. Allows short-form designation?
2198 3. ASCII should be designated to G0 before control characters?
2199 4. ASCII should be designated to G0 at end of line?
2200 5. 7-bit environment or 8-bit environment?
2201 6. Use locking-shift?
2202 7. Use Single-shift?
2203 And the following two are only for Japanese:
2204 8. Use ASCII in place of JIS0201-1976-Roman?
2205 9. Use JISX0208-1983 in place of JISX0208-1978?
2206 These specifications are encoded in `coding->flags' as flag bits
2207 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2208 details.
2209 */
2210
/* Write at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.  When CODING assigns CHARSET a revision number below 255,
   the sequence is preceded by ESC & <'@' + revision>.  The
   intermediate byte is omitted (short form) only for a DIMENSION2
   94-char set designated to register 0 whose <final-char> is '@',
   'A', or 'B', and only if CODING has CODING_FLAG_ISO_SHORT_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    /* Multi-byte sets are announced with '$'.  */                      \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2253
/* Single-shift functions.  Each macro writes at DST either the
   two-byte escape sequence (ESC N for single-shift-2, ESC O for
   single-shift-3) when CODING has CODING_FLAG_ISO_SEVEN_BITS set, or
   the one-byte control code (ISO_CODE_SS2 / ISO_CODE_SS3) otherwise,
   and then sets the single-shifting state of CODING to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2275
/* Locking-shift functions.  Each macro records in CODING which
   graphic register is now invoked to graphic plane 0 (0 for shift-in,
   1 for shift-out, 2 for locking-shift-2, 3 for locking-shift-3) and
   writes the corresponding code at DST: SI, SO, ESC n, or ESC o.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2303
/* Write at DST the code of a DIMENSION1 character of CHARSET whose
   position-code is C1.  As long as no single shift is pending and
   CHARSET is invoked to neither graphic plane 0 nor plane 1,
   encode_invocation_designation is called first to emit the needed
   designation/invocation codes.  A pending single shift is consumed:
   the byte is written with the high bit cleared under
   CODING_FLAG_ISO_SEVEN_BITS and set otherwise.  Without a single
   shift, plane 0 yields the 7-bit form and plane 1 the 8-bit form.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? c1 & 0x7F : c1 | 0x80);                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      /* Only plane 1 is left once the loop above has finished.  */     \
      *dst++ = c1 | 0x80;                                               \
  } while (0)
2336
/* Write at DST the two-byte code of a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  As long as no single shift is
   pending and CHARSET is invoked to neither graphic plane 0 nor
   plane 1, encode_invocation_designation is called first to emit the
   needed designation/invocation codes.  A pending single shift is
   consumed: both bytes get the high bit cleared under
   CODING_FLAG_ISO_SEVEN_BITS and set otherwise.  Without a single
   shift, plane 0 yields 7-bit bytes and plane 1 8-bit bytes.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Only plane 1 is left once the loop above has finished.  */   \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
  } while (0)
2369
/* Encode character C at DST.  C is split into its charset and
   position-codes.  If that charset is not defined, the position-codes
   are copied out as they are (the second one only when it is not
   negative).  Otherwise the character goes through the DIMENSION1 or
   DIMENSION2 encoder, after two optional substitutions selected by
   CODING's flags: ASCII -> charset_latin_jisx0201 under
   CODING_FLAG_ISO_USE_ROMAN, and charset_jisx0208 ->
   charset_jisx0208_1978 under CODING_FLAG_ISO_USE_OLDJIS.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
2399
2400
/* Stand-in for a character C that must not be encoded: encode
   CODING_REPLACEMENT_CHARACTER once, or twice when the charset of C
   has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_fill_count                                               \
      = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;                   \
                                                                        \
    while (unsafe_fill_count-- > 0)                                     \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2409
2410
2411 /* Produce designation and invocation codes at a place pointed by DST
2412 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2413 Return new DST. */
2414
2415 unsigned char *
2416 encode_invocation_designation (charset, coding, dst)
2417 int charset;
2418 struct coding_system *coding;
2419 unsigned char *dst;
2420 {
2421 int reg; /* graphic register number */
2422
2423 /* At first, check designations. */
2424 for (reg = 0; reg < 4; reg++)
2425 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2426 break;
2427
2428 if (reg >= 4)
2429 {
2430 /* CHARSET is not yet designated to any graphic registers. */
2431 /* At first check the requested designation. */
2432 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2433 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2434 /* Since CHARSET requests no special designation, designate it
2435 to graphic register 0. */
2436 reg = 0;
2437
2438 ENCODE_DESIGNATION (charset, reg, coding);
2439 }
2440
2441 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2442 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2443 {
2444 /* Since the graphic register REG is not invoked to any graphic
2445 planes, invoke it to graphic plane 0. */
2446 switch (reg)
2447 {
2448 case 0: /* graphic register 0 */
2449 ENCODE_SHIFT_IN;
2450 break;
2451
2452 case 1: /* graphic register 1 */
2453 ENCODE_SHIFT_OUT;
2454 break;
2455
2456 case 2: /* graphic register 2 */
2457 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2458 ENCODE_SINGLE_SHIFT_2;
2459 else
2460 ENCODE_LOCKING_SHIFT_2;
2461 break;
2462
2463 case 3: /* graphic register 3 */
2464 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2465 ENCODE_SINGLE_SHIFT_3;
2466 else
2467 ENCODE_LOCKING_SHIFT_3;
2468 break;
2469 }
2470 }
2471
2472 return dst;
2473 }
2474
/* Write at DST the 2-byte code of the encoded composition rule RULE:
   RULE is decoded into GREF and NREF, which are stored as
   32 + 81 + GREF and 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
                                                \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
2484
/* Write at DST the code that starts a composition sequence and
   remember the composition method, taken from DATA[3], in
   CODING->composing.  ESC 0 is written for COMPOSITION_RELATIVE,
   ESC 3 for COMPOSITION_WITH_ALTCHARS, and ESC 4 for any other
   method.  For the latter two, CODING->cmp_data_index is set to
   CODING->cmp_data_start + 4 and CODING->composition_rule_follows is
   cleared.  See the comment in coding.h for the format of DATA.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing == COMPOSITION_RELATIVE)                      \
      *dst++ = '0';                                                     \
    else                                                                \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
  } while (0)
2504
/* Write ESC 1 at DST to end the current composition, mark CODING as
   no longer composing, and advance CODING->cmp_data_start by DATA[0].
   If that reaches the used length of the current composition data
   block and a next block exists, switch to that block and restart at
   its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
2520
/* Write the composition start sequence ESC 0 at DST and switch CODING
   to COMPOSITION_RELATIVE.  This does not open a new composition: it
   is issued after the components (alternate chars and composition
   rules) of the current composition have been produced, to say that
   the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2532
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the 7-bit form ESC '['
   when CODING has CODING_FLAG_ISO_SEVEN_BITS set and the single byte
   ISO_CODE_CSI otherwise.  The flag is tested with `&': comparing the
   whole flag word with `==' would pick the 8-bit form as soon as any
   other flag bit is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write CSI '2' ']' and
   CSI '0' ']' respectively.  They are complete statements of the
   do { ... } while (0) form, to be used as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  do {                                                          \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = '[';                                           \
      }                                                         \
    else                                                        \
      *dst++ = ISO_CODE_CSI;                                    \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2548
/* Bring the graphic planes and registers back to their initial state,
   writing the necessary codes at DST: shift-in if graphic plane 0
   does not have register 0 invoked, then a designation sequence for
   every register whose initial designation is not negative and
   differs from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0      \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)       \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))          \
          ENCODE_DESIGNATION                                            \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),         \
             reg, coding);                                              \
        reg++;                                                          \
      }                                                                 \
  } while (0)
2563
2564 /* Produce designation sequences of charsets in the line started from
2565 SRC to a place pointed by DST, and return updated DST.
2566
2567 If the current block ends before any end-of-line, we may fail to
2568 find all the necessary designations. */
2569
2570 static unsigned char *
2571 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2572 struct coding_system *coding;
2573 Lisp_Object translation_table;
2574 unsigned char *src, *src_end, *dst;
2575 {
2576 int charset, c, found = 0, reg;
2577 /* Table of charsets to be designated to each graphic register. */
2578 int r[4];
2579
2580 for (reg = 0; reg < 4; reg++)
2581 r[reg] = -1;
2582
2583 while (found < 4)
2584 {
2585 ONE_MORE_CHAR (c);
2586 if (c == '\n')
2587 break;
2588
2589 charset = CHAR_CHARSET (c);
2590 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2591 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2592 {
2593 found++;
2594 r[reg] = charset;
2595 }
2596 }
2597
2598 label_end_of_loop:
2599 if (found)
2600 {
2601 for (reg = 0; reg < 4; reg++)
2602 if (r[reg] >= 0
2603 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2604 ENCODE_DESIGNATION (r[reg], reg, coding);
2605 }
2606
2607 return dst;
2608 }
2609
2610 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes of text at SOURCE into DESTINATION
   following the ISO2022 variant described by CODING->flags.  Each
   pass of the main loop emits either one piece of composition
   information or the encoding of one source character, preceded by
   whatever designation/invocation codes that character needs.

   On return CODING->consumed is the number of source bytes belonging
   to completed passes, CODING->consumed_char the number of source
   characters encoded, CODING->produced and CODING->produced_char the
   number of bytes written, and CODING->errors the number of
   single-byte non-ASCII characters that were copied through as is.
   CODING->result is set to CODING_FINISH_INSUFFICIENT_DST when the
   loop stops for lack of output room.

   When DST_BYTES is zero the output limit is computed from SRC rather
   than from DESTINATION + DST_BYTES.
   NOTE(review): presumably DST_BYTES == 0 means SOURCE and DESTINATION
   share one buffer and output must stay behind the read position --
   confirm against the callers.  */
2611
2612 static void
2613 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2614 struct coding_system *coding;
2615 unsigned char *source, *destination;
2616 int src_bytes, dst_bytes;
2617 {
2618 unsigned char *src = source;
2619 unsigned char *src_end = source + src_bytes;
2620 unsigned char *dst = destination;
2621 unsigned char *dst_end = destination + dst_bytes;
2622 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2623 from DST_END to assure overflow checking is necessary only at the
2624 head of loop. */
2625 unsigned char *adjusted_dst_end = dst_end - 19;
2626 /* SRC_BASE remembers the start position in source in each loop.
2627 The loop will be exited when there's not enough source text to
2628 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2629 there's not enough destination area to produce encoded codes
2630 (within macro EMIT_BYTES). */
2631 unsigned char *src_base;
2632 int c;
2633 Lisp_Object translation_table;
2634 Lisp_Object safe_chars;
2635
/* The `safe' flag forces the mode in which unencodable characters are
   replaced instead of being encoded.  */
2636 if (coding->flags & CODING_FLAG_ISO_SAFE)
2637 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2638
2639 safe_chars = coding_safe_chars (coding->symbol);
2640
/* No translation table when character translation is disabled;
   otherwise CODING's own table, falling back to the standard one.  */
2641 if (NILP (Venable_character_translation))
2642 translation_table = Qnil;
2643 else
2644 {
2645 translation_table = coding->translation_table_for_encode;
2646 if (NILP (translation_table))
2647 translation_table = Vstandard_translation_table_for_encode;
2648 }
2649
2650 coding->consumed_char = 0;
2651 coding->errors = 0;
2652 while (1)
2653 {
2654 src_base = src;
2655
/* Check the output room once per pass, using the limit lowered by 19
   bytes (or SRC - 19 when DST_BYTES is zero).  */
2656 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2657 {
2658 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2659 break;
2660 }
2661
2662 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2663 && CODING_SPEC_ISO_BOL (coding))
2664 {
2665 /* We have to produce designation sequences if any now. */
2666 dst = encode_designation_at_bol (coding, translation_table,
2667 src, src_end, dst);
2668 CODING_SPEC_ISO_BOL (coding) = 0;
2669 }
2670
2671 /* Check composition start and end. */
2672 if (coding->composing != COMPOSITION_DISABLED
2673 && coding->cmp_data_start < coding->cmp_data->used)
2674 {
2675 struct composition_data *cmp_data = coding->cmp_data;
2676 int *data = cmp_data->data + coding->cmp_data_start;
/* Character position of this pass, to be compared with DATA[1] (where
   a composition is started) and DATA[2] (where it is ended).  */
2677 int this_pos = cmp_data->char_offset + coding->consumed_char;
2678
2679 if (coding->composing == COMPOSITION_RELATIVE)
2680 {
2681 if (this_pos == data[2])
2682 {
2683 ENCODE_COMPOSITION_END (coding, data);
/* The macro above may have moved CODING to another composition data
   block, so reload the local pointers.  */
2684 cmp_data = coding->cmp_data;
2685 data = cmp_data->data + coding->cmp_data_start;
2686 }
2687 }
2688 else if (COMPOSING_P (coding))
2689 {
2690 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2691 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2692 /* We have consumed components of the composition.
2693 What follows in SRC is the composition's base
2694 text. */
2695 ENCODE_COMPOSITION_FAKE_START (coding);
2696 else
2697 {
/* This C hides the function-level C: it is the next component of the
   composition, taken from the composition data, not from SRC.  */
2698 int c = cmp_data->data[coding->cmp_data_index++];
2699 if (coding->composition_rule_follows)
2700 {
2701 ENCODE_COMPOSITION_RULE (c);
2702 coding->composition_rule_follows = 0;
2703 }
2704 else
2705 {
2706 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2707 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2708 ENCODE_UNSAFE_CHARACTER (c);
2709 else
2710 ENCODE_ISO_CHARACTER (c);
2711 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2712 coding->composition_rule_follows = 1;
2713 }
/* A component was emitted; start the next pass without reading a
   source character.  */
2714 continue;
2715 }
2716 }
2717 if (!COMPOSING_P (coding))
2718 {
2719 if (this_pos == data[1])
2720 {
2721 ENCODE_COMPOSITION_START (coding, data);
2722 continue;
2723 }
2724 }
2725 }
2726
2727 ONE_MORE_CHAR (c);
2728
2729 /* Now encode the character C. */
2730 if (c < 0x20 || c == 0x7F)
2731 {
2732 if (c == '\r')
2733 {
/* Without selective display a CR is written as is, after an optional
   reset of planes and registers.  */
2734 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2735 {
2736 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2737 ENCODE_RESET_PLANE_AND_REGISTER;
2738 *dst++ = c;
/* NOTE(review): this `continue' skips the consumed_char increment at
   the bottom of the loop -- confirm that this is intended.  */
2739 continue;
2740 }
2741 /* fall down to treat '\r' as '\n' ... */
2742 c = '\n';
2743 }
2744 if (c == '\n')
2745 {
2746 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2747 ENCODE_RESET_PLANE_AND_REGISTER;
/* Overwrite the recorded current designations with the initial ones;
   no bytes are written for this.  */
2748 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2749 bcopy (coding->spec.iso2022.initial_designation,
2750 coding->spec.iso2022.current_designation,
2751 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the form selected by CODING->eol_type:
   LF (also when undecided), CR LF, or CR.  */
2752 if (coding->eol_type == CODING_EOL_LF
2753 || coding->eol_type == CODING_EOL_UNDECIDED)
2754 *dst++ = ISO_CODE_LF;
2755 else if (coding->eol_type == CODING_EOL_CRLF)
2756 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2757 else
2758 *dst++ = ISO_CODE_CR;
/* The next pass starts at the beginning of a line.  */
2759 CODING_SPEC_ISO_BOL (coding) = 1;
2760 }
2761 else
2762 {
2763 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2764 ENCODE_RESET_PLANE_AND_REGISTER;
2765 *dst++ = c;
2766 }
2767 }
2768 else if (ASCII_BYTE_P (c))
2769 ENCODE_ISO_CHARACTER (c);
2770 else if (SINGLE_BYTE_CHAR_P (c))
2771 {
/* A single-byte character that is not ASCII is copied unchanged and
   counted as an error.  */
2772 *dst++ = c;
2773 coding->errors++;
2774 }
2775 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2776 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2777 ENCODE_UNSAFE_CHARACTER (c);
2778 else
2779 ENCODE_ISO_CHARACTER (c);
2780
2781 coding->consumed_char++;
2782 }
2783
/* SRC_BASE is the start of the pass that could not be completed, so
   only the bytes before it count as consumed.  */
2784 label_end_of_loop:
2785 coding->consumed = src_base - source;
2786 coding->produced = coding->produced_char = dst - destination;
2787 }
2788
2789 \f
2790 /*** 4. SJIS and BIG5 handlers ***/
2791
2792 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2793 quite widely. So, for the moment, Emacs supports them in the bare
2794 C code. But, in the future, they may be supported only by CCL. */
2795
2796 /* SJIS is a coding system encoding three character sets: ASCII, right
2797 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2798 as is. A character of charset katakana-jisx0201 is encoded by
2799 "position-code + 0x80". A character of charset japanese-jisx0208
2800 is encoded in 2-byte but two position-codes are divided and shifted
2801 so that it fits in the range below.
2802
2803 --- CODE RANGE of SJIS ---
2804 (character set) (range)
2805 ASCII 0x00 .. 0x7F
2806 KATAKANA-JISX0201 0xA1 .. 0xDF
2807 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2808 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2809 -------------------------------
2810
2811 */
2812
2813 /* BIG5 is a coding system encoding two character sets: ASCII and
2814 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2815 character set and is encoded in two bytes.
2816
2817 --- CODE RANGE of BIG5 ---
2818 (character set) (range)
2819 ASCII 0x00 .. 0x7F
2820 Big5 (1st byte) 0xA1 .. 0xFE
2821 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2822 --------------------------
2823
2824 Since the number of characters in Big5 is larger than maximum
2825 characters in Emacs' charset (96x96), it can't be handled as one
2826 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2827 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2828 contains frequently used characters and the latter contains less
2829 frequently used characters. */
2830
2831 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2832 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2833 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2834 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2835
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges 0x40..0x7E and 0xA1..0xFE together.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into an Emacs charset and
   position-codes: store charset_big5_1 (when B1 < 0xC9) or
   charset_big5_2 into CHARSET and the two position-codes, each in the
   range 0x21..0x7E, into C1 and C2.  B1 and B2 are evaluated exactly
   once, before any output is stored, so they may be arbitrary
   expressions and may name the same variables as C1 and C2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1 = (b1), big5_b2 = (b2);                                 \
    /* Linear index of the character within the whole Big5 table.  */   \
    unsigned int big5_temp                                              \
      = ((big5_b1 - 0xA1) * BIG5_SAME_ROW                               \
         + big5_b2 - (big5_b2 < 0x7F ? 0x40 : 0x62));                   \
    if (big5_b1 < 0xC9)                                                 \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                     \
      }                                                                 \
    (c1) = big5_temp / (0xFF - 0xA1) + 0x21;                            \
    (c2) = big5_temp % (0xFF - 0xA1) + 0x21;                            \
  } while (0)

/* Inverse of DECODE_BIG5: convert the position-codes C1, C2 of a
   character of CHARSET (charset_big5_1 or charset_big5_2) into the
   BIG5 byte pair stored into B1 and B2.  The inputs are read before
   any output is stored, so B1 and B2 may name the same variables as
   C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp                                              \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                       \
    (b1) = big5_temp / BIG5_SAME_ROW + 0xA1;                            \
    big5_temp %= BIG5_SAME_ROW;                                         \
    /* Skip the gap between the two 2nd-byte ranges.  */                \
    (b2) = big5_temp + (big5_temp < 0x3F ? 0x40 : 0x62);                \
  } while (0)
2863
2864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2865 Check if a text is encoded in SJIS. If it is, return
2866 CODING_CATEGORY_MASK_SJIS, else return 0. */
2867
2868 static int
2869 detect_coding_sjis (src, src_end, multibytep)
2870 unsigned char *src, *src_end;
2871 int multibytep;
2872 {
2873 int c;
2874 /* Dummy for ONE_MORE_BYTE. */
2875 struct coding_system dummy_coding;
2876 struct coding_system *coding = &dummy_coding;
2877
2878 while (1)
2879 {
2880 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2881 if (c < 0x80)
2882 continue;
2883 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2884 return 0;
2885 if (c <= 0x9F || c >= 0xE0)
2886 {
2887 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2888 if (c < 0x40 || c == 0x7F || c > 0xFC)
2889 return 0;
2890 }
2891 }
2892 label_end_of_loop:
2893 return CODING_CATEGORY_MASK_SJIS;
2894 }
2895
2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2897 Check if a text is encoded in BIG5. If it is, return
2898 CODING_CATEGORY_MASK_BIG5, else return 0. */
2899
2900 static int
2901 detect_coding_big5 (src, src_end, multibytep)
2902 unsigned char *src, *src_end;
2903 int multibytep;
2904 {
2905 int c;
2906 /* Dummy for ONE_MORE_BYTE. */
2907 struct coding_system dummy_coding;
2908 struct coding_system *coding = &dummy_coding;
2909
2910 while (1)
2911 {
2912 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2913 if (c < 0x80)
2914 continue;
2915 if (c < 0xA1 || c > 0xFE)
2916 return 0;
2917 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2918 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2919 return 0;
2920 }
2921 label_end_of_loop:
2922 return CODING_CATEGORY_MASK_BIG5;
2923 }
2924
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926 Check if a text is encoded in UTF-8. If it is, return
2927 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2928
/* Predicates classifying one octet C of a UTF-8 sequence by its
   high-order bits:
     UTF_8_1_OCTET_P          C is below 0x80 (a complete one-octet
                              sequence),
     UTF_8_EXTRA_OCTET_P      C has the form 10xxxxxx (a trailing
                              octet),
     UTF_8_n_OCTET_LEADING_P  C has the form 110xxxxx (n = 2),
                              1110xxxx (3), 11110xxx (4), 111110xx (5)
                              or 1111110x (6), i.e. it leads a sequence
                              of n octets.  */
2929 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2930 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2931 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2932 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2933 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2934 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2935 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2936
2937 static int
2938 detect_coding_utf_8 (src, src_end, multibytep)
2939 unsigned char *src, *src_end;
2940 int multibytep;
2941 {
2942 unsigned char c;
2943 int seq_maybe_bytes;
2944 /* Dummy for ONE_MORE_BYTE. */
2945 struct coding_system dummy_coding;
2946 struct coding_system *coding = &dummy_coding;
2947
2948 while (1)
2949 {
2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951 if (UTF_8_1_OCTET_P (c))
2952 continue;
2953 else if (UTF_8_2_OCTET_LEADING_P (c))
2954 seq_maybe_bytes = 1;
2955 else if (UTF_8_3_OCTET_LEADING_P (c))
2956 seq_maybe_bytes = 2;
2957 else if (UTF_8_4_OCTET_LEADING_P (c))
2958 seq_maybe_bytes = 3;
2959 else if (UTF_8_5_OCTET_LEADING_P (c))
2960 seq_maybe_bytes = 4;
2961 else if (UTF_8_6_OCTET_LEADING_P (c))
2962 seq_maybe_bytes = 5;
2963 else
2964 return 0;
2965
2966 do
2967 {
2968 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2969 if (!UTF_8_EXTRA_OCTET_P (c))
2970 return 0;
2971 seq_maybe_bytes--;
2972 }
2973 while (seq_maybe_bytes > 0);
2974 }
2975
2976 label_end_of_loop:
2977 return CODING_CATEGORY_MASK_UTF_8;
2978 }
2979
2980 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2981 Check if a text is encoded in UTF-16 by looking for a byte order
2982 mark in its first two bytes. If they are 0xFE 0xFF, return
2983 CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
2984 CODING_CATEGORY_MASK_UTF_16_LE; else return 0. */
2985
/* True if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* True if VAL is a high (leading) surrogate, i.e. in 0xD800..0xDBFF.
   The top six bits select the surrogate block, so the mask must be
   0xFC00; masking with 0xD800 would also match low surrogates and
   values such as 0xF800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if VAL is a low (trailing) surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2995
2996 static int
2997 detect_coding_utf_16 (src, src_end, multibytep)
2998 unsigned char *src, *src_end;
2999 int multibytep;
3000 {
3001 unsigned char c1, c2;
3002 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3003 struct coding_system dummy_coding;
3004 struct coding_system *coding = &dummy_coding;
3005
3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3007 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3008
3009 if ((c1 == 0xFF) && (c2 == 0xFE))
3010 return CODING_CATEGORY_MASK_UTF_16_LE;
3011 else if ((c1 == 0xFE) && (c2 == 0xFF))
3012 return CODING_CATEGORY_MASK_UTF_16_BE;
3013
3014 label_end_of_loop:
3015 return 0;
3016 }
3017
3018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3019 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3020
3021 static void
3022 decode_coding_sjis_big5 (coding, source, destination,
3023 src_bytes, dst_bytes, sjis_p)
3024 struct coding_system *coding;
3025 unsigned char *source, *destination;
3026 int src_bytes, dst_bytes;
3027 int sjis_p;
3028 {
3029 unsigned char *src = source;
3030 unsigned char *src_end = source + src_bytes;
3031 unsigned char *dst = destination;
3032 unsigned char *dst_end = destination + dst_bytes;
3033 /* SRC_BASE remembers the start position in source in each loop.
3034 The loop will be exited when there's not enough source code
3035 (within macro ONE_MORE_BYTE), or when there's not enough
3036 destination area to produce a character (within macro
3037 EMIT_CHAR). */
3038 unsigned char *src_base;
3039 Lisp_Object translation_table;
3040
3041 if (NILP (Venable_character_translation))
3042 translation_table = Qnil;
3043 else
3044 {
3045 translation_table = coding->translation_table_for_decode;
3046 if (NILP (translation_table))
3047 translation_table = Vstandard_translation_table_for_decode;
3048 }
3049
3050 coding->produced_char = 0;
3051 while (1)
3052 {
3053 int c, charset, c1, c2;
3054
3055 src_base = src;
3056 ONE_MORE_BYTE (c1);
3057
3058 if (c1 < 0x80)
3059 {
3060 charset = CHARSET_ASCII;
3061 if (c1 < 0x20)
3062 {
3063 if (c1 == '\r')
3064 {
3065 if (coding->eol_type == CODING_EOL_CRLF)
3066 {
3067 ONE_MORE_BYTE (c2);
3068 if (c2 == '\n')
3069 c1 = c2;
3070 else
3071 /* To process C2 again, SRC is subtracted by 1. */
3072 src--;
3073 }
3074 else if (coding->eol_type == CODING_EOL_CR)
3075 c1 = '\n';
3076 }
3077 else if (c1 == '\n'
3078 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3079 && (coding->eol_type == CODING_EOL_CR
3080 || coding->eol_type == CODING_EOL_CRLF))
3081 {
3082 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3083 goto label_end_of_loop;
3084 }
3085 }
3086 }
3087 else
3088 {
3089 if (sjis_p)
3090 {
3091 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3092 goto label_invalid_code;
3093 if (c1 <= 0x9F || c1 >= 0xE0)
3094 {
3095 /* SJIS -> JISX0208 */
3096 ONE_MORE_BYTE (c2);
3097 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3098 goto label_invalid_code;
3099 DECODE_SJIS (c1, c2, c1, c2);
3100 charset = charset_jisx0208;
3101 }
3102 else
3103 /* SJIS -> JISX0201-Kana */
3104 charset = charset_katakana_jisx0201;
3105 }
3106 else
3107 {
3108 /* BIG5 -> Big5 */
3109 if (c1 < 0xA0 || c1 > 0xFE)
3110 goto label_invalid_code;
3111 ONE_MORE_BYTE (c2);
3112 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3113 goto label_invalid_code;
3114 DECODE_BIG5 (c1, c2, charset, c1, c2);
3115 }
3116 }
3117
3118 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3119 EMIT_CHAR (c);
3120 continue;
3121
3122 label_invalid_code:
3123 coding->errors++;
3124 src = src_base;
3125 c = *src++;
3126 EMIT_CHAR (c);
3127 }
3128
3129 label_end_of_loop:
3130 coding->consumed = coding->consumed_char = src_base - source;
3131 coding->produced = dst - destination;
3132 return;
3133 }
3134
3135 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3136 This function can encode charsets `ascii', `katakana-jisx0201',
3137 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3138 are sure that all these charsets are registered as official charset
3139 (i.e. do not have extended leading-codes). Characters of other
3140 charsets are produced without any encoding. If SJIS_P is 1, encode
3141 SJIS text, else encode BIG5 text. */
3142
3143 static void
3144 encode_coding_sjis_big5 (coding, source, destination,
3145 src_bytes, dst_bytes, sjis_p)
3146 struct coding_system *coding;
3147 unsigned char *source, *destination;
3148 int src_bytes, dst_bytes;
3149 int sjis_p;
3150 {
3151 unsigned char *src = source;
3152 unsigned char *src_end = source + src_bytes;
3153 unsigned char *dst = destination;
3154 unsigned char *dst_end = destination + dst_bytes;
3155 /* SRC_BASE remembers the start position in source in each loop.
3156 The loop will be exited when there's not enough source text to
3157 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3158 there's not enough destination area to produce encoded codes
3159 (within macro EMIT_BYTES). */
3160 unsigned char *src_base;
3161 Lisp_Object translation_table;
3162
3163 if (NILP (Venable_character_translation))
3164 translation_table = Qnil;
3165 else
3166 {
3167 translation_table = coding->translation_table_for_encode;
3168 if (NILP (translation_table))
3169 translation_table = Vstandard_translation_table_for_encode;
3170 }
3171
3172 while (1)
3173 {
3174 int c, charset, c1, c2;
3175
3176 src_base = src;
3177 ONE_MORE_CHAR (c);
3178
3179 /* Now encode the character C. */
3180 if (SINGLE_BYTE_CHAR_P (c))
3181 {
3182 switch (c)
3183 {
3184 case '\r':
3185 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3186 {
3187 EMIT_ONE_BYTE (c);
3188 break;
3189 }
3190 c = '\n';
3191 case '\n':
3192 if (coding->eol_type == CODING_EOL_CRLF)
3193 {
3194 EMIT_TWO_BYTES ('\r', c);
3195 break;
3196 }
3197 else if (coding->eol_type == CODING_EOL_CR)
3198 c = '\r';
3199 default:
3200 EMIT_ONE_BYTE (c);
3201 }
3202 }
3203 else
3204 {
3205 SPLIT_CHAR (c, charset, c1, c2);
3206 if (sjis_p)
3207 {
3208 if (charset == charset_jisx0208
3209 || charset == charset_jisx0208_1978)
3210 {
3211 ENCODE_SJIS (c1, c2, c1, c2);
3212 EMIT_TWO_BYTES (c1, c2);
3213 }
3214 else if (charset == charset_katakana_jisx0201)
3215 EMIT_ONE_BYTE (c1 | 0x80);
3216 else if (charset == charset_latin_jisx0201)
3217 EMIT_ONE_BYTE (c1);
3218 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3219 {
3220 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3221 if (CHARSET_WIDTH (charset) > 1)
3222 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3223 }
3224 else
3225 /* There's no way other than producing the internal
3226 codes as is. */
3227 EMIT_BYTES (src_base, src);
3228 }
3229 else
3230 {
3231 if (charset == charset_big5_1 || charset == charset_big5_2)
3232 {
3233 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3234 EMIT_TWO_BYTES (c1, c2);
3235 }
3236 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3237 {
3238 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3239 if (CHARSET_WIDTH (charset) > 1)
3240 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3241 }
3242 else
3243 /* There's no way other than producing the internal
3244 codes as is. */
3245 EMIT_BYTES (src_base, src);
3246 }
3247 }
3248 coding->consumed_char++;
3249 }
3250
3251 label_end_of_loop:
3252 coding->consumed = src_base - source;
3253 coding->produced = coding->produced_char = dst - destination;
3254 }
3255
3256 \f
3257 /*** 5. CCL handlers ***/
3258
3259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3260 Check if a text is encoded in a coding system of which
3261 encoder/decoder are written in CCL program. If it is, return
3262 CODING_CATEGORY_MASK_CCL, else return 0. */
3263
3264 static int
3265 detect_coding_ccl (src, src_end, multibytep)
3266 unsigned char *src, *src_end;
3267 int multibytep;
3268 {
3269 unsigned char *valid;
3270 int c;
3271 /* Dummy for ONE_MORE_BYTE. */
3272 struct coding_system dummy_coding;
3273 struct coding_system *coding = &dummy_coding;
3274
3275 /* No coding system is assigned to coding-category-ccl. */
3276 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3277 return 0;
3278
3279 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3280 while (1)
3281 {
3282 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3283 if (! valid[c])
3284 return 0;
3285 }
3286 label_end_of_loop:
3287 return CODING_CATEGORY_MASK_CCL;
3288 }
3289
3290 \f
3291 /*** 6. End-of-line handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3294
3295 static void
3296 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3297 struct coding_system *coding;
3298 unsigned char *source, *destination;
3299 int src_bytes, dst_bytes;
3300 {
3301 unsigned char *src = source;
3302 unsigned char *dst = destination;
3303 unsigned char *src_end = src + src_bytes;
3304 unsigned char *dst_end = dst + dst_bytes;
3305 Lisp_Object translation_table;
3306 /* SRC_BASE remembers the start position in source in each loop.
3307 The loop will be exited when there's not enough source code
3308 (within macro ONE_MORE_BYTE), or when there's not enough
3309 destination area to produce a character (within macro
3310 EMIT_CHAR). */
3311 unsigned char *src_base;
3312 int c;
3313
3314 translation_table = Qnil;
3315 switch (coding->eol_type)
3316 {
3317 case CODING_EOL_CRLF:
3318 while (1)
3319 {
3320 src_base = src;
3321 ONE_MORE_BYTE (c);
3322 if (c == '\r')
3323 {
3324 ONE_MORE_BYTE (c);
3325 if (c != '\n')
3326 {
3327 src--;
3328 c = '\r';
3329 }
3330 }
3331 else if (c == '\n'
3332 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3333 {
3334 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3335 goto label_end_of_loop;
3336 }
3337 EMIT_CHAR (c);
3338 }
3339 break;
3340
3341 case CODING_EOL_CR:
3342 while (1)
3343 {
3344 src_base = src;
3345 ONE_MORE_BYTE (c);
3346 if (c == '\n')
3347 {
3348 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3349 {
3350 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3351 goto label_end_of_loop;
3352 }
3353 }
3354 else if (c == '\r')
3355 c = '\n';
3356 EMIT_CHAR (c);
3357 }
3358 break;
3359
3360 default: /* no need for EOL handling */
3361 while (1)
3362 {
3363 src_base = src;
3364 ONE_MORE_BYTE (c);
3365 EMIT_CHAR (c);
3366 }
3367 }
3368
3369 label_end_of_loop:
3370 coding->consumed = coding->consumed_char = src_base - source;
3371 coding->produced = dst - destination;
3372 return;
3373 }
3374
3375 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3376 format of end-of-line according to `coding->eol_type'. It also
3377 convert multibyte form 8-bit characters to unibyte if
3378 CODING->src_multibyte is nonzero. If `coding->mode &
3379 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3380 also means end-of-line. */
3381
3382 static void
3383 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3384 struct coding_system *coding;
3385 const unsigned char *source;
3386 unsigned char *destination;
3387 int src_bytes, dst_bytes;
3388 {
3389 const unsigned char *src = source;
3390 unsigned char *dst = destination;
3391 const unsigned char *src_end = src + src_bytes;
3392 unsigned char *dst_end = dst + dst_bytes;
3393 Lisp_Object translation_table;
3394 /* SRC_BASE remembers the start position in source in each loop.
3395 The loop will be exited when there's not enough source text to
3396 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3397 there's not enough destination area to produce encoded codes
3398 (within macro EMIT_BYTES). */
3399 const unsigned char *src_base;
3400 unsigned char *tmp;
3401 int c;
3402 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3403
3404 translation_table = Qnil;
3405 if (coding->src_multibyte
3406 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3407 {
3408 src_end--;
3409 src_bytes--;
3410 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3411 }
3412
3413 if (coding->eol_type == CODING_EOL_CRLF)
3414 {
3415 while (src < src_end)
3416 {
3417 src_base = src;
3418 c = *src++;
3419 if (c >= 0x20)
3420 EMIT_ONE_BYTE (c);
3421 else if (c == '\n' || (c == '\r' && selective_display))
3422 EMIT_TWO_BYTES ('\r', '\n');
3423 else
3424 EMIT_ONE_BYTE (c);
3425 }
3426 src_base = src;
3427 label_end_of_loop:
3428 ;
3429 }
3430 else
3431 {
3432 if (!dst_bytes || src_bytes <= dst_bytes)
3433 {
3434 safe_bcopy (src, dst, src_bytes);
3435 src_base = src_end;
3436 dst += src_bytes;
3437 }
3438 else
3439 {
3440 if (coding->src_multibyte
3441 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3442 dst_bytes--;
3443 safe_bcopy (src, dst, dst_bytes);
3444 src_base = src + dst_bytes;
3445 dst = destination + dst_bytes;
3446 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3447 }
3448 if (coding->eol_type == CODING_EOL_CR)
3449 {
3450 for (tmp = destination; tmp < dst; tmp++)
3451 if (*tmp == '\n') *tmp = '\r';
3452 }
3453 else if (selective_display)
3454 {
3455 for (tmp = destination; tmp < dst; tmp++)
3456 if (*tmp == '\r') *tmp = '\n';
3457 }
3458 }
3459 if (coding->src_multibyte)
3460 dst = destination + str_as_unibyte (destination, dst - destination);
3461
3462 coding->consumed = src_base - source;
3463 coding->produced = dst - destination;
3464 coding->produced_char = coding->produced;
3465 }
3466
3467 \f
3468 /*** 7. C library functions ***/
3469
3470 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3471 has a property `coding-system'. The value of this property is a
3472 vector of length 5 (called the coding-vector). Among elements of
3473 this vector, the first (element[0]) and the fifth (element[4])
3474 carry important information for decoding/encoding. Before
3475 decoding/encoding, this information should be set in fields of a
3476 structure of type `coding_system'.
3477
3478 The value of the property `coding-system' can be a symbol of another
3479 subsidiary coding-system. In that case, Emacs gets coding-vector
3480 from that symbol.
3481
3482 `element[0]' contains information to be set in `coding->type'. The
3483 value and its meaning is as follows:
3484
3485 0 -- coding_type_emacs_mule
3486 1 -- coding_type_sjis
3487 2 -- coding_type_iso2022
3488 3 -- coding_type_big5
3489 4 -- coding_type_ccl encoder/decoder written in CCL
5 -- coding_type_raw_text
3490 nil -- coding_type_no_conversion
3491 t -- coding_type_undecided (automatic conversion on decoding,
3492 no-conversion on encoding)
3493
3494 `element[4]' contains information to be set in `coding->flags' and
3495 `coding->spec'. The meaning varies by `coding->type'.
3496
3497 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3498 of length 32 (of which the first 17 sub-elements are used now).
3499 Meanings of these sub-elements are:
3500
3501 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3502 If the value is an integer of valid charset, the charset is
3503 assumed to be designated to graphic register N initially.
3504
3505 If the value is minus, it is a minus value of charset which
3506 reserves graphic register N, which means that the charset is
3507 not designated initially but should be designated to graphic
3508 register N just before encoding a character in that charset.
3509
3510 If the value is nil, graphic register N is never used on
3511 encoding.
3512
3513 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3514 Each value takes t or nil. See the section ISO2022 of
3515 `coding.h' for more information.
3516
3517 If `coding->type' is `coding_type_big5', element[4] is t to denote
3518 BIG5-ETen or nil to denote BIG5-HKU.
3519
3520 If `coding->type' takes the other value, element[4] is ignored.
3521
3522 Emacs Lisp's coding systems also carry information about format of
3523 end-of-line in a value of property `eol-type'. If the value is
3524 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3525 means CODING_EOL_CR. If it is not integer, it should be a vector
3526 of subsidiary coding systems of which property `eol-type' has one
3527 of the above values.
3528
3529 */
3530
3531 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3532 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3533 is setup so that no conversion is necessary and return -1, else
3534 return 0.

CODING is zero-cleared first, so nothing set by an earlier call
survives. The argument is named CODING_SYSTEM in the definition
below. */
3535
3536 int
3537 setup_coding_system (coding_system, coding)
3538 Lisp_Object coding_system;
3539 struct coding_system *coding;
3540 {
3541 Lisp_Object coding_spec, coding_type, eol_type, plist;
3542 Lisp_Object val;
3543
3544 /* At first, zero clear all members. */
3545 bzero (coding, sizeof (struct coding_system));
3546
3547 /* Initialize some fields required for all kinds of coding systems. */
3548 coding->symbol = coding_system;
3549 coding->heading_ascii = -1;
3550 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3551 coding->composing = COMPOSITION_DISABLED;
3552 coding->cmp_data = NULL;
3553
/* A nil coding system is treated as invalid. */
3554 if (NILP (coding_system))
3555 goto label_invalid_coding_system;
3556
3557 coding_spec = Fget (coding_system, Qcoding_system);
3558
/* The `coding-system' property must be a vector of 5 elements whose
element 3 is a cons; element 3 is read as a property list below. */
3559 if (!VECTORP (coding_spec)
3560 || XVECTOR (coding_spec)->size != 5
3561 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3562 goto label_invalid_coding_system;
3563
/* Decide the end-of-line format. When inhibit_eol_conversion is
nonzero the `eol-type' property is not consulted and EOL_TYPE is nil,
which ends up as CODING_EOL_LF. A vector means the format is still
to be detected.
NOTE(review): XFASTINT is applied even when EOL_TYPE is not an
integer (e.g. nil) -- presumably harmless, confirm. */
3564 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3565 if (VECTORP (eol_type))
3566 {
3567 coding->eol_type = CODING_EOL_UNDECIDED;
3568 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3569 }
3570 else if (XFASTINT (eol_type) == 1)
3571 {
3572 coding->eol_type = CODING_EOL_CRLF;
3573 coding->common_flags
3574 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3575 }
3576 else if (XFASTINT (eol_type) == 2)
3577 {
3578 coding->eol_type = CODING_EOL_CR;
3579 coding->common_flags
3580 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3581 }
3582 else
3583 coding->eol_type = CODING_EOL_LF;
3584
3585 coding_type = XVECTOR (coding_spec)->contents[0];
3586 /* Try short cut. */
3587 if (SYMBOLP (coding_type))
3588 {
/* A symbol in element 0 needs no per-type setup: t selects automatic
detection and any other symbol means no conversion. The function
returns inside this block, so the property list in element 3 is not
examined on this path. */
3589 if (EQ (coding_type, Qt))
3590 {
3591 coding->type = coding_type_undecided;
3592 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3593 }
3594 else
3595 coding->type = coding_type_no_conversion;
3596 /* Initialize this member. Any thing other than
3597 CODING_CATEGORY_IDX_UTF_16_BE and
3598 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3599 special treatment in detect_eol. */
3600 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3601
3602 return 0;
3603 }
3604
3605 /* Get values of coding system properties:
3606 `post-read-conversion', `pre-write-conversion',
3607 `translation-table-for-decode', `translation-table-for-encode'. */
3608 plist = XVECTOR (coding_spec)->contents[3];
3609 /* Pre & post conversion functions should be disabled if
3610 inhibit_pre_post_conversion is nonzero. This is the case that a code
3611 conversion function is called while those functions are running. */
3612 if (! inhibit_pre_post_conversion)
3613 {
3614 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3615 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3616 }
/* A translation table property may be given as a symbol, in which
case the table is looked up as a property of that symbol. Anything
that is not a char-table in the end is replaced by nil. */
3617 val = Fplist_get (plist, Qtranslation_table_for_decode);
3618 if (SYMBOLP (val))
3619 val = Fget (val, Qtranslation_table_for_decode);
3620 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3621 val = Fplist_get (plist, Qtranslation_table_for_encode);
3622 if (SYMBOLP (val))
3623 val = Fget (val, Qtranslation_table_for_encode);
3624 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
/* The coding category is mandatory on this path: the property must
be non-nil and the Qcoding_category_index property of its value
must be an integer, otherwise the coding system is rejected. */
3625 val = Fplist_get (plist, Qcoding_category);
3626 if (!NILP (val))
3627 {
3628 val = Fget (val, Qcoding_category_index);
3629 if (INTEGERP (val))
3630 coding->category_idx = XINT (val);
3631 else
3632 goto label_invalid_coding_system;
3633 }
3634 else
3635 goto label_invalid_coding_system;
3636
3637 /* If the coding system has non-nil `composition' property, enable
3638 composition handling. */
3639 val = Fplist_get (plist, Qcomposition);
3640 if (!NILP (val))
3641 coding->composing = COMPOSITION_NO;
3642
/* Element 0 is not a symbol here; dispatch on its integer value to
fill in the type-specific members. Any value other than 0 through 5
makes the coding system invalid. */
3643 switch (XFASTINT (coding_type))
3644 {
3645 case 0:
3646 coding->type = coding_type_emacs_mule;
3647 coding->common_flags
3648 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
/* NOTE(review): both masks were already set by the statement above,
so the two conditional assignments below add no new bits. */
3649 if (!NILP (coding->post_read_conversion))
3650 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3651 if (!NILP (coding->pre_write_conversion))
3652 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3653 break;
3654
3655 case 1:
3656 coding->type = coding_type_sjis;
3657 coding->common_flags
3658 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3659 break;
3660
3661 case 2:
3662 coding->type = coding_type_iso2022;
3663 coding->common_flags
3664 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3665 {
3666 Lisp_Object val, temp;
3667 Lisp_Object *flags;
3668 int i, charset, reg_bits = 0;
3669
3670 val = XVECTOR (coding_spec)->contents[4];
3671
/* Element 4 must be a vector of exactly 32 elements. */
3672 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3673 goto label_invalid_coding_system;
3674
/* Sub-elements 4 through 16 are treated as booleans; each non-nil
one turns on the corresponding CODING_FLAG_ISO_* bit. */
3675 flags = XVECTOR (val)->contents;
3676 coding->flags
3677 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3678 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3679 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3680 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3681 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3682 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3683 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3684 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3685 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3686 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3687 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3688 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3689 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3690 );
3691
3692 /* Invoke graphic register 0 to plane 0. */
3693 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3694 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3695 CODING_SPEC_ISO_INVOCATION (coding, 1)
3696 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3697 /* Not single shifting at first. */
3698 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3699 /* Beginning of buffer should also be regarded as bol. */
3700 CODING_SPEC_ISO_BOL (coding) = 1;
3701
/* Every charset first gets revision number 255. An entry of
Vcharset_revision_alist then overrides it when its car names a
known charset and its cdr is an integer I with 0 <= I and
I + '@' < 128. */
3702 for (charset = 0; charset <= MAX_CHARSET; charset++)
3703 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3704 val = Vcharset_revision_alist;
3705 while (CONSP (val))
3706 {
3707 charset = get_charset_id (Fcar_safe (XCAR (val)));
3708 if (charset >= 0
3709 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3710 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3711 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3712 val = XCDR (val);
3713 }
3714
3715 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3716 FLAGS[REG] can be one of below:
3717 integer CHARSET: CHARSET occupies register I,
3718 t: designate nothing to REG initially, but can be used
3719 by any charsets,
3720 list of integer, nil, or t: designate the first
3721 element (if integer) to REG initially, the remaining
3722 elements (if integer) is designated to REG on request,
3723 if an element is t, REG can be used by any charsets,
3724 nil: REG is never used. */
3725 for (charset = 0; charset <= MAX_CHARSET; charset++)
3726 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3727 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3728 for (i = 0; i < 4; i++)
3729 {
3730 if ((INTEGERP (flags[i])
3731 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3732 || (charset = get_charset_id (flags[i])) >= 0)
3733 {
3734 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3735 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3736 }
3737 else if (EQ (flags[i], Qt))
3738 {
3739 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
/* Bit I of REG_BITS records that register I may be used by any
charset. */
3740 reg_bits |= 1 << i;
3741 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3742 }
3743 else if (CONSP (flags[i]))
3744 {
3745 Lisp_Object tail;
3746 tail = flags[i];
3747
3748 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
/* The first list element decides the initial designation ... */
3749 if ((INTEGERP (XCAR (tail))
3750 && (charset = XINT (XCAR (tail)),
3751 CHARSET_VALID_P (charset)))
3752 || (charset = get_charset_id (XCAR (tail))) >= 0)
3753 {
3754 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3755 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3756 }
3757 else
3758 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
/* ... and the remaining ones only request register I. */
3759 tail = XCDR (tail);
3760 while (CONSP (tail))
3761 {
3762 if ((INTEGERP (XCAR (tail))
3763 && (charset = XINT (XCAR (tail)),
3764 CHARSET_VALID_P (charset)))
3765 || (charset = get_charset_id (XCAR (tail))) >= 0)
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = i;
3768 else if (EQ (XCAR (tail), Qt))
3769 reg_bits |= 1 << i;
3770 tail = XCDR (tail);
3771 }
3772 }
3773 else
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3775
3776 CODING_SPEC_ISO_DESIGNATION (coding, i)
3777 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3778 }
3779
3780 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3781 {
3782 /* REG 1 can be used only by locking shift in 7-bit env. */
3783 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3784 reg_bits &= ~2;
3785 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3786 /* Without any shifting, only REG 0 and 1 can be used. */
3787 reg_bits &= 3;
3788 }
3789
/* Give every defined charset that has no requested register yet a
default one, chosen from the registers still set in REG_BITS. */
3790 if (reg_bits)
3791 for (charset = 0; charset <= MAX_CHARSET; charset++)
3792 {
3793 if (CHARSET_DEFINED_P (charset)
3794 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3795 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3796 {
3797 /* There exist some default graphic registers to be
3798 used by CHARSET. */
3799
3800 /* We had better avoid designating a charset of
3801 CHARS96 to REG 0 as far as possible. */
3802 if (CHARSET_CHARS (charset) == 96)
3803 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3804 = (reg_bits & 2
3805 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3806 else
3807 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3808 = (reg_bits & 1
3809 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3810 }
3811 }
3812 }
3813 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3814 coding->spec.iso2022.last_invalid_designation_register = -1;
3815 break;
3816
3817 case 3:
3818 coding->type = coding_type_big5;
3819 coding->common_flags
3820 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
/* Element 4 selects the BIG5 variant: nil means HKU, anything else
ETen. */
3821 coding->flags
3822 = (NILP (XVECTOR (coding_spec)->contents[4])
3823 ? CODING_FLAG_BIG5_HKU
3824 : CODING_FLAG_BIG5_ETEN);
3825 break;
3826
3827 case 4:
3828 coding->type = coding_type_ccl;
3829 coding->common_flags
3830 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3831 {
/* Element 4 must be a cons; its car and cdr are handed to
setup_ccl_program for the decoder and the encoder respectively. A
negative return value from either call makes the coding system
invalid. */
3832 val = XVECTOR (coding_spec)->contents[4];
3833 if (! CONSP (val)
3834 || setup_ccl_program (&(coding->spec.ccl.decoder),
3835 XCAR (val)) < 0
3836 || setup_ccl_program (&(coding->spec.ccl.encoder),
3837 XCDR (val)) < 0)
3838 goto label_invalid_coding_system;
3839
/* Build the 256-entry table of valid bytes from the Qvalid_codes
property. Each list element is either an integer in 0..255 or a
cons (START . END) of integers with 0 <= START <= END < 256; other
elements are ignored. */
3840 bzero (coding->spec.ccl.valid_codes, 256);
3841 val = Fplist_get (plist, Qvalid_codes);
3842 if (CONSP (val))
3843 {
3844 Lisp_Object this;
3845
3846 for (; CONSP (val); val = XCDR (val))
3847 {
3848 this = XCAR (val);
3849 if (INTEGERP (this)
3850 && XINT (this) >= 0 && XINT (this) < 256)
3851 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3852 else if (CONSP (this)
3853 && INTEGERP (XCAR (this))
3854 && INTEGERP (XCDR (this)))
3855 {
3856 int start = XINT (XCAR (this));
3857 int end = XINT (XCDR (this));
3858
3859 if (start >= 0 && start <= end && end < 256)
3860 while (start <= end)
3861 coding->spec.ccl.valid_codes[start++] = 1;
3862 }
3863 }
3864 }
3865 }
3866 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3867 coding->spec.ccl.cr_carryover = 0;
3868 coding->spec.ccl.eight_bit_carryover[0] = 0;
3869 break;
3870
3871 case 5:
3872 coding->type = coding_type_raw_text;
3873 break;
3874
3875 default:
3876 goto label_invalid_coding_system;
3877 }
3878 return 0;
3879
/* Fall back to a setup that performs no conversion at all and tell
the caller about the failure. */
3880 label_invalid_coding_system:
3881 coding->type = coding_type_no_conversion;
3882 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3883 coding->common_flags = 0;
3884 coding->eol_type = CODING_EOL_LF;
3885 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3886 return -1;
3887 }
3888
3889 /* Free memory blocks allocated for storing composition information. */
3890
3891 void
3892 coding_free_composition_data (coding)
3893 struct coding_system *coding;
3894 {
3895 struct composition_data *cmp_data = coding->cmp_data, *next;
3896
3897 if (!cmp_data)
3898 return;
3899 /* Memory blocks are chained. At first, rewind to the first, then,
3900 free blocks one by one. */
3901 while (cmp_data->prev)
3902 cmp_data = cmp_data->prev;
3903 while (cmp_data)
3904 {
3905 next = cmp_data->next;
3906 xfree (cmp_data);
3907 cmp_data = next;
3908 }
3909 coding->cmp_data = NULL;
3910 }
3911
3912 /* Set `char_offset' member of all memory blocks pointed by
3913 coding->cmp_data to POS. */
3914
3915 void
3916 coding_adjust_composition_offset (coding, pos)
3917 struct coding_system *coding;
3918 int pos;
3919 {
3920 struct composition_data *cmp_data;
3921
3922 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3923 cmp_data->char_offset = pos;
3924 }
3925
3926 /* Setup raw-text or one of its subsidiaries in the structure
3927 coding_system CODING according to the already setup value eol_type
3928 in CODING. CODING should be setup for some coding system in
3929 advance. */
3930
3931 void
3932 setup_raw_text_coding_system (coding)
3933 struct coding_system *coding;
3934 {
3935 if (coding->type != coding_type_raw_text)
3936 {
3937 coding->symbol = Qraw_text;
3938 coding->type = coding_type_raw_text;
3939 if (coding->eol_type != CODING_EOL_UNDECIDED)
3940 {
3941 Lisp_Object subsidiaries;
3942 subsidiaries = Fget (Qraw_text, Qeol_type);
3943
3944 if (VECTORP (subsidiaries)
3945 && XVECTOR (subsidiaries)->size == 3)
3946 coding->symbol
3947 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3948 }
3949 setup_coding_system (coding->symbol, coding);
3950 }
3951 return;
3952 }
3953
3954 /* Emacs has a mechanism to automatically detect a coding system if it
3955 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3956 it's impossible to distinguish some coding systems accurately
3957 because they use the same range of codes. So, at first, coding
3958 systems are grouped into the categories listed below:
3959
3960 o coding-category-emacs-mule
3961
3962 The category for a coding system which has the same code range
3963 as Emacs' internal format. Assigned the coding-system (Lisp
3964 symbol) `emacs-mule' by default.
3965
3966 o coding-category-sjis
3967
3968 The category for a coding system which has the same code range
3969 as SJIS. Assigned the coding-system (Lisp
3970 symbol) `japanese-shift-jis' by default.
3971
3972 o coding-category-iso-7
3973
3974 The category for a coding system which has the same code range
3975 as ISO2022 of 7-bit environment. This doesn't use any locking
3976 shift and single shift functions. This can encode/decode all
3977 charsets. Assigned the coding-system (Lisp symbol)
3978 `iso-2022-7bit' by default.
3979
3980 o coding-category-iso-7-tight
3981
3982 Same as coding-category-iso-7 except that this can
3983 encode/decode only the specified charsets.
3984
3985 o coding-category-iso-8-1
3986
3987 The category for a coding system which has the same code range
3988 as ISO2022 of 8-bit environment and graphic plane 1 used only
3989 for DIMENSION1 charset. This doesn't use any locking shift
3990 and single shift functions. Assigned the coding-system (Lisp
3991 symbol) `iso-latin-1' by default.
3992
3993 o coding-category-iso-8-2
3994
3995 The category for a coding system which has the same code range
3996 as ISO2022 of 8-bit environment and graphic plane 1 used only
3997 for DIMENSION2 charset. This doesn't use any locking shift
3998 and single shift functions. Assigned the coding-system (Lisp
3999 symbol) `japanese-iso-8bit' by default.
4000
4001 o coding-category-iso-7-else
4002
4003 The category for a coding system which has the same code range
4004 as ISO2022 of 7-bit environment but uses locking shift or
4005 single shift functions. Assigned the coding-system (Lisp
4006 symbol) `iso-2022-7bit-lock' by default.
4007
4008 o coding-category-iso-8-else
4009
4010 The category for a coding system which has the same code range
4011 as ISO2022 of 8-bit environment but uses locking shift or
4012 single shift functions. Assigned the coding-system (Lisp
4013 symbol) `iso-2022-8bit-ss2' by default.
4014
4015 o coding-category-big5
4016
4017 The category for a coding system which has the same code range
4018 as BIG5. Assigned the coding-system (Lisp symbol)
4019 `cn-big5' by default.
4020
4021 o coding-category-utf-8
4022
4023 The category for a coding system which has the same code range
4024 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4025 symbol) `utf-8' by default.
4026
4027 o coding-category-utf-16-be
4028
4029 The category for a coding system in which a text has a
4030 Unicode signature (cf. Unicode Standard) in big-endian byte
4031 order at the head. Assigned the coding-system (Lisp symbol)
4032 `utf-16-be' by default.
4033
4034 o coding-category-utf-16-le
4035
4036 The category for a coding system in which a text has a
4037 Unicode signature (cf. Unicode Standard) in little-endian
4038 byte order at the head. Assigned the coding-system (Lisp
4039 symbol) `utf-16-le' by default.
4040
4041 o coding-category-ccl
4042
4043 The category for a coding system of which encoder/decoder is
4044 written in CCL programs. The default value is nil, i.e., no
4045 coding system is assigned.
4046
4047 o coding-category-binary
4048
4049 The category for a coding system not categorized in any of the
4050 above. Assigned the coding-system (Lisp symbol)
4051 `no-conversion' by default.
4052
4053 Each of them is a Lisp symbol and the value is an actual
4054 `coding-system' (this is also a Lisp symbol) assigned by a user.
4055 What Emacs does actually is to detect a category of coding system.
4056 Then, it uses a `coding-system' assigned to it. If Emacs can't
4057 decide a single possible category, it selects a category of the
4058 highest priority. Priorities of categories are also specified by a
4059 user in a Lisp variable `coding-category-list'.
4060
4061 */
4062
/* Table indexed by byte value.  detect_coding_mask skips leading bytes
   whose entry here is nonzero.  On entry it clears the entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC, and it sets them back to
   1 when such a byte turns out not to start a valid ISO2022 sequence.
   The remaining entries are presumably filled in elsewhere in this
   file -- not visible from here.  */
4063 static
4064 int ascii_skip_code[256];
4065
4066 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4067 If it detects possible coding systems, return an integer in which
4068 appropriate flag bits are set. Flag bits are defined by macros
4069 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4070 it should point the table `coding_priorities'. In that case, only
4071 the flag bit for a coding system of the highest priority is set in
4072 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4073 range 0x80..0x9F are in multibyte form.
4074
4075 How many ASCII characters are at the head is returned as *SKIP. */
4076
/* Additional notes:
   - The return value is 0 when every byte of the text is one that
     ascii_skip_code marks as skippable.
   - When PRIORITIES is NULL, the result always has
     CODING_CATEGORY_MASK_RAW_TEXT and CODING_CATEGORY_MASK_BINARY set
     in addition to the bits reported by the individual detectors.
   - When PRIORITIES is non-NULL and no entry matches,
     CODING_CATEGORY_MASK_RAW_TEXT is returned.
   - The detectors are handed the text starting at the first byte that
     was not skipped, not at SOURCE.  */
4077 static int
4078 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4079 unsigned char *source;
4080 int src_bytes, *priorities, *skip;
4081 int multibytep;
4082 {
4083 register unsigned char c;
4084 unsigned char *src = source, *src_end = source + src_bytes;
4085 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4086 int i;
4087
4088 /* At first, skip all ASCII characters and control characters except
4089 for three ISO2022 specific control characters. */
4090 ascii_skip_code[ISO_CODE_SO] = 0;
4091 ascii_skip_code[ISO_CODE_SI] = 0;
4092 ascii_skip_code[ISO_CODE_ESC] = 0;
4093
/* The scan is restarted from this label each time an ESC, SO or SI is
   found not to introduce valid ISO2022 text; *SKIP is recomputed on
   every pass.  */
4094 label_loop_detect_coding:
4095 while (src < src_end && ascii_skip_code[*src]) src++;
4096 *skip = src - source;
4097
4098 if (src >= src_end)
4099 /* We found nothing other than ASCII. There's nothing to do. */
4100 return 0;
4101
4102 c = *src;
4103 /* The text seems to be encoded in some multilingual coding system.
4104 Now, try to find in which coding system the text is encoded. */
4105 if (c < 0x80)
4106 {
4107 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4108 /* C is an ISO2022 specific control code of C0. */
4109 mask = detect_coding_iso2022 (src, src_end, multibytep);
4110 if (mask == 0)
4111 {
4112 /* No valid ISO2022 code follows C. Try again. */
/* Marking the code as skippable makes later occurrences of it pass
   through the scanning loop above without another ISO2022 check.  */
4113 src++;
4114 if (c == ISO_CODE_ESC)
4115 ascii_skip_code[ISO_CODE_ESC] = 1;
4116 else
4117 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4118 goto label_loop_detect_coding;
4119 }
/* With a priority table, report only the first listed category whose
   bit the ISO2022 detector set.  */
4120 if (priorities)
4121 {
4122 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4123 {
4124 if (mask & priorities[i])
4125 return priorities[i];
4126 }
4127 return CODING_CATEGORY_MASK_RAW_TEXT;
4128 }
4129 }
4130 else
4131 {
/* TRY collects the categories that are plausible given the first
   8-bit byte; only detectors for those categories are run below.  */
4132 int try;
4133
/* NOTE(review): src[1] is read here without checking that
   src + 1 < src_end.  Presumably multibyte text never ends right after
   LEADING_CODE_8_BIT_CONTROL -- confirm against the callers.  */
4134 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4135 c = src[1] - 0x20;
4136
4137 if (c < 0xA0)
4138 {
4139 /* C is the first byte of SJIS character code,
4140 or a leading-code of Emacs' internal format (emacs-mule),
4141 or the first byte of UTF-16. */
4142 try = (CODING_CATEGORY_MASK_SJIS
4143 | CODING_CATEGORY_MASK_EMACS_MULE
4144 | CODING_CATEGORY_MASK_UTF_16_BE
4145 | CODING_CATEGORY_MASK_UTF_16_LE);
4146
4147 /* Or, if C is a special latin extra code,
4148 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4149 or is an ISO2022 control-sequence-introducer (CSI),
4150 we should also consider the possibility of ISO2022 codings. */
/* NOTE(review): SRC has not been advanced past the byte C was taken
   from, so the `*src' and `src[1]' comparisons in the CSI clause below
   look at that byte and its successor rather than at the bytes
   following the CSI.  As written, `*src == ']'' cannot hold while
   C == ISO_CODE_CSI in the unibyte case -- confirm the intended
   offsets.  */
4151 if ((VECTORP (Vlatin_extra_code_table)
4152 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4153 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4154 || (c == ISO_CODE_CSI
4155 && (src < src_end
4156 && (*src == ']'
4157 || ((*src == '0' || *src == '1' || *src == '2')
4158 && src + 1 < src_end
4159 && src[1] == ']')))))
4160 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4161 | CODING_CATEGORY_MASK_ISO_8BIT);
4162 }
4163 else
4164 /* C is a character of ISO2022 in graphic plane right,
4165 or a SJIS's 1-byte character code (i.e. JISX0201),
4166 or the first byte of BIG5's 2-byte code,
4167 or the first byte of UTF-8/16. */
4168 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4169 | CODING_CATEGORY_MASK_ISO_8BIT
4170 | CODING_CATEGORY_MASK_SJIS
4171 | CODING_CATEGORY_MASK_BIG5
4172 | CODING_CATEGORY_MASK_UTF_8
4173 | CODING_CATEGORY_MASK_UTF_16_BE
4174 | CODING_CATEGORY_MASK_UTF_16_LE);
4175
4176 /* Or, we may have to consider the possibility of CCL. */
4177 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4178 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4179 ->spec.ccl.valid_codes)[c])
4180 try |= CODING_CATEGORY_MASK_CCL;
4181
4182 mask = 0;
4183 utf16_examined_p = iso2022_examined_p = 0;
4184 if (priorities)
4185 {
/* Walk the priority table in order.  Each entry triggers at most one
   detector (the branches are mutually exclusive), and the ISO2022 and
   UTF-16 detectors run at most once each thanks to the *_examined_p
   flags.  The first entry whose bit is present in the accumulated MASK
   is returned, so detectors for lower-priority categories are not run
   once a higher-priority match exists.  */
4186 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4187 {
4188 if (!iso2022_examined_p
4189 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4190 {
4191 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4192 iso2022_examined_p = 1;
4193 }
4194 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4195 mask |= detect_coding_sjis (src, src_end, multibytep);
4196 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4197 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4198 else if (!utf16_examined_p
4199 && (priorities[i] & try &
4200 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4201 {
4202 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4203 utf16_examined_p = 1;
4204 }
4205 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4206 mask |= detect_coding_big5 (src, src_end, multibytep);
4207 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4208 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4209 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4210 mask |= detect_coding_ccl (src, src_end, multibytep);
/* Raw-text and binary need no detector: reaching their priority entry
   is enough to accept them.  */
4211 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4212 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4213 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4214 mask |= CODING_CATEGORY_MASK_BINARY;
4215 if (mask & priorities[i])
4216 return priorities[i];
4217 }
4218 return CODING_CATEGORY_MASK_RAW_TEXT;
4219 }
/* No priority table: run every detector that TRY allows and merge
   their results.  */
4220 if (try & CODING_CATEGORY_MASK_ISO)
4221 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4222 if (try & CODING_CATEGORY_MASK_SJIS)
4223 mask |= detect_coding_sjis (src, src_end, multibytep);
4224 if (try & CODING_CATEGORY_MASK_BIG5)
4225 mask |= detect_coding_big5 (src, src_end, multibytep);
4226 if (try & CODING_CATEGORY_MASK_UTF_8)
4227 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4228 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4229 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4230 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4231 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4232 if (try & CODING_CATEGORY_MASK_CCL)
4233 mask |= detect_coding_ccl (src, src_end, multibytep);
4234 }
/* Reached only when PRIORITIES is NULL; raw-text and binary are always
   reported as possible in that case.  */
4235 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4236 }
4237
4238 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4239 The information of the detected coding system is set in CODING. */
4240
4241 void
4242 detect_coding (coding, src, src_bytes)
4243 struct coding_system *coding;
4244 const unsigned char *src;
4245 int src_bytes;
4246 {
4247 unsigned int idx;
4248 int skip, mask;
4249 Lisp_Object val;
4250
4251 val = Vcoding_category_list;
4252 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4253 coding->src_multibyte);
4254 coding->heading_ascii = skip;
4255
4256 if (!mask) return;
4257
4258 /* We found a single coding system of the highest priority in MASK. */
4259 idx = 0;
4260 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4261 if (! mask)
4262 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4263
4264 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4265
4266 if (coding->eol_type != CODING_EOL_UNDECIDED)
4267 {
4268 Lisp_Object tmp;
4269
4270 tmp = Fget (val, Qeol_type);
4271 if (VECTORP (tmp))
4272 val = XVECTOR (tmp)->contents[coding->eol_type];
4273 }
4274
4275 /* Setup this new coding system while preserving some slots. */
4276 {
4277 int src_multibyte = coding->src_multibyte;
4278 int dst_multibyte = coding->dst_multibyte;
4279
4280 setup_coding_system (val, coding);
4281 coding->src_multibyte = src_multibyte;
4282 coding->dst_multibyte = dst_multibyte;
4283 coding->heading_ascii = skip;
4284 }
4285 }
4286
4287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4288 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4289 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4290
4291 How many non-eol characters are at the head is returned as *SKIP. */
4292
4293 #define MAX_EOL_CHECK_COUNT 3
4294
4295 static int
4296 detect_eol_type (source, src_bytes, skip)
4297 unsigned char *source;
4298 int src_bytes, *skip;
4299 {
4300 unsigned char *src = source, *src_end = src + src_bytes;
4301 unsigned char c;
4302 int total = 0; /* How many end-of-lines are found so far. */
4303 int eol_type = CODING_EOL_UNDECIDED;
4304 int this_eol_type;
4305
4306 *skip = 0;
4307
4308 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4309 {
4310 c = *src++;
4311 if (c == '\n' || c == '\r')
4312 {
4313 if (*skip == 0)
4314 *skip = src - 1 - source;
4315 total++;
4316 if (c == '\n')
4317 this_eol_type = CODING_EOL_LF;
4318 else if (src >= src_end || *src != '\n')
4319 this_eol_type = CODING_EOL_CR;
4320 else
4321 this_eol_type = CODING_EOL_CRLF, src++;
4322
4323 if (eol_type == CODING_EOL_UNDECIDED)
4324 /* This is the first end-of-line. */
4325 eol_type = this_eol_type;
4326 else if (eol_type != this_eol_type)
4327 {
4328 /* The found type is different from what found before. */
4329 eol_type = CODING_EOL_INCONSISTENT;
4330 break;
4331 }
4332 }
4333 }
4334
4335 if (*skip == 0)
4336 *skip = src_end - source;
4337 return eol_type;
4338 }
4339
4340 /* Like detect_eol_type, but detect EOL type in 2-octet
4341 big-endian/little-endian format for coding systems utf-16-be and
4342 utf-16-le. */
4343
4344 static int
4345 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4346 unsigned char *source;
4347 int src_bytes, *skip, big_endian_p;
4348 {
4349 unsigned char *src = source, *src_end = src + src_bytes;
4350 unsigned int c1, c2;
4351 int total = 0; /* How many end-of-lines are found so far. */
4352 int eol_type = CODING_EOL_UNDECIDED;
4353 int this_eol_type;
4354 int msb, lsb;
4355
4356 if (big_endian_p)
4357 msb = 0, lsb = 1;
4358 else
4359 msb = 1, lsb = 0;
4360
4361 *skip = 0;
4362
4363 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4364 {
4365 c1 = (src[msb] << 8) | (src[lsb]);
4366 src += 2;
4367
4368 if (c1 == '\n' || c1 == '\r')
4369 {
4370 if (*skip == 0)
4371 *skip = src - 2 - source;
4372 total++;
4373 if (c1 == '\n')
4374 {
4375 this_eol_type = CODING_EOL_LF;
4376 }
4377 else
4378 {
4379 if ((src + 1) >= src_end)
4380 {
4381 this_eol_type = CODING_EOL_CR;
4382 }
4383 else
4384 {
4385 c2 = (src[msb] << 8) | (src[lsb]);
4386 if (c2 == '\n')
4387 this_eol_type = CODING_EOL_CRLF, src += 2;
4388 else
4389 this_eol_type = CODING_EOL_CR;
4390 }
4391 }
4392
4393 if (eol_type == CODING_EOL_UNDECIDED)
4394 /* This is the first end-of-line. */
4395 eol_type = this_eol_type;
4396 else if (eol_type != this_eol_type)
4397 {
4398 /* The found type is different from what found before. */
4399 eol_type = CODING_EOL_INCONSISTENT;
4400 break;
4401 }
4402 }
4403 }
4404
4405 if (*skip == 0)
4406 *skip = src_end - source;
4407 return eol_type;
4408 }
4409
4410 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4411 is encoded. If it detects an appropriate format of end-of-line, it
4412 sets the information in *CODING. */
4413
4414 void
4415 detect_eol (coding, src, src_bytes)
4416 struct coding_system *coding;
4417 const unsigned char *src;
4418 int src_bytes;
4419 {
4420 Lisp_Object val;
4421 int skip;
4422 int eol_type;
4423
4424 switch (coding->category_idx)
4425 {
4426 case CODING_CATEGORY_IDX_UTF_16_BE:
4427 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4428 break;
4429 case CODING_CATEGORY_IDX_UTF_16_LE:
4430 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4431 break;
4432 default:
4433 eol_type = detect_eol_type (src, src_bytes, &skip);
4434 break;
4435 }
4436
4437 if (coding->heading_ascii > skip)
4438 coding->heading_ascii = skip;
4439 else
4440 skip = coding->heading_ascii;
4441
4442 if (eol_type == CODING_EOL_UNDECIDED)
4443 return;
4444 if (eol_type == CODING_EOL_INCONSISTENT)
4445 {
4446 #if 0
4447 /* This code is suppressed until we find a better way to
4448 distinguish raw text file and binary file. */
4449
4450 /* If we have already detected that the coding is raw-text, the
4451 coding should actually be no-conversion. */
4452 if (coding->type == coding_type_raw_text)
4453 {
4454 setup_coding_system (Qno_conversion, coding);
4455 return;
4456 }
4457 /* Else, let's decode only text code anyway. */
4458 #endif /* 0 */
4459 eol_type = CODING_EOL_LF;
4460 }
4461
4462 val = Fget (coding->symbol, Qeol_type);
4463 if (VECTORP (val) && XVECTOR (val)->size == 3)
4464 {
4465 int src_multibyte = coding->src_multibyte;
4466 int dst_multibyte = coding->dst_multibyte;
4467 struct composition_data *cmp_data = coding->cmp_data;
4468
4469 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4470 coding->src_multibyte = src_multibyte;
4471 coding->dst_multibyte = dst_multibyte;
4472 coding->heading_ascii = skip;
4473 coding->cmp_data = cmp_data;
4474 }
4475 }
4476
/* Slack added to every conversion buffer size estimate, in bytes.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding with CODING may enlarge the text: 3 for
   ISO2022, the decoder's own buf_magnification for CCL, else 2.
   CODING is a pointer to struct coding_system; it is parenthesized so
   that any pointer expression (e.g. `&cs') may be passed.  Note that
   CODING may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
4485
4486 /* Return maximum size (bytes) of a buffer enough for decoding
4487 SRC_BYTES of text encoded in CODING. */
4488
4489 int
4490 decoding_buffer_size (coding, src_bytes)
4491 struct coding_system *coding;
4492 int src_bytes;
4493 {
4494 return (src_bytes * DECODING_BUFFER_MAG (coding)
4495 + CONVERSION_BUFFER_EXTRA_ROOM);
4496 }
4497
4498 /* Return maximum size (bytes) of a buffer enough for encoding
4499 SRC_BYTES of text to CODING. */
4500
4501 int
4502 encoding_buffer_size (coding, src_bytes)
4503 struct coding_system *coding;
4504 int src_bytes;
4505 {
4506 int magnification;
4507
4508 if (coding->type == coding_type_ccl)
4509 {
4510 magnification = coding->spec.ccl.encoder.buf_magnification;
4511 if (coding->eol_type == CODING_EOL_CRLF)
4512 magnification *= 2;
4513 }
4514 else if (CODING_REQUIRE_ENCODING (coding))
4515 magnification = 3;
4516 else
4517 magnification = 1;
4518
4519 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4520 }
4521
4522 /* Working buffer for code conversion. It is filled in by
allocate_conversion_buffer, grown by extend_conversion_buffer and
released by free_conversion_buffer. */
4523 struct conversion_buffer
4524 {
4525 int size; /* Number of bytes allocated at DATA. */
4526 int on_stack; /* 1 if DATA was allocated by alloca, 0 if it is heap memory from xmalloc/xrealloc. */
4527 unsigned char *data; /* The storage itself. */
4528 };
4529
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that it can
   be used safely inside any expression.  */
#define MAX_ALLOCA (16 * 1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, in which case
   the memory belongs to the calling function's frame and BUF.on_stack
   is 1; larger ones come from xmalloc and BUF.on_stack is 0.

   BUF must be an lvalue and LEN an expression free of side effects:
   both are evaluated more than once.  The arguments are parenthesized
   so that expressions such as `*p' or `a + b' expand correctly.  */
#define allocate_conversion_buffer(buf, len)		\
  do {							\
    if ((len) < MAX_ALLOCA)				\
      {							\
	(buf).data = (unsigned char *) alloca ((len));	\
	(buf).on_stack = 1;				\
      }							\
    else						\
      {							\
	(buf).data = (unsigned char *) xmalloc ((len));	\
	(buf).on_stack = 0;				\
      }							\
    (buf).size = (len);					\
  } while (0)
4549
4550 /* Double the allocated memory for *BUF. */
4551 static void
4552 extend_conversion_buffer (buf)
4553 struct conversion_buffer *buf;
4554 {
4555 if (buf->on_stack)
4556 {
4557 unsigned char *save = buf->data;
4558 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4559 bcopy (save, buf->data, buf->size);
4560 buf->on_stack = 0;
4561 }
4562 else
4563 {
4564 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4565 }
4566 buf->size *= 2;
4567 }
4568
4569 /* Free the allocated memory for BUF if it is not on stack. */
4570 static void
4571 free_conversion_buffer (buf)
4572 struct conversion_buffer *buf;
4573 {
4574 if (!buf->on_stack)
4575 xfree (buf->data);
4576 }
4577
/* Run a CCL program of CODING on the SRC_BYTES bytes at SOURCE and
   store the result at DESTINATION: the encoder program if ENCODEP is
   nonzero, otherwise the decoder program.

   CODING->produced, CODING->produced_char and CODING->result are set
   here; the address of CODING->consumed is handed to ccl_driver.  The
   return value is CODING->result, one of CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST and
   CODING_FINISH_INTERRUPT.

   On decoding, up to three trailing 8-bit bytes may be held back in
   CODING->spec.ccl.eight_bit_carryover and emitted at the head of
   DESTINATION by the next call.  */
4578 int
4579 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4580 struct coding_system *coding;
4581 unsigned char *source, *destination;
4582 int src_bytes, dst_bytes, encodep;
4583 {
4584 struct ccl_program *ccl
4585 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4586 unsigned char *dst = destination;
4587
4588 ccl->suppress_error = coding->suppress_error;
4589 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4590 if (encodep)
4591 {
4592 /* On encoding, EOL format is converted within ccl_driver. For
4593 that, setup proper information in the structure CCL. */
4594 ccl->eol_type = coding->eol_type;
4595 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4596 ccl->eol_type = CODING_EOL_LF;
4597 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4598 ccl->eight_bit_control = coding->dst_multibyte;
4599 }
4600 else
4601 ccl->eight_bit_control = 1;
4602 ccl->multibyte = coding->src_multibyte;
4603 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4604 {
4605 /* Move carryover bytes to DESTINATION. */
/* NOTE(review): the bytes are copied before DST_BYTES is consulted;
   presumably callers always leave room for them -- confirm.  */
4606 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4607 while (*p)
4608 *dst++ = *p++;
4609 coding->spec.ccl.eight_bit_carryover[0] = 0;
/* A DST_BYTES of 0 is left alone; it appears to be a special value
   (see the computation of BYTES below).  */
4610 if (dst_bytes)
4611 dst_bytes -= dst - destination;
4612 }
4613
/* DST - DESTINATION is the number of carryover bytes emitted above;
   they count as produced bytes too.  */
4614 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4615 &(coding->consumed))
4616 + dst - destination);
4617
4618 if (encodep)
4619 {
4620 coding->produced_char = coding->produced;
4621 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4622 }
4623 else if (!ccl->eight_bit_control)
4624 {
4625 /* The produced bytes forms a valid multibyte sequence. */
4626 coding->produced_char
4627 = multibyte_chars_in_text (destination, coding->produced);
4628 coding->spec.ccl.eight_bit_carryover[0] = 0;
4629 }
4630 else
4631 {
4632 /* On decoding, the destination should always multibyte. But,
4633 CCL program might have been generated an invalid multibyte
4634 sequence. Here we make such a sequence valid as
4635 multibyte. */
/* BYTES is the room handed to str_as_multibyte.  When DST_BYTES is 0
   it is measured from DESTINATION to the end of the consumed source,
   which suggests that SOURCE and DESTINATION share one buffer in that
   case -- TODO confirm with the callers.  */
4636 int bytes
4637 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4638
4639 if ((coding->consumed < src_bytes
4640 || !ccl->last_block)
4641 && coding->produced >= 1
4642 && destination[coding->produced - 1] >= 0x80)
4643 {
4644 /* We should not convert the tailing 8-bit codes to
4645 multibyte form even if they doesn't form a valid
4646 multibyte sequence. They may form a valid sequence in
4647 the next call. */
/* Hold back a tail that starts with a byte in 0x80..0x9F and is
   followed by zero, one or two bytes >= 0xA0.  */
4648 int carryover = 0;
4649
4650 if (destination[coding->produced - 1] < 0xA0)
4651 carryover = 1;
4652 else if (coding->produced >= 2)
4653 {
4654 if (destination[coding->produced - 2] >= 0x80)
4655 {
4656 if (destination[coding->produced - 2] < 0xA0)
4657 carryover = 2;
4658 else if (coding->produced >= 3
4659 && destination[coding->produced - 3] >= 0x80
4660 && destination[coding->produced - 3] < 0xA0)
4661 carryover = 3;
4662 }
4663 }
4664 if (carryover > 0)
4665 {
/* Save the tail, terminated by a zero byte, and drop it from the
   produced count.  */
4666 BCOPY_SHORT (destination + coding->produced - carryover,
4667 coding->spec.ccl.eight_bit_carryover,
4668 carryover);
4669 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4670 coding->produced -= carryover;
4671 }
4672 }
4673 coding->produced = str_as_multibyte (destination, bytes,
4674 coding->produced,
4675 &(coding->produced_char));
4676 }
4677
/* Translate the CCL interpreter status into a coding result code.  */
4678 switch (ccl->status)
4679 {
4680 case CCL_STAT_SUSPEND_BY_SRC:
4681 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4682 break;
4683 case CCL_STAT_SUSPEND_BY_DST:
4684 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4685 break;
4686 case CCL_STAT_QUIT:
4687 case CCL_STAT_INVALID_CMD:
4688 coding->result = CODING_FINISH_INTERRUPT;
4689 break;
4690 default:
4691 coding->result = CODING_FINISH_NORMAL;
4692 break;
4693 }
4694 return coding->result;
4695 }
4696
4697 /* Decode EOL format of the text at PTR of BYTES length destructively
4698 according to CODING->eol_type. This is called after the CCL
4699 program produced a decoded text at PTR. If we do CRLF->LF
4700 conversion, update CODING->produced and CODING->produced_char. */
4701
/* Further effects visible in the body:
   - If CODING->eol_type is undecided, it is detected from the text
     first and CODING->symbol is switched to the matching subsidiary.
   - CODING->spec.ccl.cr_carryover is set to 1 when a trailing CR is
     held back for the next call, and to 0 otherwise.
   - When an inconsistent EOL is met while
     CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, the conversion done
     so far is undone, CODING->eol_type becomes CODING_EOL_LF and
     CODING->symbol is restored to its value on entry.  */
4702 static void
4703 decode_eol_post_ccl (coding, ptr, bytes)
4704 struct coding_system *coding;
4705 unsigned char *ptr;
4706 int bytes;
4707 {
4708 Lisp_Object val, saved_coding_symbol;
4709 unsigned char *pend = ptr + bytes;
4710 int dummy;
4711
4712 /* Remember the current coding system symbol. We set it back when
4713 an inconsistent EOL is found so that `last-coding-system-used' is
4714 set to the coding system that doesn't specify EOL conversion. */
4715 saved_coding_symbol = coding->symbol;
4716
4717 coding->spec.ccl.cr_carryover = 0;
4718 if (coding->eol_type == CODING_EOL_UNDECIDED)
4719 {
4720 /* Here, to avoid the call of setup_coding_system, we directly
4721 call detect_eol_type. */
/* DUMMY receives the skip count, which is not needed here.  */
4722 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4723 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4724 coding->eol_type = CODING_EOL_LF;
4725 if (coding->eol_type != CODING_EOL_UNDECIDED)
4726 {
4727 val = Fget (coding->symbol, Qeol_type);
4728 if (VECTORP (val) && XVECTOR (val)->size == 3)
4729 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4730 }
4731 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4732 }
4733
4734 if (coding->eol_type == CODING_EOL_LF
4735 || coding->eol_type == CODING_EOL_UNDECIDED)
4736 {
4737 /* We have nothing to do. */
4738 ptr = pend;
4739 }
4740 else if (coding->eol_type == CODING_EOL_CRLF)
4741 {
/* PTR reads the text and P writes the converted text into the same
   memory; P never gets ahead of PTR.  */
4742 unsigned char *pstart = ptr, *p = ptr;
4743
/* NOTE(review): `*(pend - 1)' reads the byte before PTR when BYTES is
   0.  decode_coding passes CODING->produced as BYTES; presumably that
   is never 0 on this path -- confirm.  */
4744 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4745 && *(pend - 1) == '\r')
4746 {
4747 /* If the last character is CR, we can't handle it here
4748 because LF will be in the not-yet-decoded source text.
4749 Record that the CR is not yet processed. */
4750 coding->spec.ccl.cr_carryover = 1;
4751 coding->produced--;
4752 coding->produced_char--;
4753 pend--;
4754 }
4755 while (ptr < pend)
4756 {
4757 if (*ptr == '\r')
4758 {
4759 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4760 {
4761 *p++ = '\n';
4762 ptr += 2;
4763 }
4764 else
4765 {
/* A CR that is not followed by LF.  */
4766 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4767 goto undo_eol_conversion;
4768 *p++ = *ptr++;
4769 }
4770 }
/* An LF that was not preceded by CR.  */
4771 else if (*ptr == '\n'
4772 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4773 goto undo_eol_conversion;
4774 else
4775 *p++ = *ptr++;
4776 continue;
4777
/* Reached only through the gotos above.  */
4778 undo_eol_conversion:
4779 /* We have faced with inconsistent EOL format at PTR.
4780 Convert all LFs before PTR back to CRLFs. */
/* The text is rebuilt from the end toward the start, so bytes that
   have not been copied yet are not overwritten.  */
4781 for (p--, ptr--; p >= pstart; p--)
4782 {
4783 if (*p == '\n')
4784 *ptr-- = '\n', *ptr-- = '\r';
4785 else
4786 *ptr-- = *p;
4787 }
4788 /* If carryover is recorded, cancel it because we don't
4789 convert CRLF anymore. */
4790 if (coding->spec.ccl.cr_carryover)
4791 {
4792 coding->spec.ccl.cr_carryover = 0;
4793 coding->produced++;
4794 coding->produced_char++;
4795 pend++;
4796 }
/* Setting P and PTR to PEND ends the loop and makes the length
   adjustment below a no-op.  */
4797 p = ptr = pend;
4798 coding->eol_type = CODING_EOL_LF;
4799 coding->symbol = saved_coding_symbol;
4800 }
4801 if (p < pend)
4802 {
4803 /* As each two-byte sequence CRLF was converted to LF, (PEND
4804 - P) is the number of deleted characters. */
4805 coding->produced -= pend - p;
4806 coding->produced_char -= pend - p;
4807 }
4808 }
4809 else /* i.e. coding->eol_type == CODING_EOL_CR */
4810 {
/* P stays at the start of the text so that the CR->LF replacements
   can be reverted if an LF shows up.  */
4811 unsigned char *p = ptr;
4812
4813 for (; ptr < pend; ptr++)
4814 {
4815 if (*ptr == '\r')
4816 *ptr = '\n';
4817 else if (*ptr == '\n'
4818 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4819 {
/* Inconsistent EOL: every LF before PTR came from a CR, so turn them
   all back and stop converting.  */
4820 for (; p < ptr; p++)
4821 {
4822 if (*p == '\n')
4823 *p = '\r';
4824 }
4825 ptr = pend;
4826 coding->eol_type = CODING_EOL_LF;
4827 coding->symbol = saved_coding_symbol;
4828 }
4829 }
4830 }
4831 }
4832
4833 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4834 decoding, it may detect coding system and format of end-of-line if
4835 those are not yet decided. The source should be unibyte, the
4836 result is multibyte if CODING->dst_multibyte is nonzero, else
4837 unibyte. */
4838
/* Decodes SRC_BYTES bytes at SOURCE into DESTINATION and returns
   CODING->result.  The produced/consumed counters, `errors' and
   `result' of CODING are reset before the type-specific decoder is
   called.  */
4839 int
4840 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4841 struct coding_system *coding;
4842 const unsigned char *source;
4843 unsigned char *destination;
4844 int src_bytes, dst_bytes;
4845 {
/* EXTRA is 1 when a CR carried over from the previous call has been
   put at the head of DESTINATION, else 0.  */
4846 int extra = 0;
4847
4848 if (coding->type == coding_type_undecided)
4849 detect_coding (coding, source, src_bytes);
4850
/* For CCL the EOL format is left undecided here; decode_eol_post_ccl
   detects it from the decoded text instead.  */
4851 if (coding->eol_type == CODING_EOL_UNDECIDED
4852 && coding->type != coding_type_ccl)
4853 {
4854 detect_eol (coding, source, src_bytes);
4855 /* We had better recover the original eol format if we
4856 encounter an inconsistent eol format while decoding. */
4857 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4858 }
4859
4860 coding->produced = coding->produced_char = 0;
4861 coding->consumed = coding->consumed_char = 0;
4862 coding->errors = 0;
4863 coding->result = CODING_FINISH_NORMAL;
4864
4865 switch (coding->type)
4866 {
4867 case coding_type_sjis:
4868 decode_coding_sjis_big5 (coding, source, destination,
4869 src_bytes, dst_bytes, 1);
4870 break;
4871
4872 case coding_type_iso2022:
4873 decode_coding_iso2022 (coding, source, destination,
4874 src_bytes, dst_bytes);
4875 break;
4876
4877 case coding_type_big5:
4878 decode_coding_sjis_big5 (coding, source, destination,
4879 src_bytes, dst_bytes, 0);
4880 break;
4881
4882 case coding_type_emacs_mule:
4883 decode_coding_emacs_mule (coding, source, destination,
4884 src_bytes, dst_bytes);
4885 break;
4886
4887 case coding_type_ccl:
4888 if (coding->spec.ccl.cr_carryover)
4889 {
4890 /* Put the CR which was not processed by the previous call
4891 of decode_eol_post_ccl in DESTINATION. It will be
4892 decoded together with the following LF by the call to
4893 decode_eol_post_ccl below. */
/* NOTE(review): ccl_coding_driver assigns CODING->produced and
   CODING->produced_char afresh, so the two increments here are
   overwritten; the CR is accounted for by the `extra' additions after
   the driver call.  */
4894 *destination = '\r';
4895 coding->produced++;
4896 coding->produced_char++;
4897 dst_bytes--;
4898 extra = coding->spec.ccl.cr_carryover;
4899 }
4900 ccl_coding_driver (coding, source, destination + extra,
4901 src_bytes, dst_bytes, 0);
4902 if (coding->eol_type != CODING_EOL_LF)
4903 {
4904 coding->produced += extra;
4905 coding->produced_char += extra;
4906 decode_eol_post_ccl (coding, destination, coding->produced);
4907 }
4908 break;
4909
4910 default:
/* Every other type only needs EOL conversion.  */
4911 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4912 }
4913
/* Running out of source on the last block after consuming everything
   is a normal finish.  */
4914 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4915 && coding->mode & CODING_MODE_LAST_BLOCK
4916 && coding->consumed == src_bytes)
4917 coding->result = CODING_FINISH_NORMAL;
4918
/* Last block but some source bytes could not be decoded: count one
   error and emit each remaining byte as a character via CHAR_STRING.
   NOTE(review): no check against DST_BYTES is visible in this loop;
   presumably the caller's buffer estimate covers it -- confirm.  */
4919 if (coding->mode & CODING_MODE_LAST_BLOCK
4920 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4921 {
4922 const unsigned char *src = source + coding->consumed;
4923 unsigned char *dst = destination + coding->produced;
4924
4925 src_bytes -= coding->consumed;
4926 coding->errors++;
/* The macro presumably closes an unfinished composition using the
   locals of this function -- its definition is outside this part of
   the file.  */
4927 if (COMPOSING_P (coding))
4928 DECODE_COMPOSITION_END ('1');
4929 while (src_bytes--)
4930 {
4931 int c = *src++;
4932 dst += CHAR_STRING (c, dst);
4933 coding->produced_char++;
4934 }
4935 coding->consumed = coding->consumed_char = src - source;
4936 coding->produced = dst - destination;
4937 coding->result = CODING_FINISH_NORMAL;
4938 }
4939
/* A unibyte destination was requested: convert the decoded text in
   place; afterwards one byte counts as one character.  */
4940 if (!coding->dst_multibyte)
4941 {
4942 coding->produced = str_as_unibyte (destination, coding->produced);
4943 coding->produced_char = coding->produced;
4944 }
4945
4946 return coding->result;
4947 }
4948
4949 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4950 multibyteness of the source is CODING->src_multibyte, the
4951 multibyteness of the result is always unibyte. */
4952
4953 int
4954 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4955 struct coding_system *coding;
4956 const unsigned char *source;
4957 unsigned char *destination;
4958 int src_bytes, dst_bytes;
4959 {
4960 coding->produced = coding->produced_char = 0;
4961 coding->consumed = coding->consumed_char = 0;
4962 coding->errors = 0;
4963 coding->result = CODING_FINISH_NORMAL;
4964
4965 switch (coding->type)
4966 {
4967 case coding_type_sjis:
4968 encode_coding_sjis_big5 (coding, source, destination,
4969 src_bytes, dst_bytes, 1);
4970 break;
4971
4972 case coding_type_iso2022:
4973 encode_coding_iso2022 (coding, source, destination,
4974 src_bytes, dst_bytes);
4975 break;
4976
4977 case coding_type_big5:
4978 encode_coding_sjis_big5 (coding, source, destination,
4979 src_bytes, dst_bytes, 0);
4980 break;
4981
4982 case coding_type_emacs_mule:
4983 encode_coding_emacs_mule (coding, source, destination,
4984 src_bytes, dst_bytes);
4985 break;
4986
4987 case coding_type_ccl:
4988 ccl_coding_driver (coding, source, destination,
4989 src_bytes, dst_bytes, 1);
4990 break;
4991
4992 default:
4993 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4994 }
4995
4996 if (coding->mode & CODING_MODE_LAST_BLOCK
4997 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4998 {
4999 const unsigned char *src = source + coding->consumed;
5000 unsigned char *dst = destination + coding->produced;
5001
5002 if (coding->type == coding_type_iso2022)
5003 ENCODE_RESET_PLANE_AND_REGISTER;
5004 if (COMPOSING_P (coding))
5005 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5006 if (coding->consumed < src_bytes)
5007 {
5008 int len = src_bytes - coding->consumed;
5009
5010 BCOPY_SHORT (src, dst, len);
5011 if (coding->src_multibyte)
5012 len = str_as_unibyte (dst, len);
5013 dst += len;
5014 coding->consumed = src_bytes;
5015 }
5016 coding->produced = coding->produced_char = dst - destination;
5017 coding->result = CODING_FINISH_NORMAL;
5018 }
5019
5020 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5021 && coding->consumed == src_bytes)
5022 coding->result = CODING_FINISH_NORMAL;
5023
5024 return coding->result;
5025 }
5026
5027 /* Scan text in the region between *BEG and *END (byte positions),
5028 skip characters which we don't have to decode by coding system
5029 CODING at the head and tail, then set *BEG and *END to the region
5030 of the text we actually have to convert. The caller should move
5031 the gap out of the region in advance if the region is from a
5032 buffer.
5033
5034 If STR is not NULL, *BEG and *END are indices into STR. */
5035
5036 static void
5037 shrink_decoding_region (beg, end, coding, str)
5038 int *beg, *end;
5039 struct coding_system *coding;
5040 unsigned char *str;
5041 {
5042 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5043 int eol_conversion;
5044 Lisp_Object translation_table;
5045
5046 if (coding->type == coding_type_ccl
5047 || coding->type == coding_type_undecided
5048 || coding->eol_type != CODING_EOL_LF
5049 || !NILP (coding->post_read_conversion)
5050 || coding->composing != COMPOSITION_DISABLED)
5051 {
5052 /* We can't skip any data. */
5053 return;
5054 }
5055 if (coding->type == coding_type_no_conversion
5056 || coding->type == coding_type_raw_text
5057 || coding->type == coding_type_emacs_mule)
5058 {
5059 /* We need no conversion, but don't have to skip any data here.
5060 Decoding routine handles them effectively anyway. */
5061 return;
5062 }
5063
5064 translation_table = coding->translation_table_for_decode;
5065 if (NILP (translation_table) && !NILP (Venable_character_translation))
5066 translation_table = Vstandard_translation_table_for_decode;
5067 if (CHAR_TABLE_P (translation_table))
5068 {
5069 int i;
5070 for (i = 0; i < 128; i++)
5071 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5072 break;
5073 if (i < 128)
5074 /* Some ASCII character should be translated. We give up
5075 shrinking. */
5076 return;
5077 }
5078
5079 if (coding->heading_ascii >= 0)
5080 /* Detection routine has already found how much we can skip at the
5081 head. */
5082 *beg += coding->heading_ascii;
5083
5084 if (str)
5085 {
5086 begp_orig = begp = str + *beg;
5087 endp_orig = endp = str + *end;
5088 }
5089 else
5090 {
5091 begp_orig = begp = BYTE_POS_ADDR (*beg);
5092 endp_orig = endp = begp + *end - *beg;
5093 }
5094
5095 eol_conversion = (coding->eol_type == CODING_EOL_CR
5096 || coding->eol_type == CODING_EOL_CRLF);
5097
5098 switch (coding->type)
5099 {
5100 case coding_type_sjis:
5101 case coding_type_big5:
5102 /* We can skip all ASCII characters at the head. */
5103 if (coding->heading_ascii < 0)
5104 {
5105 if (eol_conversion)
5106 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5107 else
5108 while (begp < endp && *begp < 0x80) begp++;
5109 }
5110 /* We can skip all ASCII characters at the tail except for the
5111 second byte of SJIS or BIG5 code. */
5112 if (eol_conversion)
5113 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5114 else
5115 while (begp < endp && endp[-1] < 0x80) endp--;
5116 /* Do not consider LF as ascii if preceded by CR, since that
5117 confuses eol decoding. */
5118 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5119 endp++;
5120 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5121 endp++;
5122 break;
5123
5124 case coding_type_iso2022:
5125 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5126 /* We can't skip any data. */
5127 break;
5128 if (coding->heading_ascii < 0)
5129 {
5130 /* We can skip all ASCII characters at the head except for a
5131 few control codes. */
5132 while (begp < endp && (c = *begp) < 0x80
5133 && c != ISO_CODE_CR && c != ISO_CODE_SO
5134 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5135 && (!eol_conversion || c != ISO_CODE_LF))
5136 begp++;
5137 }
5138 switch (coding->category_idx)
5139 {
5140 case CODING_CATEGORY_IDX_ISO_8_1:
5141 case CODING_CATEGORY_IDX_ISO_8_2:
5142 /* We can skip all ASCII characters at the tail. */
5143 if (eol_conversion)
5144 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5145 else
5146 while (begp < endp && endp[-1] < 0x80) endp--;
5147 /* Do not consider LF as ascii if preceded by CR, since that
5148 confuses eol decoding. */
5149 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5150 endp++;
5151 break;
5152
5153 case CODING_CATEGORY_IDX_ISO_7:
5154 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5155 {
5156 /* We can skip all characters at the tail except for 8-bit
5157 codes and ESC and the following 2-byte at the tail. */
5158 unsigned char *eight_bit = NULL;
5159
5160 if (eol_conversion)
5161 while (begp < endp
5162 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5163 {
5164 if (!eight_bit && c & 0x80) eight_bit = endp;
5165 endp--;
5166 }
5167 else
5168 while (begp < endp
5169 && (c = endp[-1]) != ISO_CODE_ESC)
5170 {
5171 if (!eight_bit && c & 0x80) eight_bit = endp;
5172 endp--;
5173 }
5174 /* Do not consider LF as ascii if preceded by CR, since that
5175 confuses eol decoding. */
5176 if (begp < endp && endp < endp_orig
5177 && endp[-1] == '\r' && endp[0] == '\n')
5178 endp++;
5179 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5180 {
5181 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5182 /* This is an ASCII designation sequence. We can
5183 surely skip the tail. But, if we have
5184 encountered an 8-bit code, skip only the codes
5185 after that. */
5186 endp = eight_bit ? eight_bit : endp + 2;
5187 else
5188 /* Hmmm, we can't skip the tail. */
5189 endp = endp_orig;
5190 }
5191 else if (eight_bit)
5192 endp = eight_bit;
5193 }
5194 }
5195 break;
5196
5197 default:
5198 abort ();
5199 }
5200 *beg += begp - begp_orig;
5201 *end += endp - endp_orig;
5202 return;
5203 }
5204
5205 /* Like shrink_decoding_region but for encoding. */
5206
5207 static void
5208 shrink_encoding_region (beg, end, coding, str)
5209 int *beg, *end;
5210 struct coding_system *coding;
5211 unsigned char *str;
5212 {
5213 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5214 int eol_conversion;
5215 Lisp_Object translation_table;
5216
5217 if (coding->type == coding_type_ccl
5218 || coding->eol_type == CODING_EOL_CRLF
5219 || coding->eol_type == CODING_EOL_CR
5220 || (coding->cmp_data && coding->cmp_data->used > 0))
5221 {
5222 /* We can't skip any data. */
5223 return;
5224 }
5225 if (coding->type == coding_type_no_conversion
5226 || coding->type == coding_type_raw_text
5227 || coding->type == coding_type_emacs_mule
5228 || coding->type == coding_type_undecided)
5229 {
5230 /* We need no conversion, but don't have to skip any data here.
5231 Encoding routine handles them effectively anyway. */
5232 return;
5233 }
5234
5235 translation_table = coding->translation_table_for_encode;
5236 if (NILP (translation_table) && !NILP (Venable_character_translation))
5237 translation_table = Vstandard_translation_table_for_encode;
5238 if (CHAR_TABLE_P (translation_table))
5239 {
5240 int i;
5241 for (i = 0; i < 128; i++)
5242 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5243 break;
5244 if (i < 128)
5245 /* Some ASCII character should be translated. We give up
5246 shrinking. */
5247 return;
5248 }
5249
5250 if (str)
5251 {
5252 begp_orig = begp = str + *beg;
5253 endp_orig = endp = str + *end;
5254 }
5255 else
5256 {
5257 begp_orig = begp = BYTE_POS_ADDR (*beg);
5258 endp_orig = endp = begp + *end - *beg;
5259 }
5260
5261 eol_conversion = (coding->eol_type == CODING_EOL_CR
5262 || coding->eol_type == CODING_EOL_CRLF);
5263
5264 /* Here, we don't have to check coding->pre_write_conversion because
5265 the caller is expected to have handled it already. */
5266 switch (coding->type)
5267 {
5268 case coding_type_iso2022:
5269 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5270 /* We can't skip any data. */
5271 break;
5272 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5273 {
5274 unsigned char *bol = begp;
5275 while (begp < endp && *begp < 0x80)
5276 {
5277 begp++;
5278 if (begp[-1] == '\n')
5279 bol = begp;
5280 }
5281 begp = bol;
5282 goto label_skip_tail;
5283 }
5284 /* fall down ... */
5285
5286 case coding_type_sjis:
5287 case coding_type_big5:
5288 /* We can skip all ASCII characters at the head and tail. */
5289 if (eol_conversion)
5290 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5291 else
5292 while (begp < endp && *begp < 0x80) begp++;
5293 label_skip_tail:
5294 if (eol_conversion)
5295 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5296 else
5297 while (begp < endp && *(endp - 1) < 0x80) endp--;
5298 break;
5299
5300 default:
5301 abort ();
5302 }
5303
5304 *beg += begp - begp_orig;
5305 *end += endp - endp_orig;
5306 return;
5307 }
5308
/* Shrinking a conversion region has an overhead of its own, so a
   region whose byte length does not exceed this value is converted
   as is.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG..*END when it is longer than the threshold
   above, using shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5322
5323 static Lisp_Object
5324 code_convert_region_unwind (arg)
5325 Lisp_Object arg;
5326 {
5327 inhibit_pre_post_conversion = 0;
5328 Vlast_coding_system_used = arg;
5329 return Qnil;
5330 }
5331
5332 /* Store information about all compositions in the range FROM and TO
5333 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5334 buffer or a string, defaults to the current buffer. */
5335
5336 void
5337 coding_save_composition (coding, from, to, obj)
5338 struct coding_system *coding;
5339 int from, to;
5340 Lisp_Object obj;
5341 {
5342 Lisp_Object prop;
5343 int start, end;
5344
5345 if (coding->composing == COMPOSITION_DISABLED)
5346 return;
5347 if (!coding->cmp_data)
5348 coding_allocate_composition_data (coding, from);
5349 if (!find_composition (from, to, &start, &end, &prop, obj)
5350 || end > to)
5351 return;
5352 if (start < from
5353 && (!find_composition (end, to, &start, &end, &prop, obj)
5354 || end > to))
5355 return;
5356 coding->composing = COMPOSITION_NO;
5357 do
5358 {
5359 if (COMPOSITION_VALID_P (start, end, prop))
5360 {
5361 enum composition_method method = COMPOSITION_METHOD (prop);
5362 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5363 >= COMPOSITION_DATA_SIZE)
5364 coding_allocate_composition_data (coding, from);
5365 /* For relative composition, we remember start and end
5366 positions, for the other compositions, we also remember
5367 components. */
5368 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5369 if (method != COMPOSITION_RELATIVE)
5370 {
5371 /* We must store a*/
5372 Lisp_Object val, ch;
5373
5374 val = COMPOSITION_COMPONENTS (prop);
5375 if (CONSP (val))
5376 while (CONSP (val))
5377 {
5378 ch = XCAR (val), val = XCDR (val);
5379 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5380 }
5381 else if (VECTORP (val) || STRINGP (val))
5382 {
5383 int len = (VECTORP (val)
5384 ? XVECTOR (val)->size : SCHARS (val));
5385 int i;
5386 for (i = 0; i < len; i++)
5387 {
5388 ch = (STRINGP (val)
5389 ? Faref (val, make_number (i))
5390 : XVECTOR (val)->contents[i]);
5391 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5392 }
5393 }
5394 else /* INTEGERP (val) */
5395 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5396 }
5397 CODING_ADD_COMPOSITION_END (coding, end - from);
5398 }
5399 start = end;
5400 }
5401 while (start < to
5402 && find_composition (start, to, &start, &end, &prop, obj)
5403 && end <= to);
5404
5405 /* Make coding->cmp_data point to the first memory block. */
5406 while (coding->cmp_data->prev)
5407 coding->cmp_data = coding->cmp_data->prev;
5408 coding->cmp_data_start = 0;
5409 }
5410
5411 /* Reflect the saved information about compositions to OBJ.
5412 CODING->cmp_data points to a memory block for the information. OBJ
5413 is a buffer or a string, defaults to the current buffer. */
5414
5415 void
5416 coding_restore_composition (coding, obj)
5417 struct coding_system *coding;
5418 Lisp_Object obj;
5419 {
5420 struct composition_data *cmp_data = coding->cmp_data;
5421
5422 if (!cmp_data)
5423 return;
5424
5425 while (cmp_data->prev)
5426 cmp_data = cmp_data->prev;
5427
5428 while (cmp_data)
5429 {
5430 int i;
5431
5432 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5433 i += cmp_data->data[i])
5434 {
5435 int *data = cmp_data->data + i;
5436 enum composition_method method = (enum composition_method) data[3];
5437 Lisp_Object components;
5438
5439 if (method == COMPOSITION_RELATIVE)
5440 components = Qnil;
5441 else
5442 {
5443 int len = data[0] - 4, j;
5444 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5445
5446 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5447 && len % 2 == 0)
5448 len --;
5449 for (j = 0; j < len; j++)
5450 args[j] = make_number (data[4 + j]);
5451 components = (method == COMPOSITION_WITH_ALTCHARS
5452 ? Fstring (len, args) : Fvector (len, args));
5453 }
5454 compose_text (data[1], data[2], components, Qnil, obj);
5455 }
5456 cmp_data = cmp_data->next;
5457 }
5458 }
5459
5460 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5461 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5462 coding system CODING, and return the status code of code conversion
5463 (currently, this value has no meaning).
5464
5465 How many characters (and bytes) are converted to how many
5466 characters (and bytes) are recorded in members of the structure
5467 CODING.
5468
5469 If REPLACE is nonzero, we do various things as if the original text
5470 is deleted and a new text is inserted. See the comments in
5471 replace_range (insdel.c) to know what we are doing.
5472
5473 If REPLACE is zero, it is assumed that the source text is unibyte.
5474 Otherwise, it is assumed that the source text is multibyte. */
5475
5476 int
5477 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5478 int from, from_byte, to, to_byte, encodep, replace;
5479 struct coding_system *coding;
5480 {
5481 int len = to - from, len_byte = to_byte - from_byte;
5482 int nchars_del = 0, nbytes_del = 0;
5483 int require, inserted, inserted_byte;
5484 int head_skip, tail_skip, total_skip = 0;
5485 Lisp_Object saved_coding_symbol;
5486 int first = 1;
5487 unsigned char *src, *dst;
5488 Lisp_Object deletion;
5489 int orig_point = PT, orig_len = len;
5490 int prev_Z;
5491 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5492
5493 deletion = Qnil;
5494 saved_coding_symbol = coding->symbol;
5495
5496 if (from < PT && PT < to)
5497 {
5498 TEMP_SET_PT_BOTH (from, from_byte);
5499 orig_point = from;
5500 }
5501
5502 if (replace)
5503 {
5504 int saved_from = from;
5505 int saved_inhibit_modification_hooks;
5506
5507 prepare_to_modify_buffer (from, to, &from);
5508 if (saved_from != from)
5509 {
5510 to = from + len;
5511 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5512 len_byte = to_byte - from_byte;
5513 }
5514
5515 /* The code conversion routine can not preserve text properties
5516 for now. So, we must remove all text properties in the
5517 region. Here, we must suppress all modification hooks. */
5518 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5519 inhibit_modification_hooks = 1;
5520 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5521 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5522 }
5523
5524 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5525 {
5526 /* We must detect encoding of text and eol format. */
5527
5528 if (from < GPT && to > GPT)
5529 move_gap_both (from, from_byte);
5530 if (coding->type == coding_type_undecided)
5531 {
5532 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5533 if (coding->type == coding_type_undecided)
5534 {
5535 /* It seems that the text contains only ASCII, but we
5536 should not leave it undecided because the deeper
5537 decoding routine (decode_coding) tries to detect the
5538 encodings again in vain. */
5539 coding->type = coding_type_emacs_mule;
5540 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5541 /* As emacs-mule decoder will handle composition, we
5542 need this setting to allocate coding->cmp_data
5543 later. */
5544 coding->composing = COMPOSITION_NO;
5545 }
5546 }
5547 if (coding->eol_type == CODING_EOL_UNDECIDED
5548 && coding->type != coding_type_ccl)
5549 {
5550 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5551 if (coding->eol_type == CODING_EOL_UNDECIDED)
5552 coding->eol_type = CODING_EOL_LF;
5553 /* We had better recover the original eol format if we
5554 encounter an inconsistent eol format while decoding. */
5555 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5556 }
5557 }
5558
5559 /* Now we convert the text. */
5560
5561 /* For encoding, we must process pre-write-conversion in advance. */
5562 if (! inhibit_pre_post_conversion
5563 && encodep
5564 && SYMBOLP (coding->pre_write_conversion)
5565 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5566 {
5567 /* The function in pre-write-conversion may put a new text in a
5568 new buffer. */
5569 struct buffer *prev = current_buffer;
5570 Lisp_Object new;
5571
5572 record_unwind_protect (code_convert_region_unwind,
5573 Vlast_coding_system_used);
5574 /* We should not call any more pre-write/post-read-conversion
5575 functions while this pre-write-conversion is running. */
5576 inhibit_pre_post_conversion = 1;
5577 call2 (coding->pre_write_conversion,
5578 make_number (from), make_number (to));
5579 inhibit_pre_post_conversion = 0;
5580 /* Discard the unwind protect. */
5581 specpdl_ptr--;
5582
5583 if (current_buffer != prev)
5584 {
5585 len = ZV - BEGV;
5586 new = Fcurrent_buffer ();
5587 set_buffer_internal_1 (prev);
5588 del_range_2 (from, from_byte, to, to_byte, 0);
5589 TEMP_SET_PT_BOTH (from, from_byte);
5590 insert_from_buffer (XBUFFER (new), 1, len, 0);
5591 Fkill_buffer (new);
5592 if (orig_point >= to)
5593 orig_point += len - orig_len;
5594 else if (orig_point > from)
5595 orig_point = from;
5596 orig_len = len;
5597 to = from + len;
5598 from_byte = CHAR_TO_BYTE (from);
5599 to_byte = CHAR_TO_BYTE (to);
5600 len_byte = to_byte - from_byte;
5601 TEMP_SET_PT_BOTH (from, from_byte);
5602 }
5603 }
5604
5605 if (replace)
5606 {
5607 if (! EQ (current_buffer->undo_list, Qt))
5608 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5609 else
5610 {
5611 nchars_del = to - from;
5612 nbytes_del = to_byte - from_byte;
5613 }
5614 }
5615
5616 if (coding->composing != COMPOSITION_DISABLED)
5617 {
5618 if (encodep)
5619 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5620 else
5621 coding_allocate_composition_data (coding, from);
5622 }
5623
5624 /* Try to skip the heading and tailing ASCIIs. */
5625 if (coding->type != coding_type_ccl)
5626 {
5627 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5628
5629 if (from < GPT && GPT < to)
5630 move_gap_both (from, from_byte);
5631 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5632 if (from_byte == to_byte
5633 && (encodep || NILP (coding->post_read_conversion))
5634 && ! CODING_REQUIRE_FLUSHING (coding))
5635 {
5636 coding->produced = len_byte;
5637 coding->produced_char = len;
5638 if (!replace)
5639 /* We must record and adjust for this new text now. */
5640 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5641 return 0;
5642 }
5643
5644 head_skip = from_byte - from_byte_orig;
5645 tail_skip = to_byte_orig - to_byte;
5646 total_skip = head_skip + tail_skip;
5647 from += head_skip;
5648 to -= tail_skip;
5649 len -= total_skip; len_byte -= total_skip;
5650 }
5651
5652 /* For conversion, we must put the gap before the text in addition to
5653 making the gap larger for efficient decoding. The required gap
5654 size starts from 2000 which is the magic number used in make_gap.
5655 But, after one batch of conversion, it will be incremented if we
5656 find that it is not enough . */
5657 require = 2000;
5658
5659 if (GAP_SIZE < require)
5660 make_gap (require - GAP_SIZE);
5661 move_gap_both (from, from_byte);
5662
5663 inserted = inserted_byte = 0;
5664
5665 GAP_SIZE += len_byte;
5666 ZV -= len;
5667 Z -= len;
5668 ZV_BYTE -= len_byte;
5669 Z_BYTE -= len_byte;
5670
5671 if (GPT - BEG < BEG_UNCHANGED)
5672 BEG_UNCHANGED = GPT - BEG;
5673 if (Z - GPT < END_UNCHANGED)
5674 END_UNCHANGED = Z - GPT;
5675
5676 if (!encodep && coding->src_multibyte)
5677 {
5678 /* Decoding routines expects that the source text is unibyte.
5679 We must convert 8-bit characters of multibyte form to
5680 unibyte. */
5681 int len_byte_orig = len_byte;
5682 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5683 if (len_byte < len_byte_orig)
5684 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5685 len_byte);
5686 coding->src_multibyte = 0;
5687 }
5688
5689 for (;;)
5690 {
5691 int result;
5692
5693 /* The buffer memory is now:
5694 +--------+converted-text+---------+-------original-text-------+---+
5695 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5696 |<---------------------- GAP ----------------------->| */
5697 src = GAP_END_ADDR - len_byte;
5698 dst = GPT_ADDR + inserted_byte;
5699
5700 if (encodep)
5701 result = encode_coding (coding, src, dst, len_byte, 0);
5702 else
5703 {
5704 if (coding->composing != COMPOSITION_DISABLED)
5705 coding->cmp_data->char_offset = from + inserted;
5706 result = decode_coding (coding, src, dst, len_byte, 0);
5707 }
5708
5709 /* The buffer memory is now:
5710 +--------+-------converted-text----+--+------original-text----+---+
5711 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5712 |<---------------------- GAP ----------------------->| */
5713
5714 inserted += coding->produced_char;
5715 inserted_byte += coding->produced;
5716 len_byte -= coding->consumed;
5717
5718 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5719 {
5720 coding_allocate_composition_data (coding, from + inserted);
5721 continue;
5722 }
5723
5724 src += coding->consumed;
5725 dst += coding->produced;
5726
5727 if (result == CODING_FINISH_NORMAL)
5728 {
5729 src += len_byte;
5730 break;
5731 }
5732 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5733 {
5734 unsigned char *pend = dst, *p = pend - inserted_byte;
5735 Lisp_Object eol_type;
5736
5737 /* Encode LFs back to the original eol format (CR or CRLF). */
5738 if (coding->eol_type == CODING_EOL_CR)
5739 {
5740 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5741 }
5742 else
5743 {
5744 int count = 0;
5745
5746 while (p < pend) if (*p++ == '\n') count++;
5747 if (src - dst < count)
5748 {
5749 /* We don't have sufficient room for encoding LFs
5750 back to CRLF. We must record converted and
5751 not-yet-converted text back to the buffer
5752 content, enlarge the gap, then record them out of
5753 the buffer contents again. */
5754 int add = len_byte + inserted_byte;
5755
5756 GAP_SIZE -= add;
5757 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5758 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5759 make_gap (count - GAP_SIZE);
5760 GAP_SIZE += add;
5761 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5762 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5763 /* Don't forget to update SRC, DST, and PEND. */
5764 src = GAP_END_ADDR - len_byte;
5765 dst = GPT_ADDR + inserted_byte;
5766 pend = dst;
5767 }
5768 inserted += count;
5769 inserted_byte += count;
5770 coding->produced += count;
5771 p = dst = pend + count;
5772 while (count)
5773 {
5774 *--p = *--pend;
5775 if (*p == '\n') count--, *--p = '\r';
5776 }
5777 }
5778
5779 /* Suppress eol-format conversion in the further conversion. */
5780 coding->eol_type = CODING_EOL_LF;
5781
5782 /* Set the coding system symbol to that for Unix-like EOL. */
5783 eol_type = Fget (saved_coding_symbol, Qeol_type);
5784 if (VECTORP (eol_type)
5785 && XVECTOR (eol_type)->size == 3
5786 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5787 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5788 else
5789 coding->symbol = saved_coding_symbol;
5790
5791 continue;
5792 }
5793 if (len_byte <= 0)
5794 {
5795 if (coding->type != coding_type_ccl
5796 || coding->mode & CODING_MODE_LAST_BLOCK)
5797 break;
5798 coding->mode |= CODING_MODE_LAST_BLOCK;
5799 continue;
5800 }
5801 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5802 {
5803 /* The source text ends in invalid codes. Let's just
5804 make them valid buffer contents, and finish conversion. */
5805 if (multibyte_p)
5806 {
5807 unsigned char *start = dst;
5808
5809 inserted += len_byte;
5810 while (len_byte--)
5811 {
5812 int c = *src++;
5813 dst += CHAR_STRING (c, dst);
5814 }
5815
5816 inserted_byte += dst - start;
5817 }
5818 else
5819 {
5820 inserted += len_byte;
5821 inserted_byte += len_byte;
5822 while (len_byte--)
5823 *dst++ = *src++;
5824 }
5825 break;
5826 }
5827 if (result == CODING_FINISH_INTERRUPT)
5828 {
5829 /* The conversion procedure was interrupted by a user. */
5830 break;
5831 }
5832 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5833 if (coding->consumed < 1)
5834 {
5835 /* It's quite strange to require more memory without
5836 consuming any bytes. Perhaps CCL program bug. */
5837 break;
5838 }
5839 if (first)
5840 {
5841 /* We have just done the first batch of conversion which was
5842 stopped because of insufficient gap. Let's reconsider the
5843 required gap size (i.e. SRT - DST) now.
5844
5845 We have converted ORIG bytes (== coding->consumed) into
5846 NEW bytes (coding->produced). To convert the remaining
5847 LEN bytes, we may need REQUIRE bytes of gap, where:
5848 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5849 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5850 Here, we are sure that NEW >= ORIG. */
5851 float ratio;
5852
5853 if (coding->produced <= coding->consumed)
5854 {
5855 /* This happens because of CCL-based coding system with
5856 eol-type CRLF. */
5857 require = 0;
5858 }
5859 else
5860 {
5861 ratio = (coding->produced - coding->consumed) / coding->consumed;
5862 require = len_byte * ratio;
5863 }
5864 first = 0;
5865 }
5866 if ((src - dst) < (require + 2000))
5867 {
5868 /* See the comment above the previous call of make_gap. */
5869 int add = len_byte + inserted_byte;
5870
5871 GAP_SIZE -= add;
5872 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5873 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5874 make_gap (require + 2000);
5875 GAP_SIZE += add;
5876 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5877 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5878 }
5879 }
5880 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5881
5882 if (encodep && coding->dst_multibyte)
5883 {
5884 /* The output is unibyte. We must convert 8-bit characters to
5885 multibyte form. */
5886 if (inserted_byte * 2 > GAP_SIZE)
5887 {
5888 GAP_SIZE -= inserted_byte;
5889 ZV += inserted_byte; Z += inserted_byte;
5890 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5891 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5892 make_gap (inserted_byte - GAP_SIZE);
5893 GAP_SIZE += inserted_byte;
5894 ZV -= inserted_byte; Z -= inserted_byte;
5895 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5896 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897 }
5898 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5899 }
5900
5901 /* If we shrank the conversion area, adjust it now. */
5902 if (total_skip > 0)
5903 {
5904 if (tail_skip > 0)
5905 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5906 inserted += total_skip; inserted_byte += total_skip;
5907 GAP_SIZE += total_skip;
5908 GPT -= head_skip; GPT_BYTE -= head_skip;
5909 ZV -= total_skip; ZV_BYTE -= total_skip;
5910 Z -= total_skip; Z_BYTE -= total_skip;
5911 from -= head_skip; from_byte -= head_skip;
5912 to += tail_skip; to_byte += tail_skip;
5913 }
5914
5915 prev_Z = Z;
5916 if (! EQ (current_buffer->undo_list, Qt))
5917 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5918 else
5919 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5920 inserted, inserted_byte);
5921 inserted = Z - prev_Z;
5922
5923 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5924 coding_restore_composition (coding, Fcurrent_buffer ());
5925 coding_free_composition_data (coding);
5926
5927 if (! inhibit_pre_post_conversion
5928 && ! encodep && ! NILP (coding->post_read_conversion))
5929 {
5930 Lisp_Object val;
5931 Lisp_Object saved_coding_system;
5932
5933 if (from != PT)
5934 TEMP_SET_PT_BOTH (from, from_byte);
5935 prev_Z = Z;
5936 record_unwind_protect (code_convert_region_unwind,
5937 Vlast_coding_system_used);
5938 saved_coding_system = Vlast_coding_system_used;
5939 Vlast_coding_system_used = coding->symbol;
5940 /* We should not call any more pre-write/post-read-conversion
5941 functions while this post-read-conversion is running. */
5942 inhibit_pre_post_conversion = 1;
5943 val = call1 (coding->post_read_conversion, make_number (inserted));
5944 inhibit_pre_post_conversion = 0;
5945 coding->symbol = Vlast_coding_system_used;
5946 Vlast_coding_system_used = saved_coding_system;
5947 /* Discard the unwind protect. */
5948 specpdl_ptr--;
5949 CHECK_NUMBER (val);
5950 inserted += Z - prev_Z;
5951 }
5952
5953 if (orig_point >= from)
5954 {
5955 if (orig_point >= from + orig_len)
5956 orig_point += inserted - orig_len;
5957 else
5958 orig_point = from;
5959 TEMP_SET_PT (orig_point);
5960 }
5961
5962 if (replace)
5963 {
5964 signal_after_change (from, to - from, inserted);
5965 update_compositions (from, from + inserted, CHECK_BORDER);
5966 }
5967
5968 {
5969 coding->consumed = to_byte - from_byte;
5970 coding->consumed_char = to - from;
5971 coding->produced = inserted_byte;
5972 coding->produced_char = inserted;
5973 }
5974
5975 return 0;
5976 }
5977
5978 Lisp_Object
5979 run_pre_post_conversion_on_str (str, coding, encodep)
5980 Lisp_Object str;
5981 struct coding_system *coding;
5982 int encodep;
5983 {
5984 int count = SPECPDL_INDEX ();
5985 struct gcpro gcpro1, gcpro2;
5986 int multibyte = STRING_MULTIBYTE (str);
5987 Lisp_Object buffer;
5988 struct buffer *buf;
5989 Lisp_Object old_deactivate_mark;
5990
5991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5992 record_unwind_protect (code_convert_region_unwind,
5993 Vlast_coding_system_used);
5994 /* It is not crucial to specbind this. */
5995 old_deactivate_mark = Vdeactivate_mark;
5996 GCPRO2 (str, old_deactivate_mark);
5997
5998 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5999 buf = XBUFFER (buffer);
6000
6001 delete_all_overlays (buf);
6002 buf->directory = current_buffer->directory;
6003 buf->read_only = Qnil;
6004 buf->filename = Qnil;
6005 buf->undo_list = Qt;
6006 eassert (buf->overlays_before == NULL);
6007 eassert (buf->overlays_after == NULL);
6008
6009 set_buffer_internal (buf);
6010 /* We must insert the contents of STR as is without
6011 unibyte<->multibyte conversion. For that, we adjust the
6012 multibyteness of the working buffer to that of STR. */
6013 Ferase_buffer ();
6014 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6015
6016 insert_from_string (str, 0, 0,
6017 SCHARS (str), SBYTES (str), 0);
6018 UNGCPRO;
6019 inhibit_pre_post_conversion = 1;
6020 if (encodep)
6021 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6022 else
6023 {
6024 Vlast_coding_system_used = coding->symbol;
6025 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6026 call1 (coding->post_read_conversion, make_number (Z - BEG));
6027 coding->symbol = Vlast_coding_system_used;
6028 }
6029 inhibit_pre_post_conversion = 0;
6030 Vdeactivate_mark = old_deactivate_mark;
6031 str = make_buffer_string (BEG, Z, 1);
6032 return unbind_to (count, str);
6033 }
6034
6035 Lisp_Object
6036 decode_coding_string (str, coding, nocopy)
6037 Lisp_Object str;
6038 struct coding_system *coding;
6039 int nocopy;
6040 {
6041 int len;
6042 struct conversion_buffer buf;
6043 int from, to_byte;
6044 Lisp_Object saved_coding_symbol;
6045 int result;
6046 int require_decoding;
6047 int shrinked_bytes = 0;
6048 Lisp_Object newstr;
6049 int consumed, consumed_char, produced, produced_char;
6050
6051 from = 0;
6052 to_byte = SBYTES (str);
6053
6054 saved_coding_symbol = coding->symbol;
6055 coding->src_multibyte = STRING_MULTIBYTE (str);
6056 coding->dst_multibyte = 1;
6057 if (CODING_REQUIRE_DETECTION (coding))
6058 {
6059 /* See the comments in code_convert_region. */
6060 if (coding->type == coding_type_undecided)
6061 {
6062 detect_coding (coding, SDATA (str), to_byte);
6063 if (coding->type == coding_type_undecided)
6064 {
6065 coding->type = coding_type_emacs_mule;
6066 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6067 /* As emacs-mule decoder will handle composition, we
6068 need this setting to allocate coding->cmp_data
6069 later. */
6070 coding->composing = COMPOSITION_NO;
6071 }
6072 }
6073 if (coding->eol_type == CODING_EOL_UNDECIDED
6074 && coding->type != coding_type_ccl)
6075 {
6076 saved_coding_symbol = coding->symbol;
6077 detect_eol (coding, SDATA (str), to_byte);
6078 if (coding->eol_type == CODING_EOL_UNDECIDED)
6079 coding->eol_type = CODING_EOL_LF;
6080 /* We had better recover the original eol format if we
6081 encounter an inconsistent eol format while decoding. */
6082 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6083 }
6084 }
6085
6086 if (coding->type == coding_type_no_conversion
6087 || coding->type == coding_type_raw_text)
6088 coding->dst_multibyte = 0;
6089
6090 require_decoding = CODING_REQUIRE_DECODING (coding);
6091
6092 if (STRING_MULTIBYTE (str))
6093 {
6094 /* Decoding routines expect the source text to be unibyte. */
6095 str = Fstring_as_unibyte (str);
6096 to_byte = SBYTES (str);
6097 nocopy = 1;
6098 coding->src_multibyte = 0;
6099 }
6100
6101 /* Try to skip the heading and tailing ASCIIs. */
6102 if (require_decoding && coding->type != coding_type_ccl)
6103 {
6104 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6105 0);
6106 if (from == to_byte)
6107 require_decoding = 0;
6108 shrinked_bytes = from + (SBYTES (str) - to_byte);
6109 }
6110
6111 if (!require_decoding
6112 && !(SYMBOLP (coding->post_read_conversion)
6113 && !NILP (Ffboundp (coding->post_read_conversion))))
6114 {
6115 coding->consumed = SBYTES (str);
6116 coding->consumed_char = SCHARS (str);
6117 if (coding->dst_multibyte)
6118 {
6119 str = Fstring_as_multibyte (str);
6120 nocopy = 1;
6121 }
6122 coding->produced = SBYTES (str);
6123 coding->produced_char = SCHARS (str);
6124 return (nocopy ? str : Fcopy_sequence (str));
6125 }
6126
6127 if (coding->composing != COMPOSITION_DISABLED)
6128 coding_allocate_composition_data (coding, from);
6129 len = decoding_buffer_size (coding, to_byte - from);
6130 allocate_conversion_buffer (buf, len);
6131
6132 consumed = consumed_char = produced = produced_char = 0;
6133 while (1)
6134 {
6135 result = decode_coding (coding, SDATA (str) + from + consumed,
6136 buf.data + produced, to_byte - from - consumed,
6137 buf.size - produced);
6138 consumed += coding->consumed;
6139 consumed_char += coding->consumed_char;
6140 produced += coding->produced;
6141 produced_char += coding->produced_char;
6142 if (result == CODING_FINISH_NORMAL
6143 || (result == CODING_FINISH_INSUFFICIENT_SRC
6144 && coding->consumed == 0))
6145 break;
6146 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6147 coding_allocate_composition_data (coding, from + produced_char);
6148 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6149 extend_conversion_buffer (&buf);
6150 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6151 {
6152 Lisp_Object eol_type;
6153
6154 /* Recover the original EOL format. */
6155 if (coding->eol_type == CODING_EOL_CR)
6156 {
6157 unsigned char *p;
6158 for (p = buf.data; p < buf.data + produced; p++)
6159 if (*p == '\n') *p = '\r';
6160 }
6161 else if (coding->eol_type == CODING_EOL_CRLF)
6162 {
6163 int num_eol = 0;
6164 unsigned char *p0, *p1;
6165 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6166 if (*p0 == '\n') num_eol++;
6167 if (produced + num_eol >= buf.size)
6168 extend_conversion_buffer (&buf);
6169 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6170 {
6171 *--p1 = *--p0;
6172 if (*p0 == '\n') *--p1 = '\r';
6173 }
6174 produced += num_eol;
6175 produced_char += num_eol;
6176 }
6177 /* Suppress eol-format conversion in the further conversion. */
6178 coding->eol_type = CODING_EOL_LF;
6179
6180 /* Set the coding system symbol to that for Unix-like EOL. */
6181 eol_type = Fget (saved_coding_symbol, Qeol_type);
6182 if (VECTORP (eol_type)
6183 && XVECTOR (eol_type)->size == 3
6184 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6185 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6186 else
6187 coding->symbol = saved_coding_symbol;
6188
6189
6190 }
6191 }
6192
6193 coding->consumed = consumed;
6194 coding->consumed_char = consumed_char;
6195 coding->produced = produced;
6196 coding->produced_char = produced_char;
6197
6198 if (coding->dst_multibyte)
6199 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6200 produced + shrinked_bytes);
6201 else
6202 newstr = make_uninit_string (produced + shrinked_bytes);
6203 if (from > 0)
6204 STRING_COPYIN (newstr, 0, SDATA (str), from);
6205 STRING_COPYIN (newstr, from, buf.data, produced);
6206 if (shrinked_bytes > from)
6207 STRING_COPYIN (newstr, from + produced,
6208 SDATA (str) + to_byte,
6209 shrinked_bytes - from);
6210 free_conversion_buffer (&buf);
6211
6212 if (coding->cmp_data && coding->cmp_data->used)
6213 coding_restore_composition (coding, newstr);
6214 coding_free_composition_data (coding);
6215
6216 if (SYMBOLP (coding->post_read_conversion)
6217 && !NILP (Ffboundp (coding->post_read_conversion)))
6218 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6219
6220 return newstr;
6221 }
6222
6223 Lisp_Object
6224 encode_coding_string (str, coding, nocopy)
6225 Lisp_Object str;
6226 struct coding_system *coding;
6227 int nocopy;
6228 {
6229 int len;
6230 struct conversion_buffer buf;
6231 int from, to, to_byte;
6232 int result;
6233 int shrinked_bytes = 0;
6234 Lisp_Object newstr;
6235 int consumed, consumed_char, produced, produced_char;
6236
6237 if (SYMBOLP (coding->pre_write_conversion)
6238 && !NILP (Ffboundp (coding->pre_write_conversion)))
6239 str = run_pre_post_conversion_on_str (str, coding, 1);
6240
6241 from = 0;
6242 to = SCHARS (str);
6243 to_byte = SBYTES (str);
6244
6245 /* Encoding routines determine the multibyteness of the source text
6246 by coding->src_multibyte. */
6247 coding->src_multibyte = STRING_MULTIBYTE (str);
6248 coding->dst_multibyte = 0;
6249 if (! CODING_REQUIRE_ENCODING (coding))
6250 {
6251 coding->consumed = SBYTES (str);
6252 coding->consumed_char = SCHARS (str);
6253 if (STRING_MULTIBYTE (str))
6254 {
6255 str = Fstring_as_unibyte (str);
6256 nocopy = 1;
6257 }
6258 coding->produced = SBYTES (str);
6259 coding->produced_char = SCHARS (str);
6260 return (nocopy ? str : Fcopy_sequence (str));
6261 }
6262
6263 if (coding->composing != COMPOSITION_DISABLED)
6264 coding_save_composition (coding, from, to, str);
6265
6266 /* Try to skip the heading and tailing ASCIIs. */
6267 if (coding->type != coding_type_ccl)
6268 {
6269 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6270 1);
6271 if (from == to_byte)
6272 return (nocopy ? str : Fcopy_sequence (str));
6273 shrinked_bytes = from + (SBYTES (str) - to_byte);
6274 }
6275
6276 len = encoding_buffer_size (coding, to_byte - from);
6277 allocate_conversion_buffer (buf, len);
6278
6279 consumed = consumed_char = produced = produced_char = 0;
6280 while (1)
6281 {
6282 result = encode_coding (coding, SDATA (str) + from + consumed,
6283 buf.data + produced, to_byte - from - consumed,
6284 buf.size - produced);
6285 consumed += coding->consumed;
6286 consumed_char += coding->consumed_char;
6287 produced += coding->produced;
6288 produced_char += coding->produced_char;
6289 if (result == CODING_FINISH_NORMAL
6290 || (result == CODING_FINISH_INSUFFICIENT_SRC
6291 && coding->consumed == 0))
6292 break;
6293 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6294 extend_conversion_buffer (&buf);
6295 }
6296
6297 coding->consumed = consumed;
6298 coding->consumed_char = consumed_char;
6299 coding->produced = produced;
6300 coding->produced_char = produced_char;
6301
6302 newstr = make_uninit_string (produced + shrinked_bytes);
6303 if (from > 0)
6304 STRING_COPYIN (newstr, 0, SDATA (str), from);
6305 STRING_COPYIN (newstr, from, buf.data, produced);
6306 if (shrinked_bytes > from)
6307 STRING_COPYIN (newstr, from + produced,
6308 SDATA (str) + to_byte,
6309 shrinked_bytes - from);
6310
6311 free_conversion_buffer (&buf);
6312 coding_free_composition_data (coding);
6313
6314 return newstr;
6315 }
6316
6317 \f
6318 #ifdef emacs
6319 /*** 8. Emacs Lisp library functions ***/
6320
6321 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6322 doc: /* Return t if OBJECT is nil or a coding-system.
6323 See the documentation of `make-coding-system' for information
6324 about coding-system objects. */)
6325 (obj)
6326 Lisp_Object obj;
6327 {
6328 if (NILP (obj))
6329 return Qt;
6330 if (!SYMBOLP (obj))
6331 return Qnil;
6332 /* Get coding-spec vector for OBJ. */
6333 obj = Fget (obj, Qcoding_system);
6334 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6335 ? Qt : Qnil);
6336 }
6337
6338 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6339 Sread_non_nil_coding_system, 1, 1, 0,
6340 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6341 (prompt)
6342 Lisp_Object prompt;
6343 {
6344 Lisp_Object val;
6345 do
6346 {
6347 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6348 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6349 }
6350 while (SCHARS (val) == 0);
6351 return (Fintern (val, Qnil));
6352 }
6353
6354 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6355 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6356 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6357 (prompt, default_coding_system)
6358 Lisp_Object prompt, default_coding_system;
6359 {
6360 Lisp_Object val;
6361 if (SYMBOLP (default_coding_system))
6362 default_coding_system = SYMBOL_NAME (default_coding_system);
6363 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6364 Qt, Qnil, Qcoding_system_history,
6365 default_coding_system, Qnil);
6366 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6367 }
6368
6369 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6370 1, 1, 0,
6371 doc: /* Check validity of CODING-SYSTEM.
6372 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6373 It is valid if it is a symbol with a non-nil `coding-system' property.
6374 The value of property should be a vector of length 5. */)
6375 (coding_system)
6376 Lisp_Object coding_system;
6377 {
6378 CHECK_SYMBOL (coding_system);
6379 if (!NILP (Fcoding_system_p (coding_system)))
6380 return coding_system;
6381 while (1)
6382 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6383 }
6384 \f
6385 Lisp_Object
6386 detect_coding_system (src, src_bytes, highest, multibytep)
6387 const unsigned char *src;
6388 int src_bytes, highest;
6389 int multibytep;
6390 {
6391 int coding_mask, eol_type;
6392 Lisp_Object val, tmp;
6393 int dummy;
6394
6395 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6396 eol_type = detect_eol_type (src, src_bytes, &dummy);
6397 if (eol_type == CODING_EOL_INCONSISTENT)
6398 eol_type = CODING_EOL_UNDECIDED;
6399
6400 if (!coding_mask)
6401 {
6402 val = Qundecided;
6403 if (eol_type != CODING_EOL_UNDECIDED)
6404 {
6405 Lisp_Object val2;
6406 val2 = Fget (Qundecided, Qeol_type);
6407 if (VECTORP (val2))
6408 val = XVECTOR (val2)->contents[eol_type];
6409 }
6410 return (highest ? val : Fcons (val, Qnil));
6411 }
6412
6413 /* At first, gather possible coding systems in VAL. */
6414 val = Qnil;
6415 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6416 {
6417 Lisp_Object category_val, category_index;
6418
6419 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6420 category_val = Fsymbol_value (XCAR (tmp));
6421 if (!NILP (category_val)
6422 && NATNUMP (category_index)
6423 && (coding_mask & (1 << XFASTINT (category_index))))
6424 {
6425 val = Fcons (category_val, val);
6426 if (highest)
6427 break;
6428 }
6429 }
6430 if (!highest)
6431 val = Fnreverse (val);
6432
6433 /* Then, replace the elements with subsidiary coding systems. */
6434 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6435 {
6436 if (eol_type != CODING_EOL_UNDECIDED
6437 && eol_type != CODING_EOL_INCONSISTENT)
6438 {
6439 Lisp_Object eol;
6440 eol = Fget (XCAR (tmp), Qeol_type);
6441 if (VECTORP (eol))
6442 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6443 }
6444 }
6445 return (highest ? XCAR (val) : val);
6446 }
6447
6448 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6449 2, 3, 0,
6450 doc: /* Detect how the byte sequence in the region is encoded.
6451 Return a list of possible coding systems used on decoding a byte
6452 sequence containing the bytes in the region between START and END when
6453 the coding system `undecided' is specified. The list is ordered by
6454 priority decided in the current language environment.
6455
6456 If only ASCII characters are found, it returns a list of single element
6457 `undecided' or its subsidiary coding system according to a detected
6458 end-of-line format.
6459
6460 If optional argument HIGHEST is non-nil, return the coding system of
6461 highest priority. */)
6462 (start, end, highest)
6463 Lisp_Object start, end, highest;
6464 {
6465 int from, to;
6466 int from_byte, to_byte;
6467 int include_anchor_byte = 0;
6468
6469 CHECK_NUMBER_COERCE_MARKER (start);
6470 CHECK_NUMBER_COERCE_MARKER (end);
6471
6472 validate_region (&start, &end);
6473 from = XINT (start), to = XINT (end);
6474 from_byte = CHAR_TO_BYTE (from);
6475 to_byte = CHAR_TO_BYTE (to);
6476
6477 if (from < GPT && to >= GPT)
6478 move_gap_both (to, to_byte);
6479 /* If we an anchor byte `\0' follows the region, we include it in
6480 the detecting source. Then code detectors can handle the tailing
6481 byte sequence more accurately.
6482
6483 Fix me: This is not a perfect solution. It is better that we
6484 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6485 */
6486 if (to == Z || (to == GPT && GAP_SIZE > 0))
6487 include_anchor_byte = 1;
6488 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6489 to_byte - from_byte + include_anchor_byte,
6490 !NILP (highest),
6491 !NILP (current_buffer
6492 ->enable_multibyte_characters));
6493 }
6494
6495 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6496 1, 2, 0,
6497 doc: /* Detect how the byte sequence in STRING is encoded.
6498 Return a list of possible coding systems used on decoding a byte
6499 sequence containing the bytes in STRING when the coding system
6500 `undecided' is specified. The list is ordered by priority decided in
6501 the current language environment.
6502
6503 If only ASCII characters are found, it returns a list of single element
6504 `undecided' or its subsidiary coding system according to a detected
6505 end-of-line format.
6506
6507 If optional argument HIGHEST is non-nil, return the coding system of
6508 highest priority. */)
6509 (string, highest)
6510 Lisp_Object string, highest;
6511 {
6512 CHECK_STRING (string);
6513
6514 return detect_coding_system (SDATA (string),
6515 /* "+ 1" is to include the anchor byte
6516 `\0'. With this, code detectors can
6517 handle the tailing bytes more
6518 accurately. */
6519 SBYTES (string) + 1,
6520 !NILP (highest),
6521 STRING_MULTIBYTE (string));
6522 }
6523
6524 /* Subroutine for Fsafe_coding_systems_region_internal.
6525
6526 Return a list of coding systems that safely encode the multibyte
6527 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6528 possible coding systems. If it is nil, it means that we have not
6529 yet found any coding systems.
6530
6531 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An
6532 element of WORK_TABLE is set to t once the element is looked up.
6533
6534 If a non-ASCII single byte char is found, set
6535 *single_byte_char_found to 1. */
6536
6537 static Lisp_Object
6538 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6539 unsigned char *p, *pend;
6540 Lisp_Object safe_codings, work_table;
6541 int *single_byte_char_found;
6542 {
6543 int c, len;
6544 Lisp_Object val, ch;
6545 Lisp_Object prev, tail;
6546
6547 while (p < pend)
6548 {
6549 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6550 p += len;
6551 if (ASCII_BYTE_P (c))
6552 /* We can ignore ASCII characters here. */
6553 continue;
6554 if (SINGLE_BYTE_CHAR_P (c))
6555 *single_byte_char_found = 1;
6556 if (NILP (safe_codings))
6557 /* Already all coding systems are excluded. But, we can't
6558 terminate the loop here because non-ASCII single-byte char
6559 must be found. */
6560 continue;
6561 /* Check the safe coding systems for C. */
6562 ch = make_number (c);
6563 val = Faref (work_table, ch);
6564 if (EQ (val, Qt))
6565 /* This element was already checked. Ignore it. */
6566 continue;
6567 /* Remember that we checked this element. */
6568 Faset (work_table, ch, Qt);
6569
6570 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6571 {
6572 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6573 int encodable;
6574
6575 elt = XCAR (tail);
6576 if (CONSP (XCDR (elt)))
6577 {
6578 /* This entry has this format now:
6579 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6580 ACCEPT-LATIN-EXTRA ) */
6581 val = XCDR (elt);
6582 encodable = ! NILP (Faref (XCAR (val), ch));
6583 if (! encodable)
6584 {
6585 val = XCDR (val);
6586 translation_table = XCAR (val);
6587 hash_table = XCAR (XCDR (val));
6588 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6589 }
6590 }
6591 else
6592 {
6593 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6594 encodable = ! NILP (Faref (XCDR (elt), ch));
6595 if (! encodable)
6596 {
6597 /* Transform the format to:
6598 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6599 ACCEPT-LATIN-EXTRA ) */
6600 val = Fget (XCAR (elt), Qcoding_system);
6601 translation_table
6602 = Fplist_get (AREF (val, 3),
6603 Qtranslation_table_for_encode);
6604 if (SYMBOLP (translation_table))
6605 translation_table = Fget (translation_table,
6606 Qtranslation_table);
6607 hash_table
6608 = (CHAR_TABLE_P (translation_table)
6609 ? XCHAR_TABLE (translation_table)->extras[1]
6610 : Qnil);
6611 accept_latin_extra
6612 = ((EQ (AREF (val, 0), make_number (2))
6613 && VECTORP (AREF (val, 4)))
6614 ? AREF (AREF (val, 4), 16)
6615 : Qnil);
6616 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6617 translation_table, hash_table,
6618 accept_latin_extra));
6619 }
6620 }
6621
6622 if (! encodable
6623 && ((CHAR_TABLE_P (translation_table)
6624 && ! NILP (Faref (translation_table, ch)))
6625 || (HASH_TABLE_P (hash_table)
6626 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6627 || (SINGLE_BYTE_CHAR_P (c)
6628 && ! NILP (accept_latin_extra)
6629 && VECTORP (Vlatin_extra_code_table)
6630 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6631 encodable = 1;
6632 if (encodable)
6633 prev = tail;
6634 else
6635 {
6636 /* Exclude this coding system from SAFE_CODINGS. */
6637 if (EQ (tail, safe_codings))
6638 safe_codings = XCDR (safe_codings);
6639 else
6640 XSETCDR (prev, XCDR (tail));
6641 }
6642 }
6643 }
6644 return safe_codings;
6645 }
6646
6647 DEFUN ("find-coding-systems-region-internal",
6648 Ffind_coding_systems_region_internal,
6649 Sfind_coding_systems_region_internal, 2, 2, 0,
6650 doc: /* Internal use only. */)
6651 (start, end)
6652 Lisp_Object start, end;
6653 {
6654 Lisp_Object work_table, safe_codings;
6655 int non_ascii_p = 0;
6656 int single_byte_char_found = 0;
6657 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6658
6659 if (STRINGP (start))
6660 {
6661 if (!STRING_MULTIBYTE (start))
6662 return Qt;
6663 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6664 p2 = p2end = p1end;
6665 if (SCHARS (start) != SBYTES (start))
6666 non_ascii_p = 1;
6667 }
6668 else
6669 {
6670 int from, to, stop;
6671
6672 CHECK_NUMBER_COERCE_MARKER (start);
6673 CHECK_NUMBER_COERCE_MARKER (end);
6674 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6675 args_out_of_range (start, end);
6676 if (NILP (current_buffer->enable_multibyte_characters))
6677 return Qt;
6678 from = CHAR_TO_BYTE (XINT (start));
6679 to = CHAR_TO_BYTE (XINT (end));
6680 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6681 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6682 if (stop == to)
6683 p2 = p2end = p1end;
6684 else
6685 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6686 if (XINT (end) - XINT (start) != to - from)
6687 non_ascii_p = 1;
6688 }
6689
6690 if (!non_ascii_p)
6691 {
6692 /* We are sure that the text contains no multibyte character.
6693 Check if it contains eight-bit-graphic. */
6694 p = p1;
6695 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6696 if (p == p1end)
6697 {
6698 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6699 if (p == p2end)
6700 return Qt;
6701 }
6702 }
6703
6704 /* The text contains non-ASCII characters. */
6705
6706 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6707 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6708
6709 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6710 &single_byte_char_found);
6711 if (p2 < p2end)
6712 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6713 &single_byte_char_found);
6714 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6715 safe_codings = Qt;
6716 else
6717 {
6718 /* Turn safe_codings to a list of coding systems... */
6719 Lisp_Object val;
6720
6721 if (single_byte_char_found)
6722 /* ... and append these for eight-bit chars. */
6723 val = Fcons (Qraw_text,
6724 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6725 else
6726 /* ... and append generic coding systems. */
6727 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6728
6729 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6730 val = Fcons (XCAR (XCAR (safe_codings)), val);
6731 safe_codings = val;
6732 }
6733
6734 return safe_codings;
6735 }
6736
6737
6738 /* Search from position POS for such characters that are unencodable
6739 accoding to SAFE_CHARS, and return a list of their positions. P
6740 points where in the memory the character at POS exists. Limit the
6741 search at PEND or when Nth unencodable characters are found.
6742
6743 If SAFE_CHARS is a char table, an element for an unencodable
6744 character is nil.
6745
6746 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6747
6748 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6749 eight-bit-graphic characters are unencodable. */
6750
6751 static Lisp_Object
6752 unencodable_char_position (safe_chars, pos, p, pend, n)
6753 Lisp_Object safe_chars;
6754 int pos;
6755 unsigned char *p, *pend;
6756 int n;
6757 {
6758 Lisp_Object pos_list;
6759
6760 pos_list = Qnil;
6761 while (p < pend)
6762 {
6763 int len;
6764 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6765
6766 if (c >= 128
6767 && (CHAR_TABLE_P (safe_chars)
6768 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6769 : (NILP (safe_chars) || c < 256)))
6770 {
6771 pos_list = Fcons (make_number (pos), pos_list);
6772 if (--n <= 0)
6773 break;
6774 }
6775 pos++;
6776 p += len;
6777 }
6778 return Fnreverse (pos_list);
6779 }
6780
6781
6782 DEFUN ("unencodable-char-position", Funencodable_char_position,
6783 Sunencodable_char_position, 3, 5, 0,
6784 doc: /*
6785 Return position of first un-encodable character in a region.
6786 START and END specfiy the region and CODING-SYSTEM specifies the
6787 encoding to check. Return nil if CODING-SYSTEM does encode the region.
6788
6789 If optional 4th argument COUNT is non-nil, it specifies at most how
6790 many un-encodable characters to search. In this case, the value is a
6791 list of positions.
6792
6793 If optional 5th argument STRING is non-nil, it is a string to search
6794 for un-encodable characters. In that case, START and END are indexes
6795 to the string. */)
6796 (start, end, coding_system, count, string)
6797 Lisp_Object start, end, coding_system, count, string;
6798 {
6799 int n;
6800 Lisp_Object safe_chars;
6801 struct coding_system coding;
6802 Lisp_Object positions;
6803 int from, to;
6804 unsigned char *p, *pend;
6805
6806 if (NILP (string))
6807 {
6808 validate_region (&start, &end);
6809 from = XINT (start);
6810 to = XINT (end);
6811 if (NILP (current_buffer->enable_multibyte_characters))
6812 return Qnil;
6813 p = CHAR_POS_ADDR (from);
6814 if (to == GPT)
6815 pend = GPT_ADDR;
6816 else
6817 pend = CHAR_POS_ADDR (to);
6818 }
6819 else
6820 {
6821 CHECK_STRING (string);
6822 CHECK_NATNUM (start);
6823 CHECK_NATNUM (end);
6824 from = XINT (start);
6825 to = XINT (end);
6826 if (from > to
6827 || to > SCHARS (string))
6828 args_out_of_range_3 (string, start, end);
6829 if (! STRING_MULTIBYTE (string))
6830 return Qnil;
6831 p = SDATA (string) + string_char_to_byte (string, from);
6832 pend = SDATA (string) + string_char_to_byte (string, to);
6833 }
6834
6835 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6836
6837 if (NILP (count))
6838 n = 1;
6839 else
6840 {
6841 CHECK_NATNUM (count);
6842 n = XINT (count);
6843 }
6844
6845 if (coding.type == coding_type_no_conversion
6846 || coding.type == coding_type_raw_text)
6847 return Qnil;
6848
6849 if (coding.type == coding_type_undecided)
6850 safe_chars = Qnil;
6851 else
6852 safe_chars = coding_safe_chars (coding_system);
6853
6854 if (STRINGP (string)
6855 || from >= GPT || to <= GPT)
6856 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6857 else
6858 {
6859 Lisp_Object args[2];
6860
6861 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6862 n -= XINT (Flength (args[0]));
6863 if (n <= 0)
6864 positions = args[0];
6865 else
6866 {
6867 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6868 pend, n);
6869 positions = Fappend (2, args);
6870 }
6871 }
6872
6873 return (NILP (count) ? Fcar (positions) : positions);
6874 }
6875
6876
6877 Lisp_Object
6878 code_convert_region1 (start, end, coding_system, encodep)
6879 Lisp_Object start, end, coding_system;
6880 int encodep;
6881 {
6882 struct coding_system coding;
6883 int from, to;
6884
6885 CHECK_NUMBER_COERCE_MARKER (start);
6886 CHECK_NUMBER_COERCE_MARKER (end);
6887 CHECK_SYMBOL (coding_system);
6888
6889 validate_region (&start, &end);
6890 from = XFASTINT (start);
6891 to = XFASTINT (end);
6892
6893 if (NILP (coding_system))
6894 return make_number (to - from);
6895
6896 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6897 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6898
6899 coding.mode |= CODING_MODE_LAST_BLOCK;
6900 coding.src_multibyte = coding.dst_multibyte
6901 = !NILP (current_buffer->enable_multibyte_characters);
6902 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6903 &coding, encodep, 1);
6904 Vlast_coding_system_used = coding.symbol;
6905 return make_number (coding.produced_char);
6906 }
6907
6908 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6909 3, 3, "r\nzCoding system: ",
6910 doc: /* Decode the current region from the specified coding system.
6911 When called from a program, takes three arguments:
6912 START, END, and CODING-SYSTEM. START and END are buffer positions.
6913 This function sets `last-coding-system-used' to the precise coding system
6914 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6915 not fully specified.)
6916 It returns the length of the decoded text. */)
6917 (start, end, coding_system)
6918 Lisp_Object start, end, coding_system;
6919 {
6920 return code_convert_region1 (start, end, coding_system, 0);
6921 }
6922
6923 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6924 3, 3, "r\nzCoding system: ",
6925 doc: /* Encode the current region into the specified coding system.
6926 When called from a program, takes three arguments:
6927 START, END, and CODING-SYSTEM. START and END are buffer positions.
6928 This function sets `last-coding-system-used' to the precise coding system
6929 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6930 not fully specified.)
6931 It returns the length of the encoded text. */)
6932 (start, end, coding_system)
6933 Lisp_Object start, end, coding_system;
6934 {
6935 return code_convert_region1 (start, end, coding_system, 1);
6936 }
6937
6938 Lisp_Object
6939 code_convert_string1 (string, coding_system, nocopy, encodep)
6940 Lisp_Object string, coding_system, nocopy;
6941 int encodep;
6942 {
6943 struct coding_system coding;
6944
6945 CHECK_STRING (string);
6946 CHECK_SYMBOL (coding_system);
6947
6948 if (NILP (coding_system))
6949 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6950
6951 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6952 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6953
6954 coding.mode |= CODING_MODE_LAST_BLOCK;
6955 string = (encodep
6956 ? encode_coding_string (string, &coding, !NILP (nocopy))
6957 : decode_coding_string (string, &coding, !NILP (nocopy)));
6958 Vlast_coding_system_used = coding.symbol;
6959
6960 return string;
6961 }
6962
6963 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6964 2, 3, 0,
6965 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6966 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6967 if the decoding operation is trivial.
6968 This function sets `last-coding-system-used' to the precise coding system
6969 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6970 not fully specified.) */)
6971 (string, coding_system, nocopy)
6972 Lisp_Object string, coding_system, nocopy;
6973 {
6974 return code_convert_string1 (string, coding_system, nocopy, 0);
6975 }
6976
6977 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6978 2, 3, 0,
6979 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6980 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6981 if the encoding operation is trivial.
6982 This function sets `last-coding-system-used' to the precise coding system
6983 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6984 not fully specified.) */)
6985 (string, coding_system, nocopy)
6986 Lisp_Object string, coding_system, nocopy;
6987 {
6988 return code_convert_string1 (string, coding_system, nocopy, 1);
6989 }
6990
6991 /* Encode or decode STRING according to CODING_SYSTEM.
6992 Do not set Vlast_coding_system_used.
6993
6994 This function is called only from macros DECODE_FILE and
6995 ENCODE_FILE, thus we ignore character composition. */
6996
6997 Lisp_Object
6998 code_convert_string_norecord (string, coding_system, encodep)
6999 Lisp_Object string, coding_system;
7000 int encodep;
7001 {
7002 struct coding_system coding;
7003
7004 CHECK_STRING (string);
7005 CHECK_SYMBOL (coding_system);
7006
7007 if (NILP (coding_system))
7008 return string;
7009
7010 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7011 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7012
7013 coding.composing = COMPOSITION_DISABLED;
7014 coding.mode |= CODING_MODE_LAST_BLOCK;
7015 return (encodep
7016 ? encode_coding_string (string, &coding, 1)
7017 : decode_coding_string (string, &coding, 1));
7018 }
7019 \f
7020 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7021 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7022 Return the corresponding character. */)
7023 (code)
7024 Lisp_Object code;
7025 {
7026 unsigned char c1, c2, s1, s2;
7027 Lisp_Object val;
7028
7029 CHECK_NUMBER (code);
7030 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7031 if (s1 == 0)
7032 {
7033 if (s2 < 0x80)
7034 XSETFASTINT (val, s2);
7035 else if (s2 >= 0xA0 || s2 <= 0xDF)
7036 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7037 else
7038 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7039 }
7040 else
7041 {
7042 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7043 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7044 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7045 DECODE_SJIS (s1, s2, c1, c2);
7046 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7047 }
7048 return val;
7049 }
7050
7051 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7052 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7053 Return the corresponding code in SJIS. */)
7054 (ch)
7055 Lisp_Object ch;
7056 {
7057 int charset, c1, c2, s1, s2;
7058 Lisp_Object val;
7059
7060 CHECK_NUMBER (ch);
7061 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7062 if (charset == CHARSET_ASCII)
7063 {
7064 val = ch;
7065 }
7066 else if (charset == charset_jisx0208
7067 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7068 {
7069 ENCODE_SJIS (c1, c2, s1, s2);
7070 XSETFASTINT (val, (s1 << 8) | s2);
7071 }
7072 else if (charset == charset_katakana_jisx0201
7073 && c1 > 0x20 && c2 < 0xE0)
7074 {
7075 XSETFASTINT (val, c1 | 0x80);
7076 }
7077 else
7078 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7079 return val;
7080 }
7081
7082 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7083 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7084 Return the corresponding character. */)
7085 (code)
7086 Lisp_Object code;
7087 {
7088 int charset;
7089 unsigned char b1, b2, c1, c2;
7090 Lisp_Object val;
7091
7092 CHECK_NUMBER (code);
7093 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7094 if (b1 == 0)
7095 {
7096 if (b2 >= 0x80)
7097 error ("Invalid BIG5 code: %x", XFASTINT (code));
7098 val = code;
7099 }
7100 else
7101 {
7102 if ((b1 < 0xA1 || b1 > 0xFE)
7103 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7104 error ("Invalid BIG5 code: %x", XFASTINT (code));
7105 DECODE_BIG5 (b1, b2, charset, c1, c2);
7106 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7107 }
7108 return val;
7109 }
7110
7111 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7112 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7113 Return the corresponding character code in Big5. */)
7114 (ch)
7115 Lisp_Object ch;
7116 {
7117 int charset, c1, c2, b1, b2;
7118 Lisp_Object val;
7119
7120 CHECK_NUMBER (ch);
7121 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7122 if (charset == CHARSET_ASCII)
7123 {
7124 val = ch;
7125 }
7126 else if ((charset == charset_big5_1
7127 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7128 || (charset == charset_big5_2
7129 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7130 {
7131 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7132 XSETFASTINT (val, (b1 << 8) | b2);
7133 }
7134 else
7135 error ("Can't encode to Big5: %d", XFASTINT (ch));
7136 return val;
7137 }
7138 \f
7139 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7140 Sset_terminal_coding_system_internal, 1, 1, 0,
7141 doc: /* Internal use only. */)
7142 (coding_system)
7143 Lisp_Object coding_system;
7144 {
7145 CHECK_SYMBOL (coding_system);
7146 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7147 /* We had better not send unsafe characters to terminal. */
7148 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7149 /* Character composition should be disabled. */
7150 terminal_coding.composing = COMPOSITION_DISABLED;
7151 /* Error notification should be suppressed. */
7152 terminal_coding.suppress_error = 1;
7153 terminal_coding.src_multibyte = 1;
7154 terminal_coding.dst_multibyte = 0;
7155 return Qnil;
7156 }
7157
7158 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7159 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7160 doc: /* Internal use only. */)
7161 (coding_system)
7162 Lisp_Object coding_system;
7163 {
7164 CHECK_SYMBOL (coding_system);
7165 setup_coding_system (Fcheck_coding_system (coding_system),
7166 &safe_terminal_coding);
7167 /* Character composition should be disabled. */
7168 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7169 /* Error notification should be suppressed. */
7170 terminal_coding.suppress_error = 1;
7171 safe_terminal_coding.src_multibyte = 1;
7172 safe_terminal_coding.dst_multibyte = 0;
7173 return Qnil;
7174 }
7175
7176 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7177 Sterminal_coding_system, 0, 0, 0,
7178 doc: /* Return coding system specified for terminal output. */)
7179 ()
7180 {
7181 return terminal_coding.symbol;
7182 }
7183
7184 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7185 Sset_keyboard_coding_system_internal, 1, 1, 0,
7186 doc: /* Internal use only. */)
7187 (coding_system)
7188 Lisp_Object coding_system;
7189 {
7190 CHECK_SYMBOL (coding_system);
7191 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7192 /* Character composition should be disabled. */
7193 keyboard_coding.composing = COMPOSITION_DISABLED;
7194 return Qnil;
7195 }
7196
7197 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7198 Skeyboard_coding_system, 0, 0, 0,
7199 doc: /* Return coding system specified for decoding keyboard input. */)
7200 ()
7201 {
7202 return keyboard_coding.symbol;
7203 }
7204
7205 \f
7206 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7207 Sfind_operation_coding_system, 1, MANY, 0,
7208 doc: /* Choose a coding system for an operation based on the target name.
7209 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7210 DECODING-SYSTEM is the coding system to use for decoding
7211 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7212 for encoding (in case OPERATION does encoding).
7213
7214 The first argument OPERATION specifies an I/O primitive:
7215 For file I/O, `insert-file-contents' or `write-region'.
7216 For process I/O, `call-process', `call-process-region', or `start-process'.
7217 For network I/O, `open-network-stream'.
7218
7219 The remaining arguments should be the same arguments that were passed
7220 to the primitive. Depending on which primitive, one of those arguments
7221 is selected as the TARGET. For example, if OPERATION does file I/O,
7222 whichever argument specifies the file name is TARGET.
7223
7224 TARGET has a meaning which depends on OPERATION:
7225 For file I/O, TARGET is a file name.
7226 For process I/O, TARGET is a process name.
7227 For network I/O, TARGET is a service name or a port number
7228
7229 This function looks up what specified for TARGET in,
7230 `file-coding-system-alist', `process-coding-system-alist',
7231 or `network-coding-system-alist' depending on OPERATION.
7232 They may specify a coding system, a cons of coding systems,
7233 or a function symbol to call.
7234 In the last case, we call the function with one argument,
7235 which is a list of all the arguments given to this function.
7236
7237 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7238 (nargs, args)
7239 int nargs;
7240 Lisp_Object *args;
7241 {
7242 Lisp_Object operation, target_idx, target, val;
7243 register Lisp_Object chain;
7244
7245 if (nargs < 2)
7246 error ("Too few arguments");
7247 operation = args[0];
7248 if (!SYMBOLP (operation)
7249 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7250 error ("Invalid first argument");
7251 if (nargs < 1 + XINT (target_idx))
7252 error ("Too few arguments for operation: %s",
7253 SDATA (SYMBOL_NAME (operation)));
7254 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7255 argument to write-region) is string, it must be treated as a
7256 target file name. */
7257 if (EQ (operation, Qwrite_region)
7258 && nargs > 5
7259 && STRINGP (args[5]))
7260 target_idx = make_number (4);
7261 target = args[XINT (target_idx) + 1];
7262 if (!(STRINGP (target)
7263 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7264 error ("Invalid argument %d", XINT (target_idx) + 1);
7265
7266 chain = ((EQ (operation, Qinsert_file_contents)
7267 || EQ (operation, Qwrite_region))
7268 ? Vfile_coding_system_alist
7269 : (EQ (operation, Qopen_network_stream)
7270 ? Vnetwork_coding_system_alist
7271 : Vprocess_coding_system_alist));
7272 if (NILP (chain))
7273 return Qnil;
7274
7275 for (; CONSP (chain); chain = XCDR (chain))
7276 {
7277 Lisp_Object elt;
7278 elt = XCAR (chain);
7279
7280 if (CONSP (elt)
7281 && ((STRINGP (target)
7282 && STRINGP (XCAR (elt))
7283 && fast_string_match (XCAR (elt), target) >= 0)
7284 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7285 {
7286 val = XCDR (elt);
7287 /* Here, if VAL is both a valid coding system and a valid
7288 function symbol, we return VAL as a coding system. */
7289 if (CONSP (val))
7290 return val;
7291 if (! SYMBOLP (val))
7292 return Qnil;
7293 if (! NILP (Fcoding_system_p (val)))
7294 return Fcons (val, val);
7295 if (! NILP (Ffboundp (val)))
7296 {
7297 val = call1 (val, Flist (nargs, args));
7298 if (CONSP (val))
7299 return val;
7300 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7301 return Fcons (val, val);
7302 }
7303 return Qnil;
7304 }
7305 }
7306 return Qnil;
7307 }
7308
7309 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7310 Supdate_coding_systems_internal, 0, 0, 0,
7311 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7312 When values of any coding categories are changed, you must
7313 call this function. */)
7314 ()
7315 {
7316 int i;
7317
7318 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7319 {
7320 Lisp_Object val;
7321
7322 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7323 if (!NILP (val))
7324 {
7325 if (! coding_system_table[i])
7326 coding_system_table[i] = ((struct coding_system *)
7327 xmalloc (sizeof (struct coding_system)));
7328 setup_coding_system (val, coding_system_table[i]);
7329 }
7330 else if (coding_system_table[i])
7331 {
7332 xfree (coding_system_table[i]);
7333 coding_system_table[i] = NULL;
7334 }
7335 }
7336
7337 return Qnil;
7338 }
7339
7340 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7341 Sset_coding_priority_internal, 0, 0, 0,
7342 doc: /* Update internal database for the current value of `coding-category-list'.
7343 This function is internal use only. */)
7344 ()
7345 {
7346 int i = 0, idx;
7347 Lisp_Object val;
7348
7349 val = Vcoding_category_list;
7350
7351 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7352 {
7353 if (! SYMBOLP (XCAR (val)))
7354 break;
7355 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7356 if (idx >= CODING_CATEGORY_IDX_MAX)
7357 break;
7358 coding_priorities[i++] = (1 << idx);
7359 val = XCDR (val);
7360 }
7361 /* If coding-category-list is valid and contains all coding
7362 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7363 the following code saves Emacs from crashing. */
7364 while (i < CODING_CATEGORY_IDX_MAX)
7365 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7366
7367 return Qnil;
7368 }
7369
7370 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7371 Sdefine_coding_system_internal, 1, 1, 0,
7372 doc: /* Register CODING-SYSTEM as a base coding system.
7373 This function is internal use only. */)
7374 (coding_system)
7375 Lisp_Object coding_system;
7376 {
7377 Lisp_Object safe_chars, slot;
7378
7379 if (NILP (Fcheck_coding_system (coding_system)))
7380 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7381 safe_chars = coding_safe_chars (coding_system);
7382 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7383 error ("No valid safe-chars property for %s",
7384 SDATA (SYMBOL_NAME (coding_system)));
7385 if (EQ (safe_chars, Qt))
7386 {
7387 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7388 XSETCAR (Vcoding_system_safe_chars,
7389 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7390 }
7391 else
7392 {
7393 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7394 if (NILP (slot))
7395 XSETCDR (Vcoding_system_safe_chars,
7396 nconc2 (XCDR (Vcoding_system_safe_chars),
7397 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7398 else
7399 XSETCDR (slot, safe_chars);
7400 }
7401 return Qnil;
7402 }
7403
7404 #endif /* emacs */
7405
7406 \f
7407 /*** 9. Post-amble ***/
7408
7409 void
7410 init_coding_once ()
7411 {
7412 int i;
7413
7414 /* Emacs' internal format specific initialize routine. */
7415 for (i = 0; i <= 0x20; i++)
7416 emacs_code_class[i] = EMACS_control_code;
7417 emacs_code_class[0x0A] = EMACS_linefeed_code;
7418 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7419 for (i = 0x21 ; i < 0x7F; i++)
7420 emacs_code_class[i] = EMACS_ascii_code;
7421 emacs_code_class[0x7F] = EMACS_control_code;
7422 for (i = 0x80; i < 0xFF; i++)
7423 emacs_code_class[i] = EMACS_invalid_code;
7424 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7425 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7426 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7427 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7428
7429 /* ISO2022 specific initialize routine. */
7430 for (i = 0; i < 0x20; i++)
7431 iso_code_class[i] = ISO_control_0;
7432 for (i = 0x21; i < 0x7F; i++)
7433 iso_code_class[i] = ISO_graphic_plane_0;
7434 for (i = 0x80; i < 0xA0; i++)
7435 iso_code_class[i] = ISO_control_1;
7436 for (i = 0xA1; i < 0xFF; i++)
7437 iso_code_class[i] = ISO_graphic_plane_1;
7438 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7439 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7440 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7441 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7442 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7443 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7444 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7445 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7446 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7447 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7448
7449 setup_coding_system (Qnil, &keyboard_coding);
7450 setup_coding_system (Qnil, &terminal_coding);
7451 setup_coding_system (Qnil, &safe_terminal_coding);
7452 setup_coding_system (Qnil, &default_buffer_file_coding);
7453
7454 bzero (coding_system_table, sizeof coding_system_table);
7455
7456 bzero (ascii_skip_code, sizeof ascii_skip_code);
7457 for (i = 0; i < 128; i++)
7458 ascii_skip_code[i] = 1;
7459
7460 #if defined (MSDOS) || defined (WINDOWSNT)
7461 system_eol_type = CODING_EOL_CRLF;
7462 #else
7463 system_eol_type = CODING_EOL_LF;
7464 #endif
7465
7466 inhibit_pre_post_conversion = 0;
7467 }
7468
7469 #ifdef emacs
7470
7471 void
7472 syms_of_coding ()
7473 {
7474 Qtarget_idx = intern ("target-idx");
7475 staticpro (&Qtarget_idx);
7476
7477 Qcoding_system_history = intern ("coding-system-history");
7478 staticpro (&Qcoding_system_history);
7479 Fset (Qcoding_system_history, Qnil);
7480
7481 /* Target FILENAME is the first argument. */
7482 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7483 /* Target FILENAME is the third argument. */
7484 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7485
7486 Qcall_process = intern ("call-process");
7487 staticpro (&Qcall_process);
7488 /* Target PROGRAM is the first argument. */
7489 Fput (Qcall_process, Qtarget_idx, make_number (0));
7490
7491 Qcall_process_region = intern ("call-process-region");
7492 staticpro (&Qcall_process_region);
7493 /* Target PROGRAM is the third argument. */
7494 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7495
7496 Qstart_process = intern ("start-process");
7497 staticpro (&Qstart_process);
7498 /* Target PROGRAM is the third argument. */
7499 Fput (Qstart_process, Qtarget_idx, make_number (2));
7500
7501 Qopen_network_stream = intern ("open-network-stream");
7502 staticpro (&Qopen_network_stream);
7503 /* Target SERVICE is the fourth argument. */
7504 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7505
7506 Qcoding_system = intern ("coding-system");
7507 staticpro (&Qcoding_system);
7508
7509 Qeol_type = intern ("eol-type");
7510 staticpro (&Qeol_type);
7511
7512 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7513 staticpro (&Qbuffer_file_coding_system);
7514
7515 Qpost_read_conversion = intern ("post-read-conversion");
7516 staticpro (&Qpost_read_conversion);
7517
7518 Qpre_write_conversion = intern ("pre-write-conversion");
7519 staticpro (&Qpre_write_conversion);
7520
7521 Qno_conversion = intern ("no-conversion");
7522 staticpro (&Qno_conversion);
7523
7524 Qundecided = intern ("undecided");
7525 staticpro (&Qundecided);
7526
7527 Qcoding_system_p = intern ("coding-system-p");
7528 staticpro (&Qcoding_system_p);
7529
7530 Qcoding_system_error = intern ("coding-system-error");
7531 staticpro (&Qcoding_system_error);
7532
7533 Fput (Qcoding_system_error, Qerror_conditions,
7534 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7535 Fput (Qcoding_system_error, Qerror_message,
7536 build_string ("Invalid coding system"));
7537
7538 Qcoding_category = intern ("coding-category");
7539 staticpro (&Qcoding_category);
7540 Qcoding_category_index = intern ("coding-category-index");
7541 staticpro (&Qcoding_category_index);
7542
7543 Vcoding_category_table
7544 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7545 staticpro (&Vcoding_category_table);
7546 {
7547 int i;
7548 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7549 {
7550 XVECTOR (Vcoding_category_table)->contents[i]
7551 = intern (coding_category_name[i]);
7552 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7553 Qcoding_category_index, make_number (i));
7554 }
7555 }
7556
7557 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7558 staticpro (&Vcoding_system_safe_chars);
7559
7560 Qtranslation_table = intern ("translation-table");
7561 staticpro (&Qtranslation_table);
7562 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7563
7564 Qtranslation_table_id = intern ("translation-table-id");
7565 staticpro (&Qtranslation_table_id);
7566
7567 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7568 staticpro (&Qtranslation_table_for_decode);
7569
7570 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7571 staticpro (&Qtranslation_table_for_encode);
7572
7573 Qsafe_chars = intern ("safe-chars");
7574 staticpro (&Qsafe_chars);
7575
7576 Qchar_coding_system = intern ("char-coding-system");
7577 staticpro (&Qchar_coding_system);
7578
7579 /* Intern this now in case it isn't already done.
7580 Setting this variable twice is harmless.
7581 But don't staticpro it here--that is done in alloc.c. */
7582 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7583 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7584 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7585
7586 Qvalid_codes = intern ("valid-codes");
7587 staticpro (&Qvalid_codes);
7588
7589 Qemacs_mule = intern ("emacs-mule");
7590 staticpro (&Qemacs_mule);
7591
7592 Qraw_text = intern ("raw-text");
7593 staticpro (&Qraw_text);
7594
7595 Qutf_8 = intern ("utf-8");
7596 staticpro (&Qutf_8);
7597
7598 defsubr (&Scoding_system_p);
7599 defsubr (&Sread_coding_system);
7600 defsubr (&Sread_non_nil_coding_system);
7601 defsubr (&Scheck_coding_system);
7602 defsubr (&Sdetect_coding_region);
7603 defsubr (&Sdetect_coding_string);
7604 defsubr (&Sfind_coding_systems_region_internal);
7605 defsubr (&Sunencodable_char_position);
7606 defsubr (&Sdecode_coding_region);
7607 defsubr (&Sencode_coding_region);
7608 defsubr (&Sdecode_coding_string);
7609 defsubr (&Sencode_coding_string);
7610 defsubr (&Sdecode_sjis_char);
7611 defsubr (&Sencode_sjis_char);
7612 defsubr (&Sdecode_big5_char);
7613 defsubr (&Sencode_big5_char);
7614 defsubr (&Sset_terminal_coding_system_internal);
7615 defsubr (&Sset_safe_terminal_coding_system_internal);
7616 defsubr (&Sterminal_coding_system);
7617 defsubr (&Sset_keyboard_coding_system_internal);
7618 defsubr (&Skeyboard_coding_system);
7619 defsubr (&Sfind_operation_coding_system);
7620 defsubr (&Supdate_coding_systems_internal);
7621 defsubr (&Sset_coding_priority_internal);
7622 defsubr (&Sdefine_coding_system_internal);
7623
7624 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7625 doc: /* List of coding systems.
7626
7627 Do not alter the value of this variable manually. This variable should be
7628 updated by the functions `make-coding-system' and
7629 `define-coding-system-alias'. */);
7630 Vcoding_system_list = Qnil;
7631
7632 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7633 doc: /* Alist of coding system names.
7634 Each element is one element list of coding system name.
7635 This variable is given to `completing-read' as TABLE argument.
7636
7637 Do not alter the value of this variable manually. This variable should be
7638 updated by the functions `make-coding-system' and
7639 `define-coding-system-alias'. */);
7640 Vcoding_system_alist = Qnil;
7641
7642 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7643 doc: /* List of coding-categories (symbols) ordered by priority.
7644
7645 On detecting a coding system, Emacs tries code detection algorithms
7646 associated with each coding-category one by one in this order. When
7647 one algorithm agrees with a byte sequence of source text, the coding
7648 system bound to the corresponding coding-category is selected. */);
7649 {
7650 int i;
7651
7652 Vcoding_category_list = Qnil;
7653 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7654 Vcoding_category_list
7655 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7656 Vcoding_category_list);
7657 }
7658
7659 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7660 doc: /* Specify the coding system for read operations.
7661 It is useful to bind this variable with `let', but do not set it globally.
7662 If the value is a coding system, it is used for decoding on read operation.
7663 If not, an appropriate element is used from one of the coding system alists:
7664 There are three such tables, `file-coding-system-alist',
7665 `process-coding-system-alist', and `network-coding-system-alist'. */);
7666 Vcoding_system_for_read = Qnil;
7667
7668 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7669 doc: /* Specify the coding system for write operations.
7670 Programs bind this variable with `let', but you should not set it globally.
7671 If the value is a coding system, it is used for encoding of output,
7672 when writing it to a file and when sending it to a file or subprocess.
7673
7674 If this does not specify a coding system, an appropriate element
7675 is used from one of the coding system alists:
7676 There are three such tables, `file-coding-system-alist',
7677 `process-coding-system-alist', and `network-coding-system-alist'.
7678 For output to files, if the above procedure does not specify a coding system,
7679 the value of `buffer-file-coding-system' is used. */);
7680 Vcoding_system_for_write = Qnil;
7681
7682 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7683 doc: /* Coding system used in the latest file or process I/O.
7684 Also set by `encode-coding-region', `decode-coding-region',
7685 `encode-coding-string' and `decode-coding-string'. */);
7686 Vlast_coding_system_used = Qnil;
7687
7688 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7689 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7690 See info node `Coding Systems' and info node `Text and Binary' concerning
7691 such conversion. */);
7692 inhibit_eol_conversion = 0;
7693
7694 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7695 doc: /* Non-nil means process buffer inherits coding system of process output.
7696 Bind it to t if the process output is to be treated as if it were a file
7697 read from some filesystem. */);
7698 inherit_process_coding_system = 0;
7699
7700 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7701 doc: /* Alist to decide a coding system to use for a file I/O operation.
7702 The format is ((PATTERN . VAL) ...),
7703 where PATTERN is a regular expression matching a file name,
7704 VAL is a coding system, a cons of coding systems, or a function symbol.
7705 If VAL is a coding system, it is used for both decoding and encoding
7706 the file contents.
7707 If VAL is a cons of coding systems, the car part is used for decoding,
7708 and the cdr part is used for encoding.
7709 If VAL is a function symbol, the function must return a coding system
7710 or a cons of coding systems which are used as above. The function gets
7711 the arguments with which `find-operation-coding-system' was called.
7712
7713 See also the function `find-operation-coding-system'
7714 and the variable `auto-coding-alist'. */);
7715 Vfile_coding_system_alist = Qnil;
7716
7717 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7718 doc: /* Alist to decide a coding system to use for a process I/O operation.
7719 The format is ((PATTERN . VAL) ...),
7720 where PATTERN is a regular expression matching a program name,
7721 VAL is a coding system, a cons of coding systems, or a function symbol.
7722 If VAL is a coding system, it is used for both decoding what received
7723 from the program and encoding what sent to the program.
7724 If VAL is a cons of coding systems, the car part is used for decoding,
7725 and the cdr part is used for encoding.
7726 If VAL is a function symbol, the function must return a coding system
7727 or a cons of coding systems which are used as above.
7728
7729 See also the function `find-operation-coding-system'. */);
7730 Vprocess_coding_system_alist = Qnil;
7731
7732 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7733 doc: /* Alist to decide a coding system to use for a network I/O operation.
7734 The format is ((PATTERN . VAL) ...),
7735 where PATTERN is a regular expression matching a network service name
7736 or is a port number to connect to,
7737 VAL is a coding system, a cons of coding systems, or a function symbol.
7738 If VAL is a coding system, it is used for both decoding what received
7739 from the network stream and encoding what sent to the network stream.
7740 If VAL is a cons of coding systems, the car part is used for decoding,
7741 and the cdr part is used for encoding.
7742 If VAL is a function symbol, the function must return a coding system
7743 or a cons of coding systems which are used as above.
7744
7745 See also the function `find-operation-coding-system'. */);
7746 Vnetwork_coding_system_alist = Qnil;
7747
7748 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7749 doc: /* Coding system to use with system messages.
7750 Also used for decoding keyboard input on X Window system. */);
7751 Vlocale_coding_system = Qnil;
7752
7753 /* The eol mnemonics are reset in startup.el system-dependently. */
7754 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7755 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7756 eol_mnemonic_unix = build_string (":");
7757
7758 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7759 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7760 eol_mnemonic_dos = build_string ("\\");
7761
7762 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7763 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7764 eol_mnemonic_mac = build_string ("/");
7765
7766 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7767 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7768 eol_mnemonic_undecided = build_string (":");
7769
7770 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7771 doc: /* *Non-nil enables character translation while encoding and decoding. */);
7772 Venable_character_translation = Qt;
7773
7774 DEFVAR_LISP ("standard-translation-table-for-decode",
7775 &Vstandard_translation_table_for_decode,
7776 doc: /* Table for translating characters while decoding. */);
7777 Vstandard_translation_table_for_decode = Qnil;
7778
7779 DEFVAR_LISP ("standard-translation-table-for-encode",
7780 &Vstandard_translation_table_for_encode,
7781 doc: /* Table for translating characters while encoding. */);
7782 Vstandard_translation_table_for_encode = Qnil;
7783
7784 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7785 doc: /* Alist of charsets vs revision numbers.
7786 While encoding, if a charset (car part of an element) is found,
7787 designate it with the escape sequence identifying revision (cdr part of the element). */);
7788 Vcharset_revision_alist = Qnil;
7789
7790 DEFVAR_LISP ("default-process-coding-system",
7791 &Vdefault_process_coding_system,
7792 doc: /* Cons of coding systems used for process I/O by default.
7793 The car part is used for decoding a process output,
7794 the cdr part is used for encoding a text to be sent to a process. */);
7795 Vdefault_process_coding_system = Qnil;
7796
7797 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7798 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7799 This is a vector of length 256.
7800 If Nth element is non-nil, the existence of code N in a file
7801 \(or output of subprocess) doesn't prevent it from being detected as
7802 a coding system of ISO 2022 variant which has a flag
7803 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7804 or reading output of a subprocess.
7805 Only 128th through 159th elements have a meaning. */);
7806 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7807
7808 DEFVAR_LISP ("select-safe-coding-system-function",
7809 &Vselect_safe_coding_system_function,
7810 doc: /* Function to call to select safe coding system for encoding a text.
7811
7812 If set, this function is called to force a user to select a proper
7813 coding system which can encode the text in the case that a default
7814 coding system used in each operation can't encode the text.
7815
7816 The default value is `select-safe-coding-system' (which see). */);
7817 Vselect_safe_coding_system_function = Qnil;
7818
7819 DEFVAR_BOOL ("coding-system-require-warning",
7820 &coding_system_require_warning,
7821 doc: /* Internal use only.
7822 If non-nil, on writing a file, `select-safe-coding-system-function' is
7823 called even if `coding-system-for-write' is non-nil. The command
7824 `universal-coding-system-argument' binds this variable to t temporarily. */);
7825 coding_system_require_warning = 0;
7826
7827
7828 DEFVAR_BOOL ("inhibit-iso-escape-detection",
7829 &inhibit_iso_escape_detection,
7830 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7831
7832 By default, on reading a file, Emacs tries to detect how the text is
7833 encoded. This code detection is sensitive to escape sequences. If
7834 the sequence is valid as ISO2022, the code is determined as one of
7835 the ISO2022 encodings, and the file is decoded by the corresponding
7836 coding system (e.g. `iso-2022-7bit').
7837
7838 However, there may be a case that you want to read escape sequences in
7839 a file as is. In such a case, you can set this variable to non-nil.
7840 Then, as the code detection ignores any escape sequences, no file is
7841 detected as encoded in some ISO2022 encoding. The result is that all
7842 escape sequences become visible in a buffer.
7843
7844 The default value is nil, and it is strongly recommended not to change
7845 it. That is because many Emacs Lisp source files that contain
7846 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7847 in Emacs's distribution, and they won't be decoded correctly on
7848 reading if you suppress escape sequence detection.
7849
7850 The other way to read escape sequences in a file without decoding is
7851 to explicitly specify some coding system that doesn't use ISO2022's
7852 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
7853 inhibit_iso_escape_detection = 0;
7854
7855 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7856 doc: /* Char table for translating self-inserting characters.
7857 This is applied to the result of input methods, not their input. See also
7858 `keyboard-translate-table'. */);
7859 Vtranslation_table_for_input = Qnil;
7860 }
7861
7862 char *
7863 emacs_strerror (error_number)
7864 int error_number;
7865 {
7866 char *str;
7867
7868 synchronize_system_messages_locale ();
7869 str = strerror (error_number);
7870
7871 if (! NILP (Vlocale_coding_system))
7872 {
7873 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7874 Vlocale_coding_system,
7875 0);
7876 str = (char *) SDATA (dec);
7877 }
7878
7879 return str;
7880 }
7881
7882 #endif /* emacs */
7883