]> code.delx.au - gnu-emacs/blob - src/coding.c
(server-visit-files): If `minibuffer-auto-raise' has been set to t, respect it.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7
8 This file is part of GNU Emacs.
9
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
24
25 /*** TABLE OF CONTENTS ***
26
27 0. General comments
28 1. Preamble
29 2. Emacs' internal format (emacs-mule) handlers
30 3. ISO2022 handlers
31 4. Shift-JIS and BIG5 handlers
32 5. CCL handlers
33 6. End-of-line handlers
34 7. C library functions
35 8. Emacs Lisp library functions
36 9. Post-amble
37
38 */
39
40 /*** 0. General comments ***/
41
42
43 /*** GENERAL NOTE on CODING SYSTEMS ***
44
45 A coding system is an encoding mechanism for one or more character
46 sets. Here's a list of coding systems which Emacs can handle. When
47 we say "decode", it means converting some other coding system to
48 Emacs' internal format (emacs-mule), and when we say "encode",
49 it means converting the coding system emacs-mule to some other
50 coding system.
51
52 0. Emacs' internal format (emacs-mule)
53
54 Emacs itself holds a multi-lingual character in buffers and strings
55 in a special format. Details are described in section 2.
56
57 1. ISO2022
58
59 The most famous coding system for multiple character sets. X's
60 Compound Text, various EUCs (Extended Unix Code), and coding
61 systems used in Internet communication such as ISO-2022-JP are
62 all variants of ISO2022. Details are described in section 3.
63
64 2. SJIS (or Shift-JIS or MS-Kanji-Code)
65
66 A coding system to encode character sets: ASCII, JISX0201, and
67 JISX0208. Widely used for PC's in Japan. Details are described in
68 section 4.
69
70 3. BIG5
71
72 A coding system to encode the character sets ASCII and Big5. Widely
73 used for Chinese (mainly in Taiwan and Hong Kong). Details are
74 described in section 4. In this file, when we write "BIG5"
75 (all uppercase), we mean the coding system, and when we write
76 "Big5" (capitalized), we mean the character set.
77
78 4. Raw text
79
80 A coding system for text containing random 8-bit code. Emacs does
81 no code conversion on such text except for end-of-line format.
82
83 5. Other
84
85 If a user wants to read/write text encoded in a coding system not
86 listed above, he can supply a decoder and an encoder for it as CCL
87 (Code Conversion Language) programs. Emacs executes the CCL program
88 while reading/writing.
89
90 Emacs represents a coding system by a Lisp symbol that has a property
91 `coding-system'. But, before actually using the coding system, the
92 information about it is set in a structure of type `struct
93 coding_system' for rapid processing. See section 6 for more details.
94
95 */
96
97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
98
99 How end-of-line of text is encoded depends on the operating system.
100 For instance, Unix's format is just one byte of `line-feed' code,
101 whereas DOS's format is two-byte sequence of `carriage-return' and
102 `line-feed' codes. MacOS's format is usually one byte of
103 `carriage-return'.
104
105 Since text character encoding and end-of-line encoding are
106 independent, any coding system described above can have any
107 end-of-line format. So Emacs has information about end-of-line
108 format in each coding-system. See section 6 for more details.
109
110 */
111
112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
113
114 These functions check if a text between SRC and SRC_END is encoded
115 in the coding system category XXX. Each returns an integer value in
116 which appropriate flag bits for the category XXX are set. The flag
117 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
118 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
119 of the range 0x80..0x9F are in multibyte form. */
120 #if 0
121 int
122 detect_coding_emacs_mule (src, src_end, multibytep)
123 unsigned char *src, *src_end;
124 int multibytep;
125 {
126 ...
127 }
128 #endif
129
130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
131
132 These functions decode SRC_BYTES length of unibyte text at SOURCE
133 encoded in CODING to Emacs' internal format. The resulting
134 multibyte text goes to a place pointed to by DESTINATION, the length
135 of which should not exceed DST_BYTES.
136
137 These functions set the information about original and decoded texts
138 in the members `produced', `produced_char', `consumed', and
139 `consumed_char' of the structure *CODING. They also set the member
140 `result' to one of CODING_FINISH_XXX indicating how the decoding
141 finished.
142
143 DST_BYTES zero means that the source area and destination area are
144 overlapped, which means that we can produce a decoded text until it
145 reaches the head of the not-yet-decoded source text.
146
147 Below is a template for these functions. */
148 #if 0
149 static void
150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
151 struct coding_system *coding;
152 const unsigned char *source;
153 unsigned char *destination;
154 int src_bytes, dst_bytes;
155 {
156 ...
157 }
158 #endif
159
160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
161
162 These functions encode SRC_BYTES length text at SOURCE from Emacs'
163 internal multibyte format to CODING. The resulting unibyte text
164 goes to a place pointed to by DESTINATION, the length of which
165 should not exceed DST_BYTES.
166
167 These functions set the information about original and encoded texts
168 in the members `produced', `produced_char', `consumed', and
169 `consumed_char' of the structure *CODING. They also set the member
170 `result' to one of CODING_FINISH_XXX indicating how the encoding
171 finished.
172
173 DST_BYTES zero means that the source area and destination area are
174 overlapped, which means that we can produce encoded text until it
175 reaches the head of the not-yet-encoded source text.
176
177 Below is a template for these functions. */
178 #if 0
179 static void
180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
181 struct coding_system *coding;
182 unsigned char *source, *destination;
183 int src_bytes, dst_bytes;
184 {
185 ...
186 }
187 #endif
188
189 /*** COMMONLY USED MACROS ***/
190
191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
192 get one and two bytes from the source text respectively.
193 If there are not enough bytes in the source, they jump to
194 `label_end_of_loop'. The caller should set variables `coding',
195 `src' and `src_end' to appropriate pointer in advance. These
196 macros are called from decoding routines `decode_coding_XXX', thus
197 it is assumed that the source text is unibyte. */
198
/* Fetch the next source byte into C1 and advance `src'.  When the
   source is exhausted, record CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop'.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
208
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past both.  When fewer than two bytes remain, consume nothing,
   record CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump
   to `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        c1 = *src++;                                            \
        c2 = *src++;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
219
220
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero: LEADING_CODE_8_BIT_CONTROL followed
   by a trailing byte is read as the single value (trailing byte -
   0x20).  If the source is exhausted -- either before the first byte
   or between the leading code and its trailing byte -- set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and return from
   the enclosing function with RET (no label is used).  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        return ret;                                             \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* Never fetch the trailing byte from beyond SRC_END.  */ \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            return ret;                                         \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
236
237 /* Set C to the next character at the source text pointed by `src'.
238 If there are not enough characters in the source, jump to
239 `label_end_of_loop'. The caller should set variables `coding'
240 `src', `src_end', and `translation_table' to appropriate pointers
241 in advance. This macro is used in encoding routines
242 `encode_coding_XXX', thus it assumes that the source text is in
243 multibyte form except for 8-bit characters. 8-bit characters are
244 in multibyte form if coding->src_multibyte is nonzero, else they
245 are represented by a single byte. */
246
/* Fetch the next source character into C, run it through
   `translation_table' when that is non-nil, and advance `src' past
   it.  Jumps to `label_end_of_loop' with
   CODING_FINISH_INSUFFICIENT_SRC when no source bytes remain.  A byte
   that does not start a multibyte sequence in unibyte source is taken
   as a one-byte character.  */
#define ONE_MORE_CHAR(c)                                                \
  do {                                                                  \
    int avail = src_end - src;                                          \
    int char_len;                                                       \
    if (avail <= 0)                                                     \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    if (! coding->src_multibyte                                         \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, avail, char_len))         \
      c = *src, char_len = 1;                                           \
    else                                                                \
      c = STRING_CHAR_AND_LENGTH (src, avail, char_len);                \
    if (!NILP (translation_table))                                      \
      c = translate_char (translation_table, c, -1, 0, 0);              \
    src += char_len;                                                    \
  } while (0)
265
266
267 /* Produce a multibyte form of character C to `dst'. Jump to
268 `label_end_of_loop' if there's not enough space at `dst'.
269
270 If we are now in the middle of a composition sequence, the decoded
271 character may be ALTCHAR (for the current composition). In that
272 case, the character goes to coding->cmp_data->data instead of
273 `dst'.
274
275 This macro is used in decoding routines. */
276
/* Emit character C.  Unless a composition other than the RELATIVE or
   WITH_RULE kind is in progress, C is written to `dst' in multibyte
   form and coding->produced_char is bumped; lack of room jumps to
   `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_DST.  While
   composing with any method but RELATIVE, C is also recorded as a
   composition component.  */
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int char_len = CHAR_BYTES (c);                                  \
        if ((dst + char_len) > (dst_bytes ? dst_end : src))             \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
301
302
/* Store byte C at `dst' and advance it.  The write limit is `dst_end'
   when `dst_bytes' is nonzero, otherwise `src' (source and
   destination overlap).  Without room, jump to `label_end_of_loop'
   with CODING_FINISH_INSUFFICIENT_DST.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
312
/* Store bytes C1 and C2 at `dst' and advance it by two.  The write
   limit is `dst_end' when `dst_bytes' is nonzero, otherwise `src'.
   Without room for both, write nothing and jump to
   `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_DST.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
322
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'.  FROM must be a modifiable pointer variable.  Without room
   for the whole range, copy nothing and jump to `label_end_of_loop'
   with CODING_FINISH_INSUFFICIENT_DST.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++)                                   \
      *dst++ = *from;                                           \
  } while (0)
333
334 \f
335 /*** 1. Preamble ***/
336
337 #ifdef emacs
338 #include <config.h>
339 #endif
340
341 #include <stdio.h>
342
343 #ifdef emacs
344
345 #include "lisp.h"
346 #include "buffer.h"
347 #include "charset.h"
348 #include "composite.h"
349 #include "ccl.h"
350 #include "coding.h"
351 #include "window.h"
352 #include "intervals.h"
353
354 #else /* not emacs */
355
356 #include "mulelib.h"
357
358 #endif /* not emacs */
359
360 Lisp_Object Qcoding_system, Qeol_type;
361 Lisp_Object Qbuffer_file_coding_system;
362 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
363 Lisp_Object Qno_conversion, Qundecided;
364 Lisp_Object Qcoding_system_history;
365 Lisp_Object Qsafe_chars;
366 Lisp_Object Qvalid_codes;
367 Lisp_Object Qascii_incompatible;
368
369 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
370 Lisp_Object Qcall_process, Qcall_process_region;
371 Lisp_Object Qstart_process, Qopen_network_stream;
372 Lisp_Object Qtarget_idx;
373
374 /* If a symbol has this property, evaluate the value to define the
375 symbol as a coding system. */
376 Lisp_Object Qcoding_system_define_form;
377
378 Lisp_Object Vselect_safe_coding_system_function;
379
380 int coding_system_require_warning;
381
382 /* Mnemonic string for each format of end-of-line. */
383 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
384 /* Mnemonic string to indicate format of end-of-line is not yet
385 decided. */
386 Lisp_Object eol_mnemonic_undecided;
387
388 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
389 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
390 This has an effect only for external encoding (i.e. for output to
391 file and process), not for in-buffer or Lisp string encoding. */
392 int system_eol_type;
393
394 #ifdef emacs
395
396 /* Information about which coding system is safe for which chars.
397 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
398
399 GENERIC-LIST is a list of generic coding systems which can encode
400 any characters.
401
402 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
403 corresponding char table that contains safe chars. */
404 Lisp_Object Vcoding_system_safe_chars;
405
406 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
407
408 Lisp_Object Qcoding_system_p, Qcoding_system_error;
409
410 /* Coding system emacs-mule and raw-text are for converting only
411 end-of-line format. */
412 Lisp_Object Qemacs_mule, Qraw_text;
413
414 Lisp_Object Qutf_8;
415
416 /* Coding-systems are handed between Emacs Lisp programs and C internal
417 routines by the following three variables. */
418 /* Coding-system for reading files and receiving data from process. */
419 Lisp_Object Vcoding_system_for_read;
420 /* Coding-system for writing files and sending data to process. */
421 Lisp_Object Vcoding_system_for_write;
422 /* Coding-system actually used in the latest I/O. */
423 Lisp_Object Vlast_coding_system_used;
424
425 /* A vector of length 256 which contains information about special
426 Latin codes (especially for dealing with Microsoft codes). */
427 Lisp_Object Vlatin_extra_code_table;
428
429 /* Flag to inhibit code conversion of end-of-line format. */
430 int inhibit_eol_conversion;
431
432 /* Flag to inhibit ISO2022 escape sequence detection. */
433 int inhibit_iso_escape_detection;
434
435 /* Flag to make buffer-file-coding-system inherit from process-coding. */
436 int inherit_process_coding_system;
437
438 /* Coding system to be used to encode text for terminal display. */
439 struct coding_system terminal_coding;
440
441 /* Coding system to be used to encode text for terminal display when
442 terminal coding system is nil. */
443 struct coding_system safe_terminal_coding;
444
445 /* Coding system of what is sent from terminal keyboard. */
446 struct coding_system keyboard_coding;
447
448 /* Default coding system to be used to write a file. */
449 struct coding_system default_buffer_file_coding;
450
451 Lisp_Object Vfile_coding_system_alist;
452 Lisp_Object Vprocess_coding_system_alist;
453 Lisp_Object Vnetwork_coding_system_alist;
454
455 Lisp_Object Vlocale_coding_system;
456
457 #endif /* emacs */
458
459 Lisp_Object Qcoding_category, Qcoding_category_index;
460
461 /* List of symbols `coding-category-xxx' ordered by priority. */
462 Lisp_Object Vcoding_category_list;
463
464 /* Table of coding categories (Lisp symbols). */
465 Lisp_Object Vcoding_category_table;
466
467 /* Table of the symbol name of each coding category. */
468 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
469 "coding-category-emacs-mule",
470 "coding-category-sjis",
471 "coding-category-iso-7",
472 "coding-category-iso-7-tight",
473 "coding-category-iso-8-1",
474 "coding-category-iso-8-2",
475 "coding-category-iso-7-else",
476 "coding-category-iso-8-else",
477 "coding-category-ccl",
478 "coding-category-big5",
479 "coding-category-utf-8",
480 "coding-category-utf-16-be",
481 "coding-category-utf-16-le",
482 "coding-category-raw-text",
483 "coding-category-binary"
484 };
485
486 /* Table of pointers to the coding system corresponding to each
487 coding category. */
488 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
489
490 /* Table of coding category masks. Nth element is a mask for a coding
491 category of which priority is Nth. */
492 static
493 int coding_priorities[CODING_CATEGORY_IDX_MAX];
494
495 /* Flag to tell if we look up translation table on character code
496 conversion. */
497 Lisp_Object Venable_character_translation;
498 /* Standard translation table to look up on decoding (reading). */
499 Lisp_Object Vstandard_translation_table_for_decode;
500 /* Standard translation table to look up on encoding (writing). */
501 Lisp_Object Vstandard_translation_table_for_encode;
502
503 Lisp_Object Qtranslation_table;
504 Lisp_Object Qtranslation_table_id;
505 Lisp_Object Qtranslation_table_for_decode;
506 Lisp_Object Qtranslation_table_for_encode;
507
508 /* Alist of charsets vs revision number. */
509 Lisp_Object Vcharset_revision_alist;
510
511 /* Default coding systems used for process I/O. */
512 Lisp_Object Vdefault_process_coding_system;
513
514 /* Char table for translating Quail and self-inserting input. */
515 Lisp_Object Vtranslation_table_for_input;
516
517 /* Global flag to tell that we can't call post-read-conversion and
518 pre-write-conversion functions. Usually the value is zero, but it
519 is set to 1 temporarily while such functions are running. This is
520 to avoid infinite recursive call. */
521 static int inhibit_pre_post_conversion;
522
523 Lisp_Object Qchar_coding_system;
524
525 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
526 its validity. */
527
528 Lisp_Object
529 coding_safe_chars (coding_system)
530 Lisp_Object coding_system;
531 {
532 Lisp_Object coding_spec, plist, safe_chars;
533
534 coding_spec = Fget (coding_system, Qcoding_system);
535 plist = XVECTOR (coding_spec)->contents[3];
536 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
537 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
538 }
539
/* Nonzero if character C is safe according to SAFE_CHARS, which is
   either Qt (every character is safe) or a char table whose entry for
   C is non-nil for a safe character.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)                       \
  (EQ (safe_chars, Qt)                                          \
   ? 1                                                          \
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
542
543 \f
544 /*** 2. Emacs internal format (emacs-mule) handlers ***/
545
546 /* Emacs' internal format for representation of multiple character
547 sets is a kind of multi-byte encoding, i.e. characters are
548 represented by variable-length sequences of one-byte codes.
549
550 ASCII characters and control characters (e.g. `tab', `newline') are
551 represented by one-byte sequences which are their ASCII codes, in
552 the range 0x00 through 0x7F.
553
554 8-bit characters of the range 0x80..0x9F are represented by
555 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
556 code + 0x20).
557
558 8-bit characters of the range 0xA0..0xFF are represented by
559 one-byte sequences which are their 8-bit code.
560
561 The other characters are represented by a sequence of `base
562 leading-code', optional `extended leading-code', and one or two
563 `position-code's. The length of the sequence is determined by the
564 base leading-code. Leading-code takes the range 0x81 through 0x9D,
565 whereas extended leading-code and position-code take the range 0xA0
566 through 0xFF. See `charset.h' for more details about leading-code
567 and position-code.
568
569 --- CODE RANGE of Emacs' internal format ---
570 character set range
571 ------------- -----
572 ascii 0x00..0x7F
573 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
574 eight-bit-graphic 0xA0..0xFF
575 ELSE 0x81..0x9D + [0xA0..0xFF]+
576 ---------------------------------------------
577
578 As this is the internal character representation, the format is
579 usually not used externally (i.e. in a file or in a data sent to a
580 process). But, it is possible to have a text externally in this
581 format (i.e. by encoding by the coding system `emacs-mule').
582
583 In that case, a sequence of one-byte codes has a slightly different
584 form.
585
586 Firstly, all characters in eight-bit-control are represented by
587 one-byte sequences which are their 8-bit code.
588
589 Next, character composition data are represented by the byte
590 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
591 where,
592 METHOD is 0xF0 plus one of composition method (enum
593 composition_method),
594
595 BYTES is 0xA0 plus the byte length of these composition data,
596
597 CHARS is 0xA0 plus the number of characters composed by these
598 data,
599
600 COMPONENTs are characters of multibyte form or composition
601 rules encoded by two-byte of ASCII codes.
602
603 In addition, for backward compatibility, the following formats are
604 also recognized as composition data on decoding.
605
606 0x80 MSEQ ...
607 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
608
609 Here,
610 MSEQ is a multibyte form but in these special format:
611 ASCII: 0xA0 ASCII_CODE+0x80,
612 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
613 RULE is a one byte code of the range 0xA0..0xF0 that
614 represents a composition rule.
615 */
616
617 enum emacs_code_class_type emacs_code_class[256];
618
619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
620 Check if a text is encoded in Emacs' internal format. If it is,
621 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
622
623 static int
624 detect_coding_emacs_mule (src, src_end, multibytep)
625 unsigned char *src, *src_end;
626 int multibytep;
627 {
628 unsigned char c;
629 int composing = 0;
630 /* Dummy for ONE_MORE_BYTE. */
631 struct coding_system dummy_coding;
632 struct coding_system *coding = &dummy_coding;
633
634 while (1)
635 {
636 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
637 CODING_CATEGORY_MASK_EMACS_MULE);
638 if (composing)
639 {
640 if (c < 0xA0)
641 composing = 0;
642 else if (c == 0xA0)
643 {
644 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
645 c &= 0x7F;
646 }
647 else
648 c -= 0x20;
649 }
650
651 if (c < 0x20)
652 {
653 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
654 return 0;
655 }
656 else if (c >= 0x80 && c < 0xA0)
657 {
658 if (c == 0x80)
659 /* Old leading code for a composite character. */
660 composing = 1;
661 else
662 {
663 unsigned char *src_base = src - 1;
664 int bytes;
665
666 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
667 bytes))
668 return 0;
669 src = src_base + bytes;
670 }
671 }
672 }
673 }
674
675
/* Record the starting position START and METHOD of one composition.
   Four slots are reserved at the current end of CODING->cmp_data:
   slot 0 is set to -1, slot 1 to START plus the data's char_offset,
   slot 2 is left untouched here, and slot 3 is set to METHOD.
   CODING->cmp_data_start is set to the index of slot 0.  Arguments
   are parenthesized so that any expression may be passed.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = (coding)->cmp_data;             \
    int *data = cmp_data->data + cmp_data->used;                        \
    (coding)->cmp_data_start = cmp_data->used;                          \
    data[0] = -1;                                                       \
    data[1] = cmp_data->char_offset + (start);                          \
    data[3] = (int) (method);                                           \
    cmp_data->used += 4;                                                \
  } while (0)
688
/* Record the ending position END of the current composition.  In the
   record that begins at CODING->cmp_data_start, slot 0 is set to the
   number of slots used since that start and slot 2 to END plus the
   data's char_offset.  Arguments are parenthesized so that any
   expression may be passed.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = (coding)->cmp_data;             \
    int *data = cmp_data->data + (coding)->cmp_data_start;              \
    data[0] = cmp_data->used - (coding)->cmp_data_start;                \
    data[2] = cmp_data->char_offset + (end);                            \
  } while (0)
698
/* Record one COMPONENT (alternate character or composition rule) at
   the end of CODING->cmp_data.  When the current record thereby
   reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, close it at
   CODING->produced_char and leave composing mode.  Arguments are
   parenthesized so that any expression may be passed.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    (coding)->cmp_data->data[(coding)->cmp_data->used++] = (component); \
    if ((coding)->cmp_data->used - (coding)->cmp_data_start             \
        == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, (coding)->produced_char);   \
        (coding)->composing = COMPOSITION_NO;                           \
      }                                                                 \
  } while (0)
711
712
/* Get one byte from the data pointed to by SRC and increment SRC.  If
   SRC is not less than SRC_END, yield -1 without incrementing SRC.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
717
718
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, or the source is
   exhausted, C ends up negative.  Uses `src', `src_end' and `coding'
   from the enclosing scope.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int seq_len;                                                        \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c >= 0)                                                         \
      {                                                                 \
        if (CHAR_HEAD_P (c)                                             \
            || (c != 0xA0 && ! BASE_LEADING_CODE_P (c - 0x20)))         \
          c = -1;                                                       \
        else if (c == 0xA0)                                             \
          {                                                             \
            /* ASCII: 0xA0 followed by ASCII_CODE+0x80.  */             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c >= 0xA0)                                              \
              {                                                         \
                c -= 0x80;                                              \
                *p++ = c;                                               \
              }                                                         \
            else                                                        \
              c = -1;                                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            /* Other: LEADING_CODE+0x20 FOLLOWING-BYTE ...  */          \
            unsigned char *seq_start = p;                               \
                                                                        \
            c -= 0x20;                                                  \
            *p++ = c;                                                   \
            seq_len = BYTES_BY_CHAR_HEAD (c);                           \
            while (--seq_len)                                           \
              {                                                         \
                c = SAFE_ONE_MORE_BYTE ();                              \
                if (c < 0)                                              \
                  break;                                                \
                *p++ = c;                                               \
              }                                                         \
            if (UNIBYTE_STR_AS_MULTIBYTE_P (seq_start, p - seq_start,   \
                                            seq_len)                    \
                || (coding->flags /* We are recovering a file.  */      \
                    && seq_start[0] == LEADING_CODE_8_BIT_CONTROL       \
                    && ! CHAR_HEAD_P (seq_start[1])))                   \
              c = STRING_CHAR (seq_start, seq_len);                     \
            else                                                        \
              c = -1;                                                   \
          }                                                             \
      }                                                                 \
  } while (0)
769
770
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to the rule.  If no valid
   rule is found (including when the source is exhausted), set C to
   -1.  Uses `gref' and `nref' from the enclosing scope as scratch
   variables.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
787
788
789 /* Decode composition sequence encoded by `emacs-mule' at the source
790 pointed by SRC. SRC_END is the end of source. Store information
791 of the composition in CODING->cmp_data.
792
793 For backward compatibility, decode also a composition sequence of
794 Emacs 20 style. In that case, the composition sequence contains
795 characters that should be extracted into a buffer or string. Store
796 those characters at *DESTINATION in multibyte form.
797
798 If we encounter an invalid byte sequence, return 0.
799 If we encounter an insufficient source or destination, or
800 insufficient space in CODING->cmp_data, return 1.
801 Otherwise, return consumed bytes in the source.
802
803 */
804 static INLINE int
805 decode_composition_emacs_mule (coding, src, src_end,
806 destination, dst_end, dst_bytes)
807 struct coding_system *coding;
808 const unsigned char *src, *src_end;
809 unsigned char **destination, *dst_end;
810 int dst_bytes;
811 {
812 unsigned char *dst = *destination;
813 int method, data_len, nchars;
814 const unsigned char *src_base = src++;
815 /* Store components of composition. */
816 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
817 int ncomponent;
818 /* Store multibyte form of characters to be composed. This is for
819 Emacs 20 style composition sequence. */
820 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
821 unsigned char *bufp = buf;
822 int c, i, gref, nref;
823
824 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
825 >= COMPOSITION_DATA_SIZE)
826 {
827 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
828 return -1;
829 }
830
831 ONE_MORE_BYTE (c);
832 if (c - 0xF0 >= COMPOSITION_RELATIVE
833 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
834 {
835 int with_rule;
836
837 method = c - 0xF0;
838 with_rule = (method == COMPOSITION_WITH_RULE
839 || method == COMPOSITION_WITH_RULE_ALTCHARS);
840 ONE_MORE_BYTE (c);
841 data_len = c - 0xA0;
842 if (data_len < 4
843 || src_base + data_len > src_end)
844 return 0;
845 ONE_MORE_BYTE (c);
846 nchars = c - 0xA0;
847 if (c < 1)
848 return 0;
849 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
850 {
851 /* If it is longer than this, it can't be valid. */
852 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
853 return 0;
854
855 if (ncomponent % 2 && with_rule)
856 {
857 ONE_MORE_BYTE (gref);
858 gref -= 32;
859 ONE_MORE_BYTE (nref);
860 nref -= 32;
861 c = COMPOSITION_ENCODE_RULE (gref, nref);
862 }
863 else
864 {
865 int bytes;
866 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
867 || (coding->flags /* We are recovering a file. */
868 && src[0] == LEADING_CODE_8_BIT_CONTROL
869 && ! CHAR_HEAD_P (src[1])))
870 c = STRING_CHAR (src, bytes);
871 else
872 c = *src, bytes = 1;
873 src += bytes;
874 }
875 component[ncomponent] = c;
876 }
877 }
878 else if (c >= 0x80)
879 {
880 /* This may be an old Emacs 20 style format. See the comment at
881 the section 2 of this file. */
882 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
883 if (src == src_end
884 && !(coding->mode & CODING_MODE_LAST_BLOCK))
885 goto label_end_of_loop;
886
887 src_end = src;
888 src = src_base + 1;
889 if (c < 0xC0)
890 {
891 method = COMPOSITION_RELATIVE;
892 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
893 {
894 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
895 if (c < 0)
896 break;
897 component[ncomponent++] = c;
898 }
899 if (ncomponent < 2)
900 return 0;
901 nchars = ncomponent;
902 }
903 else if (c == 0xFF)
904 {
905 method = COMPOSITION_WITH_RULE;
906 src++;
907 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
908 if (c < 0)
909 return 0;
910 component[0] = c;
911 for (ncomponent = 1;
912 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
913 {
914 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
915 if (c < 0)
916 break;
917 component[ncomponent++] = c;
918 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
919 if (c < 0)
920 break;
921 component[ncomponent++] = c;
922 }
923 if (ncomponent < 3)
924 return 0;
925 nchars = (ncomponent + 1) / 2;
926 }
927 else
928 return 0;
929 }
930 else
931 return 0;
932
933 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
934 {
935 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
936 for (i = 0; i < ncomponent; i++)
937 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
938 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
939 if (buf < bufp)
940 {
941 unsigned char *p = buf;
942 EMIT_BYTES (p, bufp);
943 *destination += bufp - buf;
944 coding->produced_char += nchars;
945 }
946 return (src - src_base);
947 }
948 label_end_of_loop:
949 return -1;
950 }
951
952 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
953
/* Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
   DESTINATION, one character per loop iteration.  On return
   CODING->consumed and CODING->consumed_char hold the number of
   source bytes processed, CODING->produced the number of bytes
   written, and CODING->produced_char the number of characters
   written.  CODING->result is set when decoding stops because of an
   inconsistent end-of-line or insufficient destination space.

   When DST_BYTES is zero the destination limit is the current source
   pointer instead of DST_END (see the test near the end of the
   loop); presumably this is the case where source and destination
   share one buffer -- TODO confirm against the callers.  */
954 static void
955 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
956 struct coding_system *coding;
957 const unsigned char *source;
958 unsigned char *destination;
959 int src_bytes, dst_bytes;
960 {
961 const unsigned char *src = source;
962 const unsigned char *src_end = source + src_bytes;
963 unsigned char *dst = destination;
964 unsigned char *dst_end = destination + dst_bytes;
965 /* SRC_BASE remembers the start position in source in each loop.
966 The loop will be exited when there's not enough source code, or
967 when there's not enough destination area to produce a
968 character. */
969 const unsigned char *src_base;
970
971 coding->produced_char = 0;
972 while ((src_base = src) < src_end)
973 {
/* TMP holds the multibyte form built by CHAR_STRING for a byte that
   cannot be copied as is; P points at the BYTES bytes to emit.  */
974 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
975 const unsigned char *p;
976 int bytes;
977
978 if (*src == '\r')
979 {
980 int c = *src++;
981
/* A lone CR is a newline for CODING_EOL_CR.  For CODING_EOL_CRLF the
   CR is dropped only when the next byte is LF; otherwise the extra
   byte is pushed back and the CR is emitted literally.  */
982 if (coding->eol_type == CODING_EOL_CR)
983 c = '\n';
984 else if (coding->eol_type == CODING_EOL_CRLF)
985 {
986 ONE_MORE_BYTE (c);
987 if (c != '\n')
988 {
989 src--;
990 c = '\r';
991 }
992 }
/* NOTE(review): this store (and the one in the '\n' branch below) is
   not preceded by the destination-space test used for other
   characters -- confirm that callers always leave room.  */
993 *dst++ = c;
994 coding->produced_char++;
995 continue;
996 }
997 else if (*src == '\n')
998 {
/* A bare LF in text declared as CR or CRLF is an inconsistency; stop
   here when the caller asked for such inconsistencies to be
   reported.  */
999 if ((coding->eol_type == CODING_EOL_CR
1000 || coding->eol_type == CODING_EOL_CRLF)
1001 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1002 {
1003 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1004 goto label_end_of_loop;
1005 }
1006 *dst++ = *src++;
1007 coding->produced_char++;
1008 continue;
1009 }
1010 else if (*src == 0x80 && coding->cmp_data)
1011 {
1012 /* Start of composition data. */
1013 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1014 &dst, dst_end,
1015 dst_bytes);
/* A negative result stops decoding, a positive one is the number of
   source bytes the helper handled, and zero means the 0x80 did not
   introduce a composition, so it is emitted as an ordinary byte
   converted by CHAR_STRING.  */
1016 if (consumed < 0)
1017 goto label_end_of_loop;
1018 else if (consumed > 0)
1019 {
1020 src += consumed;
1021 continue;
1022 }
1023 bytes = CHAR_STRING (*src, tmp);
1024 p = tmp;
1025 src++;
1026 }
/* NOTE(review): BYTES is passed to UNIBYTE_STR_AS_MULTIBYTE_P and read
   afterwards, so the macro presumably stores the sequence length in
   it.  The src[1] read in the second alternative is not guarded by a
   comparison against SRC_END -- confirm it cannot run past the
   source.  */
1027 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1028 || (coding->flags /* We are recovering a file. */
1029 && src[0] == LEADING_CODE_8_BIT_CONTROL
1030 && ! CHAR_HEAD_P (src[1])))
1031 {
1032 p = src;
1033 src += bytes;
1034 }
1035 else
1036 {
1037 int i, c;
1038
/* Take the length announced by the leading byte and check that every
   following byte is a trailing byte (not a character head).  */
1039 bytes = BYTES_BY_CHAR_HEAD (*src);
1040 src++;
1041 for (i = 1; i < bytes; i++)
1042 {
1043 ONE_MORE_BYTE (c);
1044 if (CHAR_HEAD_P (c))
1045 break;
1046 }
1047 if (i < bytes)
1048 {
/* The sequence was cut short by another character head: emit only the
   first byte (converted by CHAR_STRING) and resume right after it.  */
1049 bytes = CHAR_STRING (*src_base, tmp);
1050 p = tmp;
1051 src = src_base + 1;
1052 }
1053 else
1054 {
1055 p = src_base;
1056 }
1057 }
/* The limit is DST_END when DST_BYTES is nonzero, otherwise the
   current source pointer.  */
1058 if (dst + bytes >= (dst_bytes ? dst_end : src))
1059 {
1060 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1061 break;
1062 }
1063 while (bytes--) *dst++ = *p++;
1064 coding->produced_char++;
1065 }
/* SRC_BASE, not SRC, is reported as consumed, so a character that was
   only partly processed when the loop stopped is not counted.  */
1066 label_end_of_loop:
1067 coding->consumed = coding->consumed_char = src_base - source;
1068 coding->produced = dst - destination;
1069 }
1070
1071
1072 /* Encode composition data stored at DATA into a special byte sequence
1073 starting by 0x80. Update CODING->cmp_data_start and maybe
1074 CODING->cmp_data for the next call. */
1075
/* Layout of the emitted sequence, as built below:
     buf[0]   0x80
     buf[1]   0xF0 + DATA[3]              (METHOD)
     buf[2]   0xA0 + total sequence bytes (COMPONENTS-BYTES)
     buf[3]   0xA0 + (DATA[2] - DATA[1])  (COMPOSED-CHARS)
     buf[4..] the components, each written by CHAR_STRING; for the two
              WITH_RULE methods every component after the first is
              preceded by two rule bytes, 0x20 + GREF and 0x20 + NREF.
   DATA[0] is the length of this entry in DATA.

   The macro uses DST, DST_END, DST_BYTES, SRC and the label
   label_end_of_loop from the expansion site; when the destination is
   too small it sets CODING->result to CODING_FINISH_INSUFFICIENT_DST
   and jumps to that label.

   NOTE(review): BUF is a fixed 1024-byte array and P is advanced
   without a bound check; presumably the size of one DATA entry is
   limited elsewhere -- confirm.  The "+ 4" in the space test reserves
   more than the bytes copied here; the reason is not visible in this
   part of the file.  */
1076 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1077 do { \
1078 unsigned char buf[1024], *p0 = buf, *p; \
1079 int len = data[0]; \
1080 int i; \
1081 \
1082 buf[0] = 0x80; \
1083 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1084 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1085 p = buf + 4; \
1086 if (data[3] == COMPOSITION_WITH_RULE \
1087 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1088 { \
1089 p += CHAR_STRING (data[4], p); \
1090 for (i = 5; i < len; i += 2) \
1091 { \
1092 int gref, nref; \
1093 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1094 *p++ = 0x20 + gref; \
1095 *p++ = 0x20 + nref; \
1096 p += CHAR_STRING (data[i + 1], p); \
1097 } \
1098 } \
1099 else \
1100 { \
1101 for (i = 4; i < len; i++) \
1102 p += CHAR_STRING (data[i], p); \
1103 } \
1104 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1105 \
1106 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1107 { \
1108 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1109 goto label_end_of_loop; \
1110 } \
1111 while (p0 < p) \
1112 *dst++ = *p0++; \
1113 coding->cmp_data_start += data[0]; \
1114 if (coding->cmp_data_start == coding->cmp_data->used \
1115 && coding->cmp_data->next) \
1116 { \
1117 coding->cmp_data = coding->cmp_data->next; \
1118 coding->cmp_data_start = 0; \
1119 } \
1120 } while (0)
1121
1122
/* Forward declaration: encode_coding_emacs_mule hands the whole job to
   encode_eol when there is no composition data to encode.  */
1123 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1124 unsigned char *, int, int));
1125
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION in emacs-mule
   format, inserting the composition sequences described by
   CODING->cmp_data in front of the characters at which they start and
   converting newlines according to CODING->eol_type.  On return
   CODING->consumed is the number of source bytes processed,
   CODING->consumed_char the number of characters processed, and
   CODING->produced and CODING->produced_char the number of bytes
   written.

   The main loop is `while (1)'; it is left only through
   label_end_of_loop, which is reached from
   ENCODE_COMPOSITION_EMACS_MULE and presumably from ONE_MORE_CHAR and
   the EMIT_* macros when source or destination runs out (those macros
   are defined elsewhere -- confirm).  */
1126 static void
1127 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1128 struct coding_system *coding;
1129 const unsigned char *source;
1130 unsigned char *destination;
1131 int src_bytes, dst_bytes;
1132 {
1133 const unsigned char *src = source;
1134 const unsigned char *src_end = source + src_bytes;
1135 unsigned char *dst = destination;
1136 unsigned char *dst_end = destination + dst_bytes;
1137 const unsigned char *src_base;
1138 int c;
1139 int char_offset;
1140 int *data;
1141
/* TRANSLATION_TABLE is not referenced by name in the statements below;
   presumably ONE_MORE_CHAR expects a variable of this name at the
   expansion site -- TODO confirm.  It is nil here.  */
1142 Lisp_Object translation_table;
1143
1144 translation_table = Qnil;
1145
1146 /* Optimization for the case that there's no composition. */
1147 if (!coding->cmp_data || coding->cmp_data->used == 0)
1148 {
1149 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1150 return;
1151 }
1152
/* DATA points at the next composition entry still to be emitted;
   DATA[1] is compared below with the position of the current
   character (CHAR_OFFSET plus the characters consumed so far).  */
1153 char_offset = coding->cmp_data->char_offset;
1154 data = coding->cmp_data->data + coding->cmp_data_start;
1155 while (1)
1156 {
1157 src_base = src;
1158
1159 /* If SRC starts a composition, encode the information about the
1160 composition in advance. */
1161 if (coding->cmp_data_start < coding->cmp_data->used
1162 && char_offset + coding->consumed_char == data[1])
1163 {
1164 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced to the next composition block, so reload
   the cached offset and entry pointer.  */
1165 char_offset = coding->cmp_data->char_offset;
1166 data = coding->cmp_data->data + coding->cmp_data_start;
1167 }
1168
1169 ONE_MORE_CHAR (c);
1170 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1171 || coding->eol_type == CODING_EOL_CR))
1172 {
/* Newline becomes CR LF or a lone CR, depending on the eol type.  */
1173 if (coding->eol_type == CODING_EOL_CRLF)
1174 EMIT_TWO_BYTES ('\r', c);
1175 else
1176 EMIT_ONE_BYTE ('\r');
1177 }
1178 else if (SINGLE_BYTE_CHAR_P (c))
1179 {
1180 if (coding->flags && ! ASCII_BYTE_P (c))
1181 {
1182 /* As we are auto saving, retain the multibyte form for
1183 8-bit chars. */
1184 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1185 int bytes = CHAR_STRING (c, buf);
1186
1187 if (bytes == 1)
1188 EMIT_ONE_BYTE (buf[0]);
1189 else
1190 EMIT_TWO_BYTES (buf[0], buf[1]);
1191 }
1192 else
1193 EMIT_ONE_BYTE (c);
1194 }
1195 else
/* Any other character is copied in its source byte form.  */
1196 EMIT_BYTES (src_base, src);
1197 coding->consumed_char++;
1198 }
/* SRC_BASE marks the start of the character being processed when the
   loop was left, so that character is not counted as consumed.  */
1199 label_end_of_loop:
1200 coding->consumed = src_base - source;
1201 coding->produced = coding->produced_char = dst - destination;
1202 return;
1203 }
1204
1205 \f
1206 /*** 3. ISO2022 handlers ***/
1207
1208 /* The following note describes the coding system ISO2022 briefly.
1209 Since the intention of this note is to help understand the
1210 functions in this file, some parts are NOT ACCURATE or are OVERLY
1211 SIMPLIFIED. For thorough understanding, please refer to the
1212 original document of ISO2022. This is equivalent to the standard
1213 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1214
1215 ISO2022 provides many mechanisms to encode several character sets
1216 in 7-bit and 8-bit environments. For 7-bit environments, all text
1217 is encoded using bytes less than 128. This may make the encoded
1218 text a little bit longer, but the text passes more easily through
1219 several types of gateway, some of which strip off the MSB (Most
1220 Significant Bit).
1221
1222 There are two kinds of character sets: control character sets and
1223 graphic character sets. The former contain control characters such
1224 as `newline' and `escape' to provide control functions (control
1225 functions are also provided by escape sequences). The latter
1226 contain graphic characters such as 'A' and '-'. Emacs recognizes
1227 two control character sets and many graphic character sets.
1228
1229 Graphic character sets are classified into one of the following
1230 four classes, according to the number of bytes (DIMENSION) and
1231 number of characters in one dimension (CHARS) of the set:
1232 - DIMENSION1_CHARS94
1233 - DIMENSION1_CHARS96
1234 - DIMENSION2_CHARS94
1235 - DIMENSION2_CHARS96
1236
1237 In addition, each character set is assigned an identification tag,
1238 unique for each set, called the "final character" (denoted as <F>
1239 hereafter). The <F> of each character set is decided by ECMA(*)
1240 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1241 (0x30..0x3F are for private use only).
1242
1243 Note (*): ECMA = European Computer Manufacturers Association
1244
1245 Here are examples of graphic character sets [NAME(<F>)]:
1246 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1247 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1248 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1249 o DIMENSION2_CHARS96 -- none for the moment
1250
1251 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1252 C0 [0x00..0x1F] -- control character plane 0
1253 GL [0x20..0x7F] -- graphic character plane 0
1254 C1 [0x80..0x9F] -- control character plane 1
1255 GR [0xA0..0xFF] -- graphic character plane 1
1256
1257 A control character set is directly designated and invoked to C0 or
1258 C1 by an escape sequence. The most common case is that:
1259 - ISO646's control character set is designated/invoked to C0, and
1260 - ISO6429's control character set is designated/invoked to C1,
1261 and usually these designations/invocations are omitted in encoded
1262 text. In a 7-bit environment, only C0 can be used, and a control
1263 character for C1 is encoded by an appropriate escape sequence to
1264 fit into the environment. All control characters for C1 are
1265 defined to have corresponding escape sequences.
1266
1267 A graphic character set is at first designated to one of four
1268 graphic registers (G0 through G3), then these graphic registers are
1269 invoked to GL or GR. These designations and invocations can be
1270 done independently. The most common case is that G0 is invoked to
1271 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1272 these invocations and designations are omitted in encoded text.
1273 In a 7-bit environment, only GL can be used.
1274
1275 When a graphic character set of CHARS94 is invoked to GL, codes
1276 0x20 and 0x7F of the GL area work as control characters SPACE and
1277 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1278 be used.
1279
1280 There are two ways of invocation: locking-shift and single-shift.
1281 With locking-shift, the invocation lasts until the next different
1282 invocation, whereas with single-shift, the invocation affects the
1283 following character only and doesn't affect the locking-shift
1284 state. Invocations are done by the following control characters or
1285 escape sequences:
1286
1287 ----------------------------------------------------------------------
1288 abbrev function cntrl escape seq description
1289 ----------------------------------------------------------------------
1290 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1291 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1292 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1293 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1294 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1295 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1296 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1297 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1298 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1299 ----------------------------------------------------------------------
1300 (*) These are not used by any known coding system.
1301
1302 Control characters for these functions are defined by macros
1303 ISO_CODE_XXX in `coding.h'.
1304
1305 Designations are done by the following escape sequences:
1306 ----------------------------------------------------------------------
1307 escape sequence description
1308 ----------------------------------------------------------------------
1309 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1310 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1311 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1312 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1313 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1314 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1315 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1316 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1317 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1318 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1319 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1320 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1321 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1322 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1323 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1324 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1325 ----------------------------------------------------------------------
1326
1327 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1328 of dimension 1, chars 94, and final character <F>, etc...
1329
1330 Note (*): Although these designations are not allowed in ISO2022,
1331 Emacs accepts them on decoding, and produces them on encoding
1332 CHARS96 character sets in a coding system which is characterized as
1333 7-bit environment, non-locking-shift, and non-single-shift.
1334
1335 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1336 '(' can be omitted. We refer to this as "short-form" hereafter.
1337
1338 Now you may notice that there are a lot of ways of encoding the
1339 same multilingual text in ISO2022. Actually, there exist many
1340 coding systems such as Compound Text (used in X11's inter client
1341 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1342 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1343 localized platforms), and all of these are variants of ISO2022.
1344
1345 In addition to the above, Emacs handles two more kinds of escape
1346 sequences: ISO6429's direction specification and Emacs' private
1347 sequence for specifying character composition.
1348
1349 ISO6429's direction specification takes the following form:
1350 o CSI ']' -- end of the current direction
1351 o CSI '0' ']' -- end of the current direction
1352 o CSI '1' ']' -- start of left-to-right text
1353 o CSI '2' ']' -- start of right-to-left text
1354 The control character CSI (0x9B: control sequence introducer) is
1355 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1356
1357 Character composition specification takes the following form:
1358 o ESC '0' -- start relative composition
1359 o ESC '1' -- end composition
1360 o ESC '2' -- start rule-base composition (*)
1361 o ESC '3' -- start relative composition with alternate chars (**)
1362 o ESC '4' -- start rule-base composition with alternate chars (**)
1363 Since these are not standard escape sequences of any ISO standard,
1364 the use of them with these meanings is restricted to Emacs only.
1365
1366 (*) This form is used only in Emacs 20.5 and older versions,
1367 but the newer versions can safely decode it.
1368 (**) This form is used only in Emacs 21.1 and newer versions,
1369 and the older versions can't decode it.
1370
1371 Here's a list of example usages of these composition escape
1372 sequences (categorized by `enum composition_method').
1373
1374 COMPOSITION_RELATIVE:
1375 ESC 0 CHAR [ CHAR ] ESC 1
1376 COMPOSITION_WITH_RULE:
1377 ESC 2 CHAR [ RULE CHAR ] ESC 1
1378 COMPOSITION_WITH_ALTCHARS:
1379 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1380 COMPOSITION_WITH_RULE_ALTCHARS:
1381 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1382
/* Code class of each of the 256 byte values.  The ISO2022 decoder
   dispatches on iso_code_class[c1] for every byte it reads; the table
   is filled in elsewhere (not visible in this part of the file).  */
1383 enum iso_code_class_type iso_code_class[256];
1384
/* Nonzero if the coding system registered in coding_system_table[IDX]
   exists, accepts character C of CHARSET (CHARSET is ASCII, or C is
   in the coding system's safe-chars table), and has some requested
   designation for CHARSET.

   The expansion assigns to a variable named `safe_chars', which must
   be declared at the expansion site.  CHARSET is parenthesized in the
   comparison so that an expression argument (e.g. `a & b') is
   compared as a whole.  */
#define CHARSET_OK(idx, charset, c)					\
  (coding_system_table[idx]						\
   && ((charset) == CHARSET_ASCII					\
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
	   CODING_SAFE_CHAR_P (safe_chars, c)))				\
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
					      charset)			\
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))

/* Nonzero if the coding system in coding_system_table[IDX] has an
   initial designation (a value >= 0) for graphic register 1.  The
   table entry is dereferenced without a NULL check.  */
#define SHIFT_OUT_OK(idx) \
  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)

/* Nonzero unless composition is disabled for the coding system in
   coding_system_table[IDX].  The table entry is dereferenced without
   a NULL check.  */
#define COMPOSITION_OK(idx)	\
  (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1399
1400 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1401 Check if a text is encoded in ISO2022. If it is, return an
1402 integer in which appropriate flag bits any of:
1403 CODING_CATEGORY_MASK_ISO_7
1404 CODING_CATEGORY_MASK_ISO_7_TIGHT
1405 CODING_CATEGORY_MASK_ISO_8_1
1406 CODING_CATEGORY_MASK_ISO_8_2
1407 CODING_CATEGORY_MASK_ISO_7_ELSE
1408 CODING_CATEGORY_MASK_ISO_8_ELSE
1409 are set. If a code which should never appear in ISO2022 is found,
1410 returns 0. */
1411
/* The text examined is the bytes from SRC up to SRC_END; MULTIBYTEP is
   passed through to ONE_MORE_BYTE_CHECK_MULTIBYTE.

   Two bit sets are maintained.  MASK starts with every ISO category
   and loses a category whenever evidence against it is seen;
   MASK_FOUND collects the categories for which positive evidence was
   seen.  The result is their intersection.  The loop ends when MASK
   becomes empty or, presumably, when ONE_MORE_BYTE_CHECK_MULTIBYTE
   runs out of input (the macro is defined elsewhere; its third
   argument looks like the value to return in that case -- TODO
   confirm).  */
1412 static int
1413 detect_coding_iso2022 (src, src_end, multibytep)
1414 unsigned char *src, *src_end;
1415 int multibytep;
1416 {
1417 int mask = CODING_CATEGORY_MASK_ISO;
1418 int mask_found = 0;
/* REG[n] is the charset currently designated to graphic register Gn,
   or -1 if none.
   NOTE(review): SHIFT_OUT is initialized to 0 and never assigned in
   the statements of this function, so the `shift_out == 1' test in the
   ISO_CODE_SI case appears never to succeed (unless a macro modifies
   it) -- confirm.  */
1419 int reg[4], shift_out = 0, single_shifting = 0;
1420 int c, c1, charset;
1421 /* Dummy for ONE_MORE_BYTE. */
1422 struct coding_system dummy_coding;
1423 struct coding_system *coding = &dummy_coding;
/* Assigned by the CHARSET_OK macro.  */
1424 Lisp_Object safe_chars;
1425
1426 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1427 while (mask)
1428 {
1429 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
/* Re-entered from the 8-bit run scan at the bottom, which has already
   read the byte that ended the run.  */
1430 retry:
1431 switch (c)
1432 {
1433 case ISO_CODE_ESC:
1434 if (inhibit_iso_escape_detection)
1435 break;
1436 single_shifting = 0;
1437 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1438 if (c >= '(' && c <= '/')
1439 {
1440 /* Designation sequence for a charset of dimension 1. */
1441 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
/* The intermediate bytes '(' ... '/' select register (c - '(') % 4;
   ',' and above select the 96-character table.  */
1442 if (c1 < ' ' || c1 >= 0x80
1443 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1444 /* Invalid designation sequence. Just ignore. */
1445 break;
1446 reg[(c - '(') % 4] = charset;
1447 }
1448 else if (c == '$')
1449 {
1450 /* Designation sequence for a charset of dimension 2. */
1451 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1452 if (c >= '@' && c <= 'B')
1453 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1454 reg[0] = charset = iso_charset_table[1][0][c];
1455 else if (c >= '(' && c <= '/')
1456 {
1457 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1458 mask & mask_found);
1459 if (c1 < ' ' || c1 >= 0x80
1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461 /* Invalid designation sequence. Just ignore. */
1462 break;
1463 reg[(c - '(') % 4] = charset;
1464 }
1465 else
1466 /* Invalid designation sequence. Just ignore. */
1467 break;
1468 }
1469 else if (c == 'N' || c == 'O')
1470 {
1471 /* ESC <Fe> for SS2 or SS3. */
1472 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1473 break;
1474 }
1475 else if (c >= '0' && c <= '4')
1476 {
1477 /* ESC <Fp> for start/end composition. */
/* Each category either gains positive evidence or is ruled out,
   depending on whether its coding system allows composition.
   NOTE(review): COMPOSITION_OK dereferences coding_system_table[idx]
   without the NULL check that CHARSET_OK performs -- confirm the
   entries are always set here.  */
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1498 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1499 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1500 else
1501 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1502 break;
1503 }
1504 else
1505 /* Invalid escape sequence. Just ignore. */
1506 break;
1507
/* Only the two designation branches above reach this point; every
   other branch ends with `break'.  */
1508 /* We found a valid designation sequence for CHARSET. */
1509 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1510 c = MAKE_CHAR (charset, 0, 0);
1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1523 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1524 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1525 else
1526 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1527 break;
1528
1529 case ISO_CODE_SO:
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 single_shifting = 0;
1533 if (shift_out == 0
1534 && (reg[1] >= 0
1535 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1536 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1537 {
1538 /* Locking shift out. */
1539 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541 }
1542 break;
1543
1544 case ISO_CODE_SI:
1545 if (inhibit_iso_escape_detection)
1546 break;
1547 single_shifting = 0;
1548 if (shift_out == 1)
1549 {
1550 /* Locking shift in. */
1551 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1552 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1553 }
1554 break;
1555
1556 case ISO_CODE_CSI:
1557 single_shifting = 0;
/* Falls through: CSI shares the handling below, except for the parts
   guarded by `c != ISO_CODE_CSI'.  */
1558 case ISO_CODE_SS2:
1559 case ISO_CODE_SS3:
1560 {
1561 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1562
1563 if (inhibit_iso_escape_detection)
1564 break;
1565 if (c != ISO_CODE_CSI)
1566 {
/* SS2/SS3: the 8-bit categories stay possible only if their coding
   systems use single shift.  */
1567 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568 & CODING_FLAG_ISO_SINGLE_SHIFT)
1569 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571 & CODING_FLAG_ISO_SINGLE_SHIFT)
1572 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573 single_shifting = 1;
1574 }
/* The byte may also be an extra Latin code listed in
   Vlatin_extra_code_table.  */
1575 if (VECTORP (Vlatin_extra_code_table)
1576 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1577 {
1578 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1579 & CODING_FLAG_ISO_LATIN_EXTRA)
1580 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1581 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1582 & CODING_FLAG_ISO_LATIN_EXTRA)
1583 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1584 }
1585 mask &= newmask;
1586 mask_found |= newmask;
1587 }
1588 break;
1589
1590 default:
1591 if (c < 0x80)
1592 {
1593 single_shifting = 0;
1594 break;
1595 }
1596 else if (c < 0xA0)
1597 {
/* A byte in 0x80..0x9F not handled by a case above is acceptable only
   as an extra Latin code; otherwise the text is not ISO2022.  */
1598 single_shifting = 0;
1599 if (VECTORP (Vlatin_extra_code_table)
1600 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1601 {
1602 int newmask = 0;
1603
1604 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1605 & CODING_FLAG_ISO_LATIN_EXTRA)
1606 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1607 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1608 & CODING_FLAG_ISO_LATIN_EXTRA)
1609 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1610 mask &= newmask;
1611 mask_found |= newmask;
1612 }
1613 else
1614 return 0;
1615 }
1616 else
1617 {
1618 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1619 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1620 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1621 /* Check the length of succeeding codes of the range
1622 0xA0..0xFF. If the byte length is odd, we exclude
1623 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1624 when we are not single shifting. */
1625 if (!single_shifting
1626 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1627 {
/* I counts the bytes of the run, starting with the one already read.
   C stays -1 if the source ends before any further byte is read.  */
1628 int i = 1;
1629
1630 c = -1;
1631 while (src < src_end)
1632 {
1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1634 mask & mask_found);
1635 if (c < 0xA0)
1636 break;
1637 i++;
1638 }
1639
/* An odd-length run rules out ISO_8_2 only when SRC has not reached
   SRC_END; otherwise ISO_8_2 is counted as found.  */
1640 if (i & 1 && src < src_end)
1641 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1642 else
1643 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1644 if (c >= 0)
1645 /* This means that we have read one extra byte. */
1646 goto retry;
1647 }
1648 }
1649 break;
1650 }
1651 }
1652 return (mask & mask_found);
1653 }
1654
/* Build the character whose charset is CHARSET and whose 1st and 2nd
   position codes are C1 and C2, and return its character code.  When
   the variable `translation_table' (taken from the expansion site) is
   non-nil, the code is obtained through translate_char instead of
   MAKE_CHAR, i.e. the translated code is returned.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)				\
  (! NILP (translation_table)						\
   ? translate_char (translation_table, -1, charset, c1, c2)		\
   : MAKE_CHAR (charset, c1, c2))
1664
1665 /* Set designation state into CODING. */
/* REG is the graphic register being designated; DIMENSION, CHARS and
   FINAL_CHAR identify the charset through ISO_CHARSET_TABLE.

   The designation is accepted when the charset exists and either REG
   is the register requested for it in CODING or the charset's
   character (MAKE_CHAR (charset, 0, 0)) is in `safe_chars'.  On
   acceptance the charset -- or its reverse charset when
   CODING_MODE_DIRECTION is set and a reverse charset exists -- is
   stored as the designation of REG.  Otherwise REG is remembered in
   last_invalid_designation_register and control jumps to
   label_invalid_code.  A designation of ASCII to register 0 that
   directly follows an invalid designation of register 0 is also sent
   to label_invalid_code, as explained in the comment inside the
   macro.

   Uses `coding', `safe_chars' and the label `label_invalid_code' from
   the expansion site.  REG and FINAL_CHAR are expanded without
   parentheses, so callers should pass simple expressions.  */
1666 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1667 do { \
1668 int charset, c; \
1669 \
1670 if (final_char < '0' || final_char >= 128) \
1671 goto label_invalid_code; \
1672 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1673 make_number (chars), \
1674 make_number (final_char)); \
1675 c = MAKE_CHAR (charset, 0, 0); \
1676 if (charset >= 0 \
1677 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1678 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1679 { \
1680 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1681 && reg == 0 \
1682 && charset == CHARSET_ASCII) \
1683 { \
1684 /* We should insert this designation sequence as is so \
1685 that it is surely written back to a file. */ \
1686 coding->spec.iso2022.last_invalid_designation_register = -1; \
1687 goto label_invalid_code; \
1688 } \
1689 coding->spec.iso2022.last_invalid_designation_register = -1; \
1690 if ((coding->mode & CODING_MODE_DIRECTION) \
1691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1692 charset = CHARSET_REVERSE_CHARSET (charset); \
1693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1694 } \
1695 else \
1696 { \
1697 coding->spec.iso2022.last_invalid_designation_register = reg; \
1698 goto label_invalid_code; \
1699 } \
1700 } while (0)
1701
1702 /* Allocate a memory block for storing information about compositions.
1703 The block is chained to the already allocated blocks. */
1704
1705 void
1706 coding_allocate_composition_data (coding, char_offset)
1707 struct coding_system *coding;
1708 int char_offset;
1709 {
1710 struct composition_data *cmp_data
1711 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1712
1713 cmp_data->char_offset = char_offset;
1714 cmp_data->used = 0;
1715 cmp_data->prev = coding->cmp_data;
1716 cmp_data->next = NULL;
1717 if (coding->cmp_data)
1718 coding->cmp_data->next = cmp_data;
1719 coding->cmp_data = cmp_data;
1720 coding->cmp_data_start = 0;
1721 coding->composing = COMPOSITION_NO;
1722 }
1723
1724 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1725 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1726 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1727 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1728 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
1729 */
1730
/* C1 is the byte that followed ESC ('0', '2', '3' or '4').  Three
   cases are handled:
   - composition is disabled for CODING: ESC and C1 (masked with 0x7f)
     are stored literally and counted as two produced characters;
   - no composition is in progress: a new one is started with the
     method selected by C1, unless CODING->cmp_data lacks room, in
     which case CODING->result is set to
     CODING_FINISH_INSUFFICIENT_CMP and control jumps to
     label_end_of_loop;
   - a composition is already in progress: if its method is one of the
     two ALTCHARS methods it is switched to COMPOSITION_RELATIVE (this
     is the ESC 0 in the middle of the ESC 3 / ESC 4 forms above).

   Uses `coding', `dst' and the label `label_end_of_loop' from the
   expansion site.
   NOTE(review): the two literal stores through DST are not preceded
   by a destination-space check in this macro -- confirm the caller
   guarantees room.  */
1731 #define DECODE_COMPOSITION_START(c1) \
1732 do { \
1733 if (coding->composing == COMPOSITION_DISABLED) \
1734 { \
1735 *dst++ = ISO_CODE_ESC; \
1736 *dst++ = c1 & 0x7f; \
1737 coding->produced_char += 2; \
1738 } \
1739 else if (!COMPOSING_P (coding)) \
1740 { \
1741 /* This is surely the start of a composition. We must be sure \
1742 that coding->cmp_data has enough space to store the \
1743 information about the composition. If not, terminate the \
1744 current decoding loop, allocate one more memory block for \
1745 coding->cmp_data in the caller, then start the decoding \
1746 loop again. We can't allocate memory here directly because \
1747 it may cause buffer/string relocation. */ \
1748 if (!coding->cmp_data \
1749 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1750 >= COMPOSITION_DATA_SIZE)) \
1751 { \
1752 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1753 goto label_end_of_loop; \
1754 } \
1755 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1756 : c1 == '2' ? COMPOSITION_WITH_RULE \
1757 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1758 : COMPOSITION_WITH_RULE_ALTCHARS); \
1759 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1760 coding->composing); \
1761 coding->composition_rule_follows = 0; \
1762 } \
1763 else \
1764 { \
1765 /* We are already handling a composition. If the method is \
1766 the following two, the codes following the current escape \
1767 sequence are actual characters stored in a buffer. */ \
1768 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1769 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1770 { \
1771 coding->composing = COMPOSITION_RELATIVE; \
1772 coding->composition_rule_follows = 0; \
1773 } \
1774 } \
1775 } while (0)
1776
/* Handle the composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  If a composition is in progress it is closed at the
   current character position and the composing state is reset;
   otherwise the two bytes ESC and C1 are stored literally and counted
   as two produced characters.  Uses `coding' and `dst' from the
   expansion site.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (COMPOSING_P (coding))						\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = c1;							\
	coding->produced_char += 2;					\
      }									\
  } while (0)
1793
1794 /* Decode a composition rule from the byte C1 (and maybe one more byte
1795 from SRC) and store one encoded composition rule in
1796 coding->cmp_data. */
1797
/* C1 is modified in place (32 is subtracted), so it must be an
   lvalue.  After the subtraction:
   - values below 81 are the old one-byte format: the global and new
     reference points are C1 / 9 and C1 % 9, with the value 4 replaced
     by 10;
   - values from 81 to 92 are the new two-byte format: one more byte
     is read with ONE_MORE_BYTE into the variable `c2', which must be
     declared at the expansion site, and the rule is built from
     C1 - 81 and C2 - 32;
   - any other value leaves the rule at 0, which is still recorded.
   In every case composition_rule_follows is cleared afterwards.  */
1798 #define DECODE_COMPOSITION_RULE(c1) \
1799 do { \
1800 int rule = 0; \
1801 (c1) -= 32; \
1802 if (c1 < 81) /* old format (before ver.21) */ \
1803 { \
1804 int gref = (c1) / 9; \
1805 int nref = (c1) % 9; \
1806 if (gref == 4) gref = 10; \
1807 if (nref == 4) nref = 10; \
1808 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1809 } \
1810 else if (c1 < 93) /* new format (after ver.21) */ \
1811 { \
1812 ONE_MORE_BYTE (c2); \
1813 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1814 } \
1815 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1816 coding->composition_rule_follows = 0; \
1817 } while (0)
1818
1819
1820 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1821
1822 static void
1823 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1824 struct coding_system *coding;
1825 const unsigned char *source;
1826 unsigned char *destination;
1827 int src_bytes, dst_bytes;
1828 {
1829 const unsigned char *src = source;
1830 const unsigned char *src_end = source + src_bytes;
1831 unsigned char *dst = destination;
1832 unsigned char *dst_end = destination + dst_bytes;
1833 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1834 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1835 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1836 /* SRC_BASE remembers the start position in source in each loop.
1837 The loop will be exited when there's not enough source code
1838 (within macro ONE_MORE_BYTE), or when there's not enough
1839 destination area to produce a character (within macro
1840 EMIT_CHAR). */
1841 const unsigned char *src_base;
1842 int c, charset;
1843 Lisp_Object translation_table;
1844 Lisp_Object safe_chars;
1845
1846 safe_chars = coding_safe_chars (coding->symbol);
1847
1848 if (NILP (Venable_character_translation))
1849 translation_table = Qnil;
1850 else
1851 {
1852 translation_table = coding->translation_table_for_decode;
1853 if (NILP (translation_table))
1854 translation_table = Vstandard_translation_table_for_decode;
1855 }
1856
1857 coding->result = CODING_FINISH_NORMAL;
1858
1859 while (1)
1860 {
1861 int c1, c2 = 0;
1862
1863 src_base = src;
1864 ONE_MORE_BYTE (c1);
1865
1866 /* We produce no character or one character. */
1867 switch (iso_code_class [c1])
1868 {
1869 case ISO_0x20_or_0x7F:
1870 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1871 {
1872 DECODE_COMPOSITION_RULE (c1);
1873 continue;
1874 }
1875 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1876 {
1877 /* This is SPACE or DEL. */
1878 charset = CHARSET_ASCII;
1879 break;
1880 }
1881 /* This is a graphic character, we fall down ... */
1882
1883 case ISO_graphic_plane_0:
1884 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885 {
1886 DECODE_COMPOSITION_RULE (c1);
1887 continue;
1888 }
1889 charset = charset0;
1890 break;
1891
1892 case ISO_0xA0_or_0xFF:
1893 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1894 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1895 goto label_invalid_code;
1896 /* This is a graphic character, we fall down ... */
1897
1898 case ISO_graphic_plane_1:
1899 if (charset1 < 0)
1900 goto label_invalid_code;
1901 charset = charset1;
1902 break;
1903
1904 case ISO_control_0:
1905 if (COMPOSING_P (coding))
1906 DECODE_COMPOSITION_END ('1');
1907
1908 /* All ISO2022 control characters in this class have the
1909 same representation in Emacs internal format. */
1910 if (c1 == '\n'
1911 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1912 && (coding->eol_type == CODING_EOL_CR
1913 || coding->eol_type == CODING_EOL_CRLF))
1914 {
1915 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1916 goto label_end_of_loop;
1917 }
1918 charset = CHARSET_ASCII;
1919 break;
1920
1921 case ISO_control_1:
1922 if (COMPOSING_P (coding))
1923 DECODE_COMPOSITION_END ('1');
1924 goto label_invalid_code;
1925
1926 case ISO_carriage_return:
1927 if (COMPOSING_P (coding))
1928 DECODE_COMPOSITION_END ('1');
1929
1930 if (coding->eol_type == CODING_EOL_CR)
1931 c1 = '\n';
1932 else if (coding->eol_type == CODING_EOL_CRLF)
1933 {
1934 ONE_MORE_BYTE (c1);
1935 if (c1 != ISO_CODE_LF)
1936 {
1937 src--;
1938 c1 = '\r';
1939 }
1940 }
1941 charset = CHARSET_ASCII;
1942 break;
1943
1944 case ISO_shift_out:
1945 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1946 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1947 goto label_invalid_code;
1948 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1949 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950 continue;
1951
1952 case ISO_shift_in:
1953 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1954 goto label_invalid_code;
1955 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1956 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1957 continue;
1958
1959 case ISO_single_shift_2_7:
1960 case ISO_single_shift_2:
1961 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962 goto label_invalid_code;
1963 /* SS2 is handled as an escape sequence of ESC 'N' */
1964 c1 = 'N';
1965 goto label_escape_sequence;
1966
1967 case ISO_single_shift_3:
1968 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1969 goto label_invalid_code;
1970 /* SS3 is handled as an escape sequence of ESC 'O' */
1971 c1 = 'O';
1972 goto label_escape_sequence;
1973
1974 case ISO_control_sequence_introducer:
1975 /* CSI is handled as an escape sequence of ESC '[' ... */
1976 c1 = '[';
1977 goto label_escape_sequence;
1978
1979 case ISO_escape:
1980 ONE_MORE_BYTE (c1);
1981 label_escape_sequence:
1982 /* Escape sequences handled by Emacs are invocation,
1983 designation, direction specification, and character
1984 composition specification. */
1985 switch (c1)
1986 {
1987 case '&': /* revision of following character set */
1988 ONE_MORE_BYTE (c1);
1989 if (!(c1 >= '@' && c1 <= '~'))
1990 goto label_invalid_code;
1991 ONE_MORE_BYTE (c1);
1992 if (c1 != ISO_CODE_ESC)
1993 goto label_invalid_code;
1994 ONE_MORE_BYTE (c1);
1995 goto label_escape_sequence;
1996
1997 case '$': /* designation of 2-byte character set */
1998 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999 goto label_invalid_code;
2000 ONE_MORE_BYTE (c1);
2001 if (c1 >= '@' && c1 <= 'B')
2002 { /* designation of JISX0208.1978, GB2312.1980,
2003 or JISX0208.1980 */
2004 DECODE_DESIGNATION (0, 2, 94, c1);
2005 }
2006 else if (c1 >= 0x28 && c1 <= 0x2B)
2007 { /* designation of DIMENSION2_CHARS94 character set */
2008 ONE_MORE_BYTE (c2);
2009 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2010 }
2011 else if (c1 >= 0x2C && c1 <= 0x2F)
2012 { /* designation of DIMENSION2_CHARS96 character set */
2013 ONE_MORE_BYTE (c2);
2014 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2015 }
2016 else
2017 goto label_invalid_code;
2018 /* We must update these variables now. */
2019 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2021 continue;
2022
2023 case 'n': /* invocation of locking-shift-2 */
2024 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2025 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2026 goto label_invalid_code;
2027 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2028 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2029 continue;
2030
2031 case 'o': /* invocation of locking-shift-3 */
2032 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2033 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2034 goto label_invalid_code;
2035 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2036 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2037 continue;
2038
2039 case 'N': /* invocation of single-shift-2 */
2040 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2041 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2042 goto label_invalid_code;
2043 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2044 ONE_MORE_BYTE (c1);
2045 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2046 goto label_invalid_code;
2047 break;
2048
2049 case 'O': /* invocation of single-shift-3 */
2050 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2051 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052 goto label_invalid_code;
2053 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2054 ONE_MORE_BYTE (c1);
2055 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2056 goto label_invalid_code;
2057 break;
2058
2059 case '0': case '2': case '3': case '4': /* start composition */
2060 DECODE_COMPOSITION_START (c1);
2061 continue;
2062
2063 case '1': /* end composition */
2064 DECODE_COMPOSITION_END (c1);
2065 continue;
2066
2067 case '[': /* specification of direction */
2068 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2069 goto label_invalid_code;
2070 /* For the moment, nested direction is not supported.
2071 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2072 left-to-right, and nonzero means right-to-left. */
2073 ONE_MORE_BYTE (c1);
2074 switch (c1)
2075 {
2076 case ']': /* end of the current direction */
2077 coding->mode &= ~CODING_MODE_DIRECTION;
2078
2079 case '0': /* end of the current direction */
2080 case '1': /* start of left-to-right direction */
2081 ONE_MORE_BYTE (c1);
2082 if (c1 == ']')
2083 coding->mode &= ~CODING_MODE_DIRECTION;
2084 else
2085 goto label_invalid_code;
2086 break;
2087
2088 case '2': /* start of right-to-left direction */
2089 ONE_MORE_BYTE (c1);
2090 if (c1 == ']')
2091 coding->mode |= CODING_MODE_DIRECTION;
2092 else
2093 goto label_invalid_code;
2094 break;
2095
2096 default:
2097 goto label_invalid_code;
2098 }
2099 continue;
2100
2101 case '%':
2102 if (COMPOSING_P (coding))
2103 DECODE_COMPOSITION_END ('1');
2104 ONE_MORE_BYTE (c1);
2105 if (c1 == '/')
2106 {
2107 /* CTEXT extended segment:
2108 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2109 We keep these bytes as is for the moment.
2110 They may be decoded by post-read-conversion. */
2111 int dim, M, L;
2112 int size, required;
2113 int produced_chars;
2114
2115 ONE_MORE_BYTE (dim);
2116 ONE_MORE_BYTE (M);
2117 ONE_MORE_BYTE (L);
2118 size = ((M - 128) * 128) + (L - 128);
2119 required = 8 + size * 2;
2120 if (dst + required > (dst_bytes ? dst_end : src))
2121 goto label_end_of_loop;
2122 *dst++ = ISO_CODE_ESC;
2123 *dst++ = '%';
2124 *dst++ = '/';
2125 *dst++ = dim;
2126 produced_chars = 4;
2127 dst += CHAR_STRING (M, dst), produced_chars++;
2128 dst += CHAR_STRING (L, dst), produced_chars++;
2129 while (size-- > 0)
2130 {
2131 ONE_MORE_BYTE (c1);
2132 dst += CHAR_STRING (c1, dst), produced_chars++;
2133 }
2134 coding->produced_char += produced_chars;
2135 }
2136 else if (c1 == 'G')
2137 {
2138 unsigned char *d = dst;
2139 int produced_chars;
2140
2141 /* XFree86 extension for embedding UTF-8 in CTEXT:
2142 ESC % G --UTF-8-BYTES-- ESC % @
2143 We keep these bytes as is for the moment.
2144 They may be decoded by post-read-conversion. */
2145 if (d + 6 > (dst_bytes ? dst_end : src))
2146 goto label_end_of_loop;
2147 *d++ = ISO_CODE_ESC;
2148 *d++ = '%';
2149 *d++ = 'G';
2150 produced_chars = 3;
2151 while (d + 1 < (dst_bytes ? dst_end : src))
2152 {
2153 ONE_MORE_BYTE (c1);
2154 if (c1 == ISO_CODE_ESC
2155 && src + 1 < src_end
2156 && src[0] == '%'
2157 && src[1] == '@')
2158 {
2159 src += 2;
2160 break;
2161 }
2162 d += CHAR_STRING (c1, d), produced_chars++;
2163 }
2164 if (d + 3 > (dst_bytes ? dst_end : src))
2165 goto label_end_of_loop;
2166 *d++ = ISO_CODE_ESC;
2167 *d++ = '%';
2168 *d++ = '@';
2169 dst = d;
2170 coding->produced_char += produced_chars + 3;
2171 }
2172 else
2173 goto label_invalid_code;
2174 continue;
2175
2176 default:
2177 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2178 goto label_invalid_code;
2179 if (c1 >= 0x28 && c1 <= 0x2B)
2180 { /* designation of DIMENSION1_CHARS94 character set */
2181 ONE_MORE_BYTE (c2);
2182 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2183 }
2184 else if (c1 >= 0x2C && c1 <= 0x2F)
2185 { /* designation of DIMENSION1_CHARS96 character set */
2186 ONE_MORE_BYTE (c2);
2187 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2188 }
2189 else
2190 goto label_invalid_code;
2191 /* We must update these variables now. */
2192 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2193 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2194 continue;
2195 }
2196 }
2197
2198 /* Now we know CHARSET and 1st position code C1 of a character.
2199 Produce a multibyte sequence for that character while getting
2200 2nd position code C2 if necessary. */
2201 if (CHARSET_DIMENSION (charset) == 2)
2202 {
2203 ONE_MORE_BYTE (c2);
2204 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2205 /* C2 is not in a valid range. */
2206 goto label_invalid_code;
2207 }
2208 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2209 EMIT_CHAR (c);
2210 continue;
2211
2212 label_invalid_code:
2213 coding->errors++;
2214 if (COMPOSING_P (coding))
2215 DECODE_COMPOSITION_END ('1');
2216 src = src_base;
2217 c = *src++;
2218 if (! NILP (translation_table))
2219 c = translate_char (translation_table, c, 0, 0, 0);
2220 EMIT_CHAR (c);
2221 }
2222
2223 label_end_of_loop:
2224 coding->consumed = coding->consumed_char = src_base - source;
2225 coding->produced = dst - destination;
2226 return;
2227 }
2228
2229
2230 /* ISO2022 encoding stuff. */
2231
2232 /*
2233 It is not enough to say just "ISO2022" on encoding, we have to
2234 specify more details. In Emacs, each ISO2022 coding system
2235 variant has the following specifications:
2236 1. Initial designation to G0 through G3.
2237 2. Allows short-form designation?
2238 3. ASCII should be designated to G0 before control characters?
2239 4. ASCII should be designated to G0 at end of line?
2240 5. 7-bit environment or 8-bit environment?
2241 6. Use locking-shift?
2242 7. Use Single-shift?
2243 And the following two are only for Japanese:
2244 8. Use ASCII in place of JIS0201-1976-Roman?
2245 9. Use JISX0208-1983 in place of JISX0208-1978?
2246 These specifications are encoded in `coding->flags' as flag bits
2247 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2248 details.
2249 */
2250
2251 /* Produce codes (escape sequence) for designating CHARSET to graphic
2252 register REG at DST, and increment DST. If <final-char> of CHARSET is
2253 '@', 'A', or 'B' and the coding system CODING allows, produce
2254 designation sequence of short-form. */
2255
#define ENCODE_DESIGNATION(charset, reg, coding)                            \
  do {                                                                      \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);            \
    /* Intermediate bytes indexed by graphic register number, for           \
       94-character and 96-character sets respectively.  */                 \
    const char *g94_bytes = "()*+";                                         \
    const char *g96_bytes = ",-./";                                         \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);            \
                                                                            \
    if (rev < 255)                                                          \
      {                                                                     \
        /* ESC & <revision> precedes the designation itself.  */            \
        *dst++ = ISO_CODE_ESC;                                              \
        *dst++ = '&';                                                       \
        *dst++ = '@' + rev;                                                 \
      }                                                                     \
    *dst++ = ISO_CODE_ESC;                                                  \
    if (CHARSET_DIMENSION (charset) != 1)                                   \
      {                                                                     \
        *dst++ = '$';                                                       \
        if (CHARSET_CHARS (charset) != 94)                                  \
          *dst++ = (unsigned char) (g96_bytes[reg]);                        \
        /* Short form drops the intermediate byte; it is allowed only       \
           for register 0 and final characters '@'..'B'.  */                \
        else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)             \
                 || reg != 0                                                \
                 || final_byte < '@' || final_byte > 'B')                   \
          *dst++ = (unsigned char) (g94_bytes[reg]);                        \
      }                                                                     \
    else if (CHARSET_CHARS (charset) == 94)                                 \
      *dst++ = (unsigned char) (g94_bytes[reg]);                            \
    else                                                                    \
      *dst++ = (unsigned char) (g96_bytes[reg]);                            \
    *dst++ = final_byte;                                                    \
    /* Remember what REG now holds.  */                                     \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                    \
  } while (0)
2293
2294 /* The following two macros produce codes (control character or escape
2295 sequence) for ISO2022 single-shift functions (single-shift-2 and
2296 single-shift-3). */
2297
/* Emit SS2: ESC 'N' in a 7-bit environment, the single SS2 control
   byte otherwise.  Then mark the coding as single-shifting so the
   next character is encoded accordingly.  */
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

/* Emit SS3: ESC 'O' in a 7-bit environment, the single SS3 control
   byte otherwise.  Then mark the coding as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2315
2316 /* The following four macros produce codes (control character or
2317 escape sequence) for ISO2022 locking-shift functions (shift-in,
2318 shift-out, locking-shift-2, and locking-shift-3). */
2319
/* SI: invoke graphic register 0 to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

/* SO: invoke graphic register 1 to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

/* ESC 'n': invoke graphic register 2 to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

/* ESC 'o': invoke graphic register 3 to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2343
2344 /* Produce codes for a DIMENSION1 character whose character set is
2345 CHARSET and whose position-code is C1. Designation and invocation
2346 sequences are also produced in advance if necessary. */
2347
/* Emit the one-byte code C1 of a character in CHARSET at DST.

   Uses the variables `coding' and `dst' of the expansion site.
   If a single-shift is pending, the byte is written with the 8th bit
   cleared (7-bit environment) or set, and the pending state is reset.
   Otherwise the byte is written for graphic plane 0 (8th bit clear) or
   plane 1 (8th bit set), whichever currently holds CHARSET.  If neither
   does, encode_invocation_designation is called and the test repeats.

   CHARSET and C1 are parenthesized in the expansion so that expression
   arguments such as `x | y' are handled correctly; both may be
   evaluated more than once.  */
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2376
2377 /* Produce codes for a DIMENSION2 character whose character set is
2378 CHARSET and whose position-codes are C1 and C2. Designation and
2379 invocation codes are also produced in advance if necessary. */
2380
/* Emit the two-byte code C1 C2 of a character in CHARSET at DST.

   Uses the variables `coding' and `dst' of the expansion site.
   If a single-shift is pending, both bytes are written with the 8th
   bit cleared (7-bit environment) or set, and the pending state is
   reset.  Otherwise both bytes are written for graphic plane 0 (8th
   bit clear) or plane 1 (8th bit set), whichever currently holds
   CHARSET.  If neither does, encode_invocation_designation is called
   and the test repeats.

   CHARSET, C1 and C2 are parenthesized in the expansion so that
   expression arguments are handled correctly; they may be evaluated
   more than once.  */
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2409
/* Encode character C at DST.  C is split into a charset and one or two
   position codes.  A character of an undefined charset has its
   position code(s) copied as is; otherwise the one- or two-byte
   encoder is used, after substituting latin-jisx0201 for ASCII
   (CODING_FLAG_ISO_USE_ROMAN) or jisx0208-1978 for jisx0208
   (CODING_FLAG_ISO_USE_OLDJIS) when those flags are set.  */
#define ENCODE_ISO_CHARACTER(c)                                            \
  do {                                                                     \
    int enc_charset, enc_c1, enc_c2;                                       \
                                                                           \
    SPLIT_CHAR (c, enc_charset, enc_c1, enc_c2);                           \
    if (! CHARSET_DEFINED_P (enc_charset))                                 \
      {                                                                    \
        *dst++ = enc_c1;                                                   \
        if (enc_c2 >= 0)                                                   \
          *dst++ = enc_c2;                                                 \
      }                                                                    \
    else if (CHARSET_DIMENSION (enc_charset) == 1)                         \
      {                                                                    \
        if ((enc_charset == CHARSET_ASCII)                                 \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))                \
          enc_charset = charset_latin_jisx0201;                            \
        ENCODE_ISO_CHARACTER_DIMENSION1 (enc_charset, enc_c1);             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        if ((enc_charset == charset_jisx0208)                              \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))               \
          enc_charset = charset_jisx0208_1978;                             \
        ENCODE_ISO_CHARACTER_DIMENSION2 (enc_charset, enc_c1, enc_c2);     \
      }                                                                    \
  } while (0)
2439
2440
2441 /* Instead of encoding character C, produce one or two `?'s. */
2442
/* Emit CODING_REPLACEMENT_CHARACTER in place of C: once, and a second
   time when the charset of C has a width greater than 1.  */
#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);                \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)                          \
      break;                                                            \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);                \
  } while (0)
2449
2450
2451 /* Produce designation and invocation codes at a place pointed by DST
2452 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2453 Return new DST. */
2454
2455 unsigned char *
2456 encode_invocation_designation (charset, coding, dst)
2457 int charset;
2458 struct coding_system *coding;
2459 unsigned char *dst;
2460 {
2461 int reg; /* graphic register number */
2462
2463 /* At first, check designations. */
2464 for (reg = 0; reg < 4; reg++)
2465 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2466 break;
2467
2468 if (reg >= 4)
2469 {
2470 /* CHARSET is not yet designated to any graphic registers. */
2471 /* At first check the requested designation. */
2472 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2473 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2474 /* Since CHARSET requests no special designation, designate it
2475 to graphic register 0. */
2476 reg = 0;
2477
2478 ENCODE_DESIGNATION (charset, reg, coding);
2479 }
2480
2481 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2482 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2483 {
2484 /* Since the graphic register REG is not invoked to any graphic
2485 planes, invoke it to graphic plane 0. */
2486 switch (reg)
2487 {
2488 case 0: /* graphic register 0 */
2489 ENCODE_SHIFT_IN;
2490 break;
2491
2492 case 1: /* graphic register 1 */
2493 ENCODE_SHIFT_OUT;
2494 break;
2495
2496 case 2: /* graphic register 2 */
2497 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498 ENCODE_SINGLE_SHIFT_2;
2499 else
2500 ENCODE_LOCKING_SHIFT_2;
2501 break;
2502
2503 case 3: /* graphic register 3 */
2504 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2505 ENCODE_SINGLE_SHIFT_3;
2506 else
2507 ENCODE_LOCKING_SHIFT_3;
2508 break;
2509 }
2510 }
2511
2512 return dst;
2513 }
2514
2515 /* Produce 2-byte codes for encoded composition rule RULE. */
2516
/* Split RULE into its two reference points with
   COMPOSITION_DECODE_RULE and emit them as two bytes: the first
   offset by 32 + 81, the second by 32.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2524
2525 /* Produce codes for indicating the start of a composition sequence
2526 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
2527 which specify information about the composition. See the comment
2528 in coding.h for the format of DATA. */
2529
/* Record the composition method DATA[3] in CODING and emit the
   matching start sequence: ESC '0' for a relative composition,
   ESC '3' for COMPOSITION_WITH_ALTCHARS, ESC '4' otherwise.  In the
   latter two cases the component index is moved to the 5th element of
   the current composition data and the rule-follows flag is cleared.  */
#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2544
2545 /* Produce codes for indicating the end of the current composition. */
2546
/* Emit ESC '1', mark CODING as no longer composing, and step past the
   current composition record (DATA[0] elements).  If that exhausts
   the current composition-data block and another block is chained
   after it, switch to the start of that block.  */
#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
2560
2561 /* Produce composition start sequence ESC 0. Here, this sequence
2562 doesn't mean the start of a new composition but means that we have
2563 just produced components (alternate chars and composition rules) of
2564 the composition and the actual text follows in SRC. */
2565
/* Emit ESC '0' and switch CODING to COMPOSITION_RELATIVE.  */
#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2572
2573 /* The following three macros produce codes for indicating direction
2574 of text. */
/* Emit a control sequence introducer at DST: ESC '[' when the coding
   system is restricted to seven bits, the single CSI byte otherwise.
   Uses the variables `coding' and `dst' of the expansion site.

   `coding->flags' is a bit mask, so the seven-bit flag is tested with
   `&' as everywhere else in this file; comparing with `==' only
   matched when no other flag was set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  do {                                                          \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = '[';                                           \
      }                                                         \
    else                                                        \
      *dst++ = ISO_CODE_CSI;                                    \
  } while (0)

/* Emit CSI '2' ']', the start of right-to-left text.  Written as a
   statement macro: the introducer above is a `do ... while (0)'
   statement and cannot take part in a comma expression, nor does it
   take an argument.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Emit CSI '0' ']', the end of the current direction (back to
   left-to-right).  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2588
2589 /* Produce codes for designation and invocation to reset the graphic
2590 planes and registers to initial state. */
/* Bring the ISO 2022 state back to its initial one: shift in if
   graphic plane 0 does not currently hold register 0, then
   re-designate every register whose initial designation is set
   (non-negative) and differs from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int gr = 0;                                                             \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (gr < 4)                                                          \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr) >= 0           \
            && (CODING_SPEC_ISO_DESIGNATION (coding, gr)                    \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr)))       \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr), gr, coding); \
        gr++;                                                               \
      }                                                                     \
  } while (0)
2603
2604 /* Produce designation sequences of charsets in the line started from
2605 SRC to a place pointed by DST, and return updated DST.
2606
2607 If the current block ends before any end-of-line, we may fail to
2608 find all the necessary designations. */
2609
2610 static unsigned char *
2611 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2612 struct coding_system *coding;
2613 Lisp_Object translation_table;
2614 const unsigned char *src, *src_end;
2615 unsigned char *dst;
2616 {
2617 int charset, c, found = 0, reg;
2618 /* Table of charsets to be designated to each graphic register. */
2619 int r[4];
2620
2621 for (reg = 0; reg < 4; reg++)
2622 r[reg] = -1;
2623
2624 while (found < 4)
2625 {
2626 ONE_MORE_CHAR (c);
2627 if (c == '\n')
2628 break;
2629
2630 charset = CHAR_CHARSET (c);
2631 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2632 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2633 {
2634 found++;
2635 r[reg] = charset;
2636 }
2637 }
2638
2639 label_end_of_loop:
2640 if (found)
2641 {
2642 for (reg = 0; reg < 4; reg++)
2643 if (r[reg] >= 0
2644 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2645 ENCODE_DESIGNATION (r[reg], reg, coding);
2646 }
2647
2648 return dst;
2649 }
2650
2651 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2652
/* Encode the characters found at SOURCE (SRC_BYTES bytes) as ISO 2022
   and store the result at DESTINATION.

   Characters are fetched one at a time with ONE_MORE_CHAR; each loop
   iteration emits either one piece of composition information or the
   encoding of one character.  On return:
     coding->consumed       source bytes processed (src_base - source),
     coding->consumed_char  characters counted by the main loop,
     coding->produced and coding->produced_char
                            bytes written (dst - destination).
   coding->result is set to CODING_FINISH_INSUFFICIENT_DST when the
   loop stops for lack of room in the destination.

   NOTE(review): when DST_BYTES is zero the room check is made against
   SRC instead of DST_END; presumably that is the case where source and
   destination share one buffer -- confirm against the callers.  */
2653 static void
2654 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2655 struct coding_system *coding;
2656 const unsigned char *source;
2657 unsigned char *destination;
2658 int src_bytes, dst_bytes;
2659 {
2660 const unsigned char *src = source;
2661 const unsigned char *src_end = source + src_bytes;
2662 unsigned char *dst = destination;
2663 unsigned char *dst_end = destination + dst_bytes;
2664 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2665 from DST_END to assure overflow checking is necessary only at the
2666 head of loop. */
2667 unsigned char *adjusted_dst_end = dst_end - 19;
2668 /* SRC_BASE remembers the start position in source in each loop.
2669 The loop will be exited when there's not enough source text to
2670 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2671 there's not enough destination area to produce encoded codes
2672 (within macro EMIT_BYTES). */
2673 const unsigned char *src_base;
2674 int c;
2675 Lisp_Object translation_table;
2676 Lisp_Object safe_chars;
2677
/* A coding system flagged CODING_FLAG_ISO_SAFE always runs in the mode
   where characters not in SAFE_CHARS are replaced (see the use of
   ENCODE_UNSAFE_CHARACTER below) instead of being encoded.  */
2678 if (coding->flags & CODING_FLAG_ISO_SAFE)
2679 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2680
2681 safe_chars = coding_safe_chars (coding->symbol);
2682
/* Pick the translation table: none when character translation is
   disabled, else the coding system's own table, else the standard one.
   The variable is not used by name below except as an argument to
   encode_designation_at_bol.
   NOTE(review): ONE_MORE_CHAR presumably reads it as well -- confirm.  */
2683 if (NILP (Venable_character_translation))
2684 translation_table = Qnil;
2685 else
2686 {
2687 translation_table = coding->translation_table_for_encode;
2688 if (NILP (translation_table))
2689 translation_table = Vstandard_translation_table_for_encode;
2690 }
2691
2692 coding->consumed_char = 0;
2693 coding->errors = 0;
2694 while (1)
2695 {
2696 src_base = src;
2697
/* Stop while at least 19 bytes of room remain, so the code below can
   write without further bounds checks.  */
2698 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2699 {
2700 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2701 break;
2702 }
2703
2704 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2705 && CODING_SPEC_ISO_BOL (coding))
2706 {
2707 /* We have to produce designation sequences if any now. */
2708 dst = encode_designation_at_bol (coding, translation_table,
2709 src, src_end, dst);
2710 CODING_SPEC_ISO_BOL (coding) = 0;
2711 }
2712
2713 /* Check composition start and end. */
2714 if (coding->composing != COMPOSITION_DISABLED
2715 && coding->cmp_data_start < coding->cmp_data->used)
2716 {
2717 struct composition_data *cmp_data = coding->cmp_data;
2718 int *data = cmp_data->data + coding->cmp_data_start;
/* Position of the next character to encode, in the same units as the
   start (data[1]) and end (data[2]) positions it is compared with.  */
2719 int this_pos = cmp_data->char_offset + coding->consumed_char;
2720
2721 if (coding->composing == COMPOSITION_RELATIVE)
2722 {
2723 if (this_pos == data[2])
2724 {
2725 ENCODE_COMPOSITION_END (coding, data);
/* ENCODE_COMPOSITION_END may have moved to the next record or the next
   composition-data block, so reload the local pointers.  */
2726 cmp_data = coding->cmp_data;
2727 data = cmp_data->data + coding->cmp_data_start;
2728 }
2729 }
2730 else if (COMPOSING_P (coding))
2731 {
2732 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2733 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2734 /* We have consumed components of the composition.
2735 What follows in SRC is the composition's base
2736 text. */
2737 ENCODE_COMPOSITION_FAKE_START (coding);
2738 else
2739 {
/* Emit the next component from the composition data rather than from
   SRC.  This `c' shadows the function-level `c'.  */
2740 int c = cmp_data->data[coding->cmp_data_index++];
2741 if (coding->composition_rule_follows)
2742 {
2743 ENCODE_COMPOSITION_RULE (c);
2744 coding->composition_rule_follows = 0;
2745 }
2746 else
2747 {
2748 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750 ENCODE_UNSAFE_CHARACTER (c);
2751 else
2752 ENCODE_ISO_CHARACTER (c);
2753 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2754 coding->composition_rule_follows = 1;
2755 }
/* No source character was read in this iteration.  */
2756 continue;
2757 }
2758 }
2759 if (!COMPOSING_P (coding))
2760 {
2761 if (this_pos == data[1])
2762 {
2763 ENCODE_COMPOSITION_START (coding, data);
2764 continue;
2765 }
2766 }
2767 }
2768
2769 ONE_MORE_CHAR (c);
2770
2771 /* Now encode the character C. */
2772 if (c < 0x20 || c == 0x7F)
2773 {
2774 if (c == '\r')
2775 {
2776 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2777 {
2778 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2779 ENCODE_RESET_PLANE_AND_REGISTER;
2780 *dst++ = c;
/* NOTE(review): this `continue' skips the coding->consumed_char++ at
   the bottom of the loop although a character was read from SRC;
   confirm that this is intended.  */
2781 continue;
2782 }
2783 /* fall down to treat '\r' as '\n' ... */
2784 c = '\n';
2785 }
2786 if (c == '\n')
2787 {
2788 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2789 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL, overwrite the current designations
   with the initial ones without emitting any escape sequence.  */
2790 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2791 bcopy (coding->spec.iso2022.initial_designation,
2792 coding->spec.iso2022.current_designation,
2793 sizeof coding->spec.iso2022.initial_designation);
/* Emit the end-of-line sequence selected by coding->eol_type: LF
   (also when undecided), CR LF, or CR.  */
2794 if (coding->eol_type == CODING_EOL_LF
2795 || coding->eol_type == CODING_EOL_UNDECIDED)
2796 *dst++ = ISO_CODE_LF;
2797 else if (coding->eol_type == CODING_EOL_CRLF)
2798 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2799 else
2800 *dst++ = ISO_CODE_CR;
2801 CODING_SPEC_ISO_BOL (coding) = 1;
2802 }
2803 else
2804 {
2805 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2806 ENCODE_RESET_PLANE_AND_REGISTER;
2807 *dst++ = c;
2808 }
2809 }
2810 else if (ASCII_BYTE_P (c))
2811 ENCODE_ISO_CHARACTER (c);
/* A non-ASCII single-byte character is copied unchanged and counted as
   an error.  */
2812 else if (SINGLE_BYTE_CHAR_P (c))
2813 {
2814 *dst++ = c;
2815 coding->errors++;
2816 }
2817 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2818 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2819 ENCODE_UNSAFE_CHARACTER (c);
2820 else
2821 ENCODE_ISO_CHARACTER (c);
2822
2823 coding->consumed_char++;
2824 }
2825
/* Reached by falling out of the loop; the label is not referenced by
   name in this function.
   NOTE(review): presumably ONE_MORE_CHAR jumps here when the source is
   exhausted -- confirm.  */
2826 label_end_of_loop:
2827 coding->consumed = src_base - source;
2828 coding->produced = coding->produced_char = dst - destination;
2829 }
2830
2831 \f
2832 /*** 4. SJIS and BIG5 handlers ***/
2833
2834 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2835 quite widely. So, for the moment, Emacs supports them in the bare
2836 C code. But, in the future, they may be supported only by CCL. */
2837
2838 /* SJIS is a coding system encoding three character sets: ASCII, right
2839 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2840 as is. A character of charset katakana-jisx0201 is encoded by
2841 "position-code + 0x80". A character of charset japanese-jisx0208
2842 is encoded in two bytes, with its two position-codes divided and shifted
2843 so that they fit in the ranges below.
2844
2845 --- CODE RANGE of SJIS ---
2846 (character set) (range)
2847 ASCII 0x00 .. 0x7F
2848 KATAKANA-JISX0201 0xA1 .. 0xDF
2849 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2850 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2851 -------------------------------
2852
2853 */
2854
2855 /* BIG5 is a coding system encoding two character sets: ASCII and
2856 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2857 character set and is encoded in two bytes.
2858
2859 --- CODE RANGE of BIG5 ---
2860 (character set) (range)
2861 ASCII 0x00 .. 0x7F
2862 Big5 (1st byte) 0xA1 .. 0xFE
2863 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2864 --------------------------
2865
2866 Since the number of characters in Big5 is larger than maximum
2867 characters in Emacs' charset (96x96), it can't be handled as one
2868 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2869 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2870 contains frequently used characters and the latter contains less
2871 frequently used characters. */
2872
2873 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2874 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2875 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2876 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2877
2878 /* Number of Big5 characters which have the same code in 1st byte. */
/* Number of second-byte values per first byte of Big5: 94 in the
   upper range plus 63 in the lower range.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 byte pair B1 B2 into CHARSET (charset_big5_1 when
   B1 is below 0xC9, charset_big5_2 otherwise) and the position codes
   C1 and C2, each starting at 0x21 with 94 values per row.

   B1 and B2 are parenthesized in the expansion so that expression
   arguments are handled correctly; they are evaluated more than once.
   CHARSET, C1 and C2 must be assignable.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_idx                                               \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      charset = charset_big5_1;                                         \
    else                                                                \
      {                                                                 \
        charset = charset_big5_2;                                       \
        big5_idx -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                      \
      }                                                                 \
    c1 = big5_idx / (0xFF - 0xA1) + 0x21;                               \
    c2 = big5_idx % (0xFF - 0xA1) + 0x21;                               \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position codes C1
   and C2 into the Big5 byte pair B1 B2.

   CHARSET, C1 and C2 are parenthesized in the expansion so that
   expression arguments are handled correctly.  B1 and B2 must be
   assignable.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_idx                                               \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_idx += BIG5_SAME_ROW * (0xC9 - 0xA1);                        \
    b1 = big5_idx / BIG5_SAME_ROW + 0xA1;                               \
    b2 = big5_idx % BIG5_SAME_ROW;                                      \
    /* Offsets 0..62 map to 0x40..0x7E, the rest to 0xA1..0xFE.  */     \
    b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
  } while (0)
2905
2906 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2907 Check if a text is encoded in SJIS. If it is, return
2908 CODING_CATEGORY_MASK_SJIS, else return 0. */
2909
2910 static int
2911 detect_coding_sjis (src, src_end, multibytep)
2912 unsigned char *src, *src_end;
2913 int multibytep;
2914 {
2915 int c;
2916 /* Dummy for ONE_MORE_BYTE. */
2917 struct coding_system dummy_coding;
2918 struct coding_system *coding = &dummy_coding;
2919
2920 while (1)
2921 {
2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2923 if (c < 0x80)
2924 continue;
2925 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926 return 0;
2927 if (c <= 0x9F || c >= 0xE0)
2928 {
2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2930 if (c < 0x40 || c == 0x7F || c > 0xFC)
2931 return 0;
2932 }
2933 }
2934 }
2935
2936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2937 Check if a text is encoded in BIG5. If it is, return
2938 CODING_CATEGORY_MASK_BIG5, else return 0. */
2939
2940 static int
2941 detect_coding_big5 (src, src_end, multibytep)
2942 unsigned char *src, *src_end;
2943 int multibytep;
2944 {
2945 int c;
2946 /* Dummy for ONE_MORE_BYTE. */
2947 struct coding_system dummy_coding;
2948 struct coding_system *coding = &dummy_coding;
2949
2950 while (1)
2951 {
2952 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2953 if (c < 0x80)
2954 continue;
2955 if (c < 0xA1 || c > 0xFE)
2956 return 0;
2957 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2958 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2959 return 0;
2960 }
2961 }
2962
2963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2964 Check if a text is encoded in UTF-8. If it is, return
2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2966
/* Predicates classifying one octet C of a UTF-8 sequence by its
   high-order bits.  UTF_8_OCTET_MATCH_P is the shared building block:
   it is nonzero if the bits of C selected by MASK equal BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* 0xxxxxxx: a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
/* 110xxxxx .. 1111110x: leading octet of a 2- to 6-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2974
2975 static int
2976 detect_coding_utf_8 (src, src_end, multibytep)
2977 unsigned char *src, *src_end;
2978 int multibytep;
2979 {
2980 unsigned char c;
2981 int seq_maybe_bytes;
2982 /* Dummy for ONE_MORE_BYTE. */
2983 struct coding_system dummy_coding;
2984 struct coding_system *coding = &dummy_coding;
2985
2986 while (1)
2987 {
2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2989 if (UTF_8_1_OCTET_P (c))
2990 continue;
2991 else if (UTF_8_2_OCTET_LEADING_P (c))
2992 seq_maybe_bytes = 1;
2993 else if (UTF_8_3_OCTET_LEADING_P (c))
2994 seq_maybe_bytes = 2;
2995 else if (UTF_8_4_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 3;
2997 else if (UTF_8_5_OCTET_LEADING_P (c))
2998 seq_maybe_bytes = 4;
2999 else if (UTF_8_6_OCTET_LEADING_P (c))
3000 seq_maybe_bytes = 5;
3001 else
3002 return 0;
3003
3004 do
3005 {
3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3007 if (!UTF_8_EXTRA_OCTET_P (c))
3008 return 0;
3009 seq_maybe_bytes--;
3010 }
3011 while (seq_maybe_bytes > 0);
3012 }
3013 }
3014
3015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3016 Check if a text is encoded in UTF-16 by looking at its first two
3017 bytes. If they are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE;
3018 if they are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE;
3019 else return 0. */
3020
/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits identify the range, hence the mask 0xFC00; masking
   with 0xD800 itself would also match 0xDC00..0xDFFF and 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3030
3031 static int
3032 detect_coding_utf_16 (src, src_end, multibytep)
3033 unsigned char *src, *src_end;
3034 int multibytep;
3035 {
3036 unsigned char c1, c2;
3037 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3038 struct coding_system dummy_coding;
3039 struct coding_system *coding = &dummy_coding;
3040
3041 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3042 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3043
3044 if ((c1 == 0xFF) && (c2 == 0xFE))
3045 return CODING_CATEGORY_MASK_UTF_16_LE;
3046 else if ((c1 == 0xFE) && (c2 == 0xFF))
3047 return CODING_CATEGORY_MASK_UTF_16_BE;
3048 return 0;
3049 }
3050
3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3052 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3053
3054 static void
3055 decode_coding_sjis_big5 (coding, source, destination,
3056 src_bytes, dst_bytes, sjis_p)
3057 struct coding_system *coding;
3058 const unsigned char *source;
3059 unsigned char *destination;
3060 int src_bytes, dst_bytes;
3061 int sjis_p;
3062 {
3063 const unsigned char *src = source;
3064 const unsigned char *src_end = source + src_bytes;
3065 unsigned char *dst = destination;
3066 unsigned char *dst_end = destination + dst_bytes;
3067 /* SRC_BASE remembers the start position in source in each loop.
3068 The loop will be exited when there's not enough source code
3069 (within macro ONE_MORE_BYTE), or when there's not enough
3070 destination area to produce a character (within macro
3071 EMIT_CHAR). */
3072 const unsigned char *src_base;
3073 Lisp_Object translation_table;
3074
3075 if (NILP (Venable_character_translation))
3076 translation_table = Qnil;
3077 else
3078 {
3079 translation_table = coding->translation_table_for_decode;
3080 if (NILP (translation_table))
3081 translation_table = Vstandard_translation_table_for_decode;
3082 }
3083
3084 coding->produced_char = 0;
3085 while (1)
3086 {
3087 int c, charset, c1, c2 = 0;
3088
3089 src_base = src;
3090 ONE_MORE_BYTE (c1);
3091
3092 if (c1 < 0x80)
3093 {
3094 charset = CHARSET_ASCII;
3095 if (c1 < 0x20)
3096 {
3097 if (c1 == '\r')
3098 {
3099 if (coding->eol_type == CODING_EOL_CRLF)
3100 {
3101 ONE_MORE_BYTE (c2);
3102 if (c2 == '\n')
3103 c1 = c2;
3104 else
3105 /* To process C2 again, SRC is subtracted by 1. */
3106 src--;
3107 }
3108 else if (coding->eol_type == CODING_EOL_CR)
3109 c1 = '\n';
3110 }
3111 else if (c1 == '\n'
3112 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3113 && (coding->eol_type == CODING_EOL_CR
3114 || coding->eol_type == CODING_EOL_CRLF))
3115 {
3116 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3117 goto label_end_of_loop;
3118 }
3119 }
3120 }
3121 else
3122 {
3123 if (sjis_p)
3124 {
3125 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3126 goto label_invalid_code;
3127 if (c1 <= 0x9F || c1 >= 0xE0)
3128 {
3129 /* SJIS -> JISX0208 */
3130 ONE_MORE_BYTE (c2);
3131 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3132 goto label_invalid_code;
3133 DECODE_SJIS (c1, c2, c1, c2);
3134 charset = charset_jisx0208;
3135 }
3136 else
3137 /* SJIS -> JISX0201-Kana */
3138 charset = charset_katakana_jisx0201;
3139 }
3140 else
3141 {
3142 /* BIG5 -> Big5 */
3143 if (c1 < 0xA0 || c1 > 0xFE)
3144 goto label_invalid_code;
3145 ONE_MORE_BYTE (c2);
3146 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3147 goto label_invalid_code;
3148 DECODE_BIG5 (c1, c2, charset, c1, c2);
3149 }
3150 }
3151
3152 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3153 EMIT_CHAR (c);
3154 continue;
3155
3156 label_invalid_code:
3157 coding->errors++;
3158 src = src_base;
3159 c = *src++;
3160 EMIT_CHAR (c);
3161 }
3162
3163 label_end_of_loop:
3164 coding->consumed = coding->consumed_char = src_base - source;
3165 coding->produced = dst - destination;
3166 return;
3167 }
3168
3169 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3170 This function can encode charsets `ascii', `katakana-jisx0201',
3171 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3172 are sure that all these charsets are registered as official charset
3173 (i.e. do not have extended leading-codes). Characters of other
3174 charsets are produced without any encoding. If SJIS_P is 1, encode
3175 SJIS text, else encode BIG5 text. */
3176
3177 static void
3178 encode_coding_sjis_big5 (coding, source, destination,
3179 src_bytes, dst_bytes, sjis_p)
3180 struct coding_system *coding;
3181 unsigned char *source, *destination;
3182 int src_bytes, dst_bytes;
3183 int sjis_p;
3184 {
3185 unsigned char *src = source;
3186 unsigned char *src_end = source + src_bytes;
3187 unsigned char *dst = destination;
3188 unsigned char *dst_end = destination + dst_bytes;
3189 /* SRC_BASE remembers the start position in source in each loop.
3190 The loop will be exited when there's not enough source text to
3191 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3192 there's not enough destination area to produce encoded codes
3193 (within macro EMIT_BYTES). */
3194 unsigned char *src_base;
3195 Lisp_Object translation_table;
3196
3197 if (NILP (Venable_character_translation))
3198 translation_table = Qnil;
3199 else
3200 {
3201 translation_table = coding->translation_table_for_encode;
3202 if (NILP (translation_table))
3203 translation_table = Vstandard_translation_table_for_encode;
3204 }
3205
3206 while (1)
3207 {
3208 int c, charset, c1, c2;
3209
3210 src_base = src;
3211 ONE_MORE_CHAR (c);
3212
3213 /* Now encode the character C. */
3214 if (SINGLE_BYTE_CHAR_P (c))
3215 {
3216 switch (c)
3217 {
3218 case '\r':
3219 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3220 {
3221 EMIT_ONE_BYTE (c);
3222 break;
3223 }
3224 c = '\n';
3225 case '\n':
3226 if (coding->eol_type == CODING_EOL_CRLF)
3227 {
3228 EMIT_TWO_BYTES ('\r', c);
3229 break;
3230 }
3231 else if (coding->eol_type == CODING_EOL_CR)
3232 c = '\r';
3233 default:
3234 EMIT_ONE_BYTE (c);
3235 }
3236 }
3237 else
3238 {
3239 SPLIT_CHAR (c, charset, c1, c2);
3240 if (sjis_p)
3241 {
3242 if (charset == charset_jisx0208
3243 || charset == charset_jisx0208_1978)
3244 {
3245 ENCODE_SJIS (c1, c2, c1, c2);
3246 EMIT_TWO_BYTES (c1, c2);
3247 }
3248 else if (charset == charset_katakana_jisx0201)
3249 EMIT_ONE_BYTE (c1 | 0x80);
3250 else if (charset == charset_latin_jisx0201)
3251 EMIT_ONE_BYTE (c1);
3252 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3253 {
3254 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3255 if (CHARSET_WIDTH (charset) > 1)
3256 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3257 }
3258 else
3259 /* There's no way other than producing the internal
3260 codes as is. */
3261 EMIT_BYTES (src_base, src);
3262 }
3263 else
3264 {
3265 if (charset == charset_big5_1 || charset == charset_big5_2)
3266 {
3267 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3268 EMIT_TWO_BYTES (c1, c2);
3269 }
3270 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3271 {
3272 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3273 if (CHARSET_WIDTH (charset) > 1)
3274 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3275 }
3276 else
3277 /* There's no way other than producing the internal
3278 codes as is. */
3279 EMIT_BYTES (src_base, src);
3280 }
3281 }
3282 coding->consumed_char++;
3283 }
3284
3285 label_end_of_loop:
3286 coding->consumed = src_base - source;
3287 coding->produced = coding->produced_char = dst - destination;
3288 }
3289
3290 \f
3291 /*** 5. CCL handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3294 Check if a text is encoded in a coding system of which
3295 encoder/decoder are written in CCL program. If it is, return
3296 CODING_CATEGORY_MASK_CCL, else return 0. */
3297
3298 static int
3299 detect_coding_ccl (src, src_end, multibytep)
3300 unsigned char *src, *src_end;
3301 int multibytep;
3302 {
3303 unsigned char *valid;
3304 int c;
3305 /* Dummy for ONE_MORE_BYTE. */
3306 struct coding_system dummy_coding;
3307 struct coding_system *coding = &dummy_coding;
3308
3309 /* No coding system is assigned to coding-category-ccl. */
3310 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3311 return 0;
3312
3313 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3314 while (1)
3315 {
3316 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3317 if (! valid[c])
3318 return 0;
3319 }
3320 }
3321
3322 \f
3323 /*** 6. End-of-line handlers ***/
3324
3325 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3326
3327 static void
3328 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3329 struct coding_system *coding;
3330 const unsigned char *source;
3331 unsigned char *destination;
3332 int src_bytes, dst_bytes;
3333 {
3334 const unsigned char *src = source;
3335 unsigned char *dst = destination;
3336 const unsigned char *src_end = src + src_bytes;
3337 unsigned char *dst_end = dst + dst_bytes;
3338 Lisp_Object translation_table;
3339 /* SRC_BASE remembers the start position in source in each loop.
3340 The loop will be exited when there's not enough source code
3341 (within macro ONE_MORE_BYTE), or when there's not enough
3342 destination area to produce a character (within macro
3343 EMIT_CHAR). */
3344 const unsigned char *src_base;
3345 int c;
3346
3347 translation_table = Qnil;
3348 switch (coding->eol_type)
3349 {
3350 case CODING_EOL_CRLF:
3351 while (1)
3352 {
3353 src_base = src;
3354 ONE_MORE_BYTE (c);
3355 if (c == '\r')
3356 {
3357 ONE_MORE_BYTE (c);
3358 if (c != '\n')
3359 {
3360 src--;
3361 c = '\r';
3362 }
3363 }
3364 else if (c == '\n'
3365 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3366 {
3367 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3368 goto label_end_of_loop;
3369 }
3370 EMIT_CHAR (c);
3371 }
3372 break;
3373
3374 case CODING_EOL_CR:
3375 while (1)
3376 {
3377 src_base = src;
3378 ONE_MORE_BYTE (c);
3379 if (c == '\n')
3380 {
3381 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3382 {
3383 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3384 goto label_end_of_loop;
3385 }
3386 }
3387 else if (c == '\r')
3388 c = '\n';
3389 EMIT_CHAR (c);
3390 }
3391 break;
3392
3393 default: /* no need for EOL handling */
3394 while (1)
3395 {
3396 src_base = src;
3397 ONE_MORE_BYTE (c);
3398 EMIT_CHAR (c);
3399 }
3400 }
3401
3402 label_end_of_loop:
3403 coding->consumed = coding->consumed_char = src_base - source;
3404 coding->produced = dst - destination;
3405 return;
3406 }
3407
3408 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3409 format of end-of-line according to `coding->eol_type'. It also
3410 convert multibyte form 8-bit characters to unibyte if
3411 CODING->src_multibyte is nonzero. If `coding->mode &
3412 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3413 also means end-of-line. */
3414
3415 static void
3416 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3417 struct coding_system *coding;
3418 const unsigned char *source;
3419 unsigned char *destination;
3420 int src_bytes, dst_bytes;
3421 {
3422 const unsigned char *src = source;
3423 unsigned char *dst = destination;
3424 const unsigned char *src_end = src + src_bytes;
3425 unsigned char *dst_end = dst + dst_bytes;
3426 Lisp_Object translation_table;
3427 /* SRC_BASE remembers the start position in source in each loop.
3428 The loop will be exited when there's not enough source text to
3429 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3430 there's not enough destination area to produce encoded codes
3431 (within macro EMIT_BYTES). */
3432 const unsigned char *src_base;
3433 unsigned char *tmp;
3434 int c;
3435 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3436
3437 translation_table = Qnil;
3438 if (coding->src_multibyte
3439 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3440 {
3441 src_end--;
3442 src_bytes--;
3443 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3444 }
3445
3446 if (coding->eol_type == CODING_EOL_CRLF)
3447 {
3448 while (src < src_end)
3449 {
3450 src_base = src;
3451 c = *src++;
3452 if (c >= 0x20)
3453 EMIT_ONE_BYTE (c);
3454 else if (c == '\n' || (c == '\r' && selective_display))
3455 EMIT_TWO_BYTES ('\r', '\n');
3456 else
3457 EMIT_ONE_BYTE (c);
3458 }
3459 src_base = src;
3460 label_end_of_loop:
3461 ;
3462 }
3463 else
3464 {
3465 if (!dst_bytes || src_bytes <= dst_bytes)
3466 {
3467 safe_bcopy (src, dst, src_bytes);
3468 src_base = src_end;
3469 dst += src_bytes;
3470 }
3471 else
3472 {
3473 if (coding->src_multibyte
3474 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3475 dst_bytes--;
3476 safe_bcopy (src, dst, dst_bytes);
3477 src_base = src + dst_bytes;
3478 dst = destination + dst_bytes;
3479 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3480 }
3481 if (coding->eol_type == CODING_EOL_CR)
3482 {
3483 for (tmp = destination; tmp < dst; tmp++)
3484 if (*tmp == '\n') *tmp = '\r';
3485 }
3486 else if (selective_display)
3487 {
3488 for (tmp = destination; tmp < dst; tmp++)
3489 if (*tmp == '\r') *tmp = '\n';
3490 }
3491 }
3492 if (coding->src_multibyte)
3493 dst = destination + str_as_unibyte (destination, dst - destination);
3494
3495 coding->consumed = src_base - source;
3496 coding->produced = dst - destination;
3497 coding->produced_char = coding->produced;
3498 }
3499
3500 \f
3501 /*** 7. C library functions ***/
3502
3503 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3504 has a property `coding-system'. The value of this property is a
3505 vector of length 5 (called the coding-vector). Among elements of
3506 this vector, the first (element[0]) and the fifth (element[4])
3507 carry important information for decoding/encoding. Before
3508 decoding/encoding, this information should be set in fields of a
3509 structure of type `coding_system'.
3510
3511 The value of the property `coding-system' can be a symbol of another
3512 subsidiary coding-system. In that case, Emacs gets coding-vector
3513 from that symbol.
3514
3515 `element[0]' contains information to be set in `coding->type'. The
3516 value and its meaning is as follows:
3517
3518 0 -- coding_type_emacs_mule
3519 1 -- coding_type_sjis
3520 2 -- coding_type_iso2022
3521 3 -- coding_type_big5
3522 4 -- coding_type_ccl encoder/decoder written in CCL
5 -- coding_type_raw_text
3523 nil -- coding_type_no_conversion
3524 t -- coding_type_undecided (automatic conversion on decoding,
3525 no-conversion on encoding)
3526
3527 `element[4]' contains information to be set in `coding->flags' and
3528 `coding->spec'. The meaning varies by `coding->type'.
3529
3530 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3531 of length 32 (of which the first 17 sub-elements are used now).
3532 Meanings of these sub-elements are:
3533
3534 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3535 If the value is an integer of valid charset, the charset is
3536 assumed to be designated to graphic register N initially.
3537
3538 If the value is minus, it is a minus value of charset which
3539 reserves graphic register N, which means that the charset is
3540 not designated initially but should be designated to graphic
3541 register N just before encoding a character in that charset.
3542
3543 If the value is nil, graphic register N is never used on
3544 encoding.
3545
3546 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3547 Each value takes t or nil. See the section ISO2022 of
3548 `coding.h' for more information.
3549
3550 If `coding->type' is `coding_type_big5', element[4] is t to denote
3551 BIG5-ETen or nil to denote BIG5-HKU.
3552
3553 If `coding->type' takes the other value, element[4] is ignored.
3554
3555 Emacs Lisp's coding systems also carry information about format of
3556 end-of-line in a value of property `eol-type'. If the value is
3557 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3558 means CODING_EOL_CR. If it is not integer, it should be a vector
3559 of subsidiary coding systems of which property `eol-type' has one
3560 of the above values.
3561
3562 */
3563
3564 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3565 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3566 is setup so that no conversion is necessary and return -1, else
3567 return 0. */
3568
3569 int
3570 setup_coding_system (coding_system, coding)
3571 Lisp_Object coding_system;
3572 struct coding_system *coding;
3573 {
3574 Lisp_Object coding_spec, coding_type, eol_type, plist;
3575 Lisp_Object val;
3576
3577 /* At first, zero clear all members. */
3578 bzero (coding, sizeof (struct coding_system));
3579
3580 /* Initialize some fields required for all kinds of coding systems. */
3581 coding->symbol = coding_system;
3582 coding->heading_ascii = -1;
3583 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3584 coding->composing = COMPOSITION_DISABLED;
3585 coding->cmp_data = NULL;
3586
3587 if (NILP (coding_system))
3588 goto label_invalid_coding_system;
3589
3590 coding_spec = Fget (coding_system, Qcoding_system);
3591
3592 if (!VECTORP (coding_spec)
3593 || XVECTOR (coding_spec)->size != 5
3594 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3595 goto label_invalid_coding_system;
3596
3597 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3598 if (VECTORP (eol_type))
3599 {
3600 coding->eol_type = CODING_EOL_UNDECIDED;
3601 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3602 if (system_eol_type != CODING_EOL_LF)
3603 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3604 }
3605 else if (XFASTINT (eol_type) == 1)
3606 {
3607 coding->eol_type = CODING_EOL_CRLF;
3608 coding->common_flags
3609 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3610 }
3611 else if (XFASTINT (eol_type) == 2)
3612 {
3613 coding->eol_type = CODING_EOL_CR;
3614 coding->common_flags
3615 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3616 }
3617 else
3618 {
3619 coding->common_flags = 0;
3620 coding->eol_type = CODING_EOL_LF;
3621 }
3622
3623 coding_type = XVECTOR (coding_spec)->contents[0];
3624 /* Try short cut. */
3625 if (SYMBOLP (coding_type))
3626 {
3627 if (EQ (coding_type, Qt))
3628 {
3629 coding->type = coding_type_undecided;
3630 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3631 }
3632 else
3633 coding->type = coding_type_no_conversion;
3634 /* Initialize this member. Any thing other than
3635 CODING_CATEGORY_IDX_UTF_16_BE and
3636 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3637 special treatment in detect_eol. */
3638 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3639
3640 return 0;
3641 }
3642
3643 /* Get values of coding system properties:
3644 `post-read-conversion', `pre-write-conversion',
3645 `translation-table-for-decode', `translation-table-for-encode'. */
3646 plist = XVECTOR (coding_spec)->contents[3];
3647 /* Pre & post conversion functions should be disabled if
3648 inhibit_eol_conversion is nonzero. This is the case that a code
3649 conversion function is called while those functions are running. */
3650 if (! inhibit_pre_post_conversion)
3651 {
3652 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3653 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3654 }
3655 val = Fplist_get (plist, Qtranslation_table_for_decode);
3656 if (SYMBOLP (val))
3657 val = Fget (val, Qtranslation_table_for_decode);
3658 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3659 val = Fplist_get (plist, Qtranslation_table_for_encode);
3660 if (SYMBOLP (val))
3661 val = Fget (val, Qtranslation_table_for_encode);
3662 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3663 val = Fplist_get (plist, Qcoding_category);
3664 if (!NILP (val))
3665 {
3666 val = Fget (val, Qcoding_category_index);
3667 if (INTEGERP (val))
3668 coding->category_idx = XINT (val);
3669 else
3670 goto label_invalid_coding_system;
3671 }
3672 else
3673 goto label_invalid_coding_system;
3674
3675 /* If the coding system has non-nil `composition' property, enable
3676 composition handling. */
3677 val = Fplist_get (plist, Qcomposition);
3678 if (!NILP (val))
3679 coding->composing = COMPOSITION_NO;
3680
3681 /* If the coding system is ascii-incompatible, record it in
3682 common_flags. */
3683 val = Fplist_get (plist, Qascii_incompatible);
3684 if (! NILP (val))
3685 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3686
3687 switch (XFASTINT (coding_type))
3688 {
3689 case 0:
3690 coding->type = coding_type_emacs_mule;
3691 coding->common_flags
3692 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3693 if (!NILP (coding->post_read_conversion))
3694 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3695 if (!NILP (coding->pre_write_conversion))
3696 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3697 break;
3698
3699 case 1:
3700 coding->type = coding_type_sjis;
3701 coding->common_flags
3702 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3703 break;
3704
3705 case 2:
3706 coding->type = coding_type_iso2022;
3707 coding->common_flags
3708 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3709 {
3710 Lisp_Object val, temp;
3711 Lisp_Object *flags;
3712 int i, charset, reg_bits = 0;
3713
3714 val = XVECTOR (coding_spec)->contents[4];
3715
3716 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3717 goto label_invalid_coding_system;
3718
3719 flags = XVECTOR (val)->contents;
3720 coding->flags
3721 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3722 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3723 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3724 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3725 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3726 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3727 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3728 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3729 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3730 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3731 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3732 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3733 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3734 );
3735
3736 /* Invoke graphic register 0 to plane 0. */
3737 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3738 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3739 CODING_SPEC_ISO_INVOCATION (coding, 1)
3740 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3741 /* Not single shifting at first. */
3742 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3743 /* Beginning of buffer should also be regarded as bol. */
3744 CODING_SPEC_ISO_BOL (coding) = 1;
3745
3746 for (charset = 0; charset <= MAX_CHARSET; charset++)
3747 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3748 val = Vcharset_revision_alist;
3749 while (CONSP (val))
3750 {
3751 charset = get_charset_id (Fcar_safe (XCAR (val)));
3752 if (charset >= 0
3753 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3754 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3755 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3756 val = XCDR (val);
3757 }
3758
3759 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3760 FLAGS[REG] can be one of below:
3761 integer CHARSET: CHARSET occupies register I,
3762 t: designate nothing to REG initially, but can be used
3763 by any charsets,
3764 list of integer, nil, or t: designate the first
3765 element (if integer) to REG initially, the remaining
3766 elements (if integer) is designated to REG on request,
3767 if an element is t, REG can be used by any charsets,
3768 nil: REG is never used. */
3769 for (charset = 0; charset <= MAX_CHARSET; charset++)
3770 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3771 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3772 for (i = 0; i < 4; i++)
3773 {
3774 if ((INTEGERP (flags[i])
3775 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3776 || (charset = get_charset_id (flags[i])) >= 0)
3777 {
3778 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3779 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3780 }
3781 else if (EQ (flags[i], Qt))
3782 {
3783 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3784 reg_bits |= 1 << i;
3785 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3786 }
3787 else if (CONSP (flags[i]))
3788 {
3789 Lisp_Object tail;
3790 tail = flags[i];
3791
3792 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3793 if ((INTEGERP (XCAR (tail))
3794 && (charset = XINT (XCAR (tail)),
3795 CHARSET_VALID_P (charset)))
3796 || (charset = get_charset_id (XCAR (tail))) >= 0)
3797 {
3798 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3799 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3800 }
3801 else
3802 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3803 tail = XCDR (tail);
3804 while (CONSP (tail))
3805 {
3806 if ((INTEGERP (XCAR (tail))
3807 && (charset = XINT (XCAR (tail)),
3808 CHARSET_VALID_P (charset)))
3809 || (charset = get_charset_id (XCAR (tail))) >= 0)
3810 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3811 = i;
3812 else if (EQ (XCAR (tail), Qt))
3813 reg_bits |= 1 << i;
3814 tail = XCDR (tail);
3815 }
3816 }
3817 else
3818 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3819
3820 CODING_SPEC_ISO_DESIGNATION (coding, i)
3821 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3822 }
3823
3824 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3825 {
3826 /* REG 1 can be used only by locking shift in 7-bit env. */
3827 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3828 reg_bits &= ~2;
3829 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3830 /* Without any shifting, only REG 0 and 1 can be used. */
3831 reg_bits &= 3;
3832 }
3833
3834 if (reg_bits)
3835 for (charset = 0; charset <= MAX_CHARSET; charset++)
3836 {
3837 if (CHARSET_DEFINED_P (charset)
3838 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3839 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3840 {
3841 /* There exist some default graphic registers to be
3842 used by CHARSET. */
3843
3844 /* We had better avoid designating a charset of
3845 CHARS96 to REG 0 as far as possible. */
3846 if (CHARSET_CHARS (charset) == 96)
3847 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848 = (reg_bits & 2
3849 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3850 else
3851 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3852 = (reg_bits & 1
3853 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3854 }
3855 }
3856 }
3857 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3858 coding->spec.iso2022.last_invalid_designation_register = -1;
3859 break;
3860
3861 case 3:
3862 coding->type = coding_type_big5;
3863 coding->common_flags
3864 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3865 coding->flags
3866 = (NILP (XVECTOR (coding_spec)->contents[4])
3867 ? CODING_FLAG_BIG5_HKU
3868 : CODING_FLAG_BIG5_ETEN);
3869 break;
3870
3871 case 4:
3872 coding->type = coding_type_ccl;
3873 coding->common_flags
3874 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3875 {
3876 val = XVECTOR (coding_spec)->contents[4];
3877 if (! CONSP (val)
3878 || setup_ccl_program (&(coding->spec.ccl.decoder),
3879 XCAR (val)) < 0
3880 || setup_ccl_program (&(coding->spec.ccl.encoder),
3881 XCDR (val)) < 0)
3882 goto label_invalid_coding_system;
3883
3884 bzero (coding->spec.ccl.valid_codes, 256);
3885 val = Fplist_get (plist, Qvalid_codes);
3886 if (CONSP (val))
3887 {
3888 Lisp_Object this;
3889
3890 for (; CONSP (val); val = XCDR (val))
3891 {
3892 this = XCAR (val);
3893 if (INTEGERP (this)
3894 && XINT (this) >= 0 && XINT (this) < 256)
3895 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3896 else if (CONSP (this)
3897 && INTEGERP (XCAR (this))
3898 && INTEGERP (XCDR (this)))
3899 {
3900 int start = XINT (XCAR (this));
3901 int end = XINT (XCDR (this));
3902
3903 if (start >= 0 && start <= end && end < 256)
3904 while (start <= end)
3905 coding->spec.ccl.valid_codes[start++] = 1;
3906 }
3907 }
3908 }
3909 }
3910 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3911 coding->spec.ccl.cr_carryover = 0;
3912 coding->spec.ccl.eight_bit_carryover[0] = 0;
3913 break;
3914
3915 case 5:
3916 coding->type = coding_type_raw_text;
3917 break;
3918
3919 default:
3920 goto label_invalid_coding_system;
3921 }
3922 return 0;
3923
3924 label_invalid_coding_system:
3925 coding->type = coding_type_no_conversion;
3926 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3927 coding->common_flags = 0;
3928 coding->eol_type = CODING_EOL_UNDECIDED;
3929 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3930 return NILP (coding_system) ? 0 : -1;
3931 }
3932
3933 /* Free memory blocks allocated for storing composition information. */
3934
3935 void
3936 coding_free_composition_data (coding)
3937 struct coding_system *coding;
3938 {
3939 struct composition_data *cmp_data = coding->cmp_data, *next;
3940
3941 if (!cmp_data)
3942 return;
3943 /* Memory blocks are chained. At first, rewind to the first, then,
3944 free blocks one by one. */
3945 while (cmp_data->prev)
3946 cmp_data = cmp_data->prev;
3947 while (cmp_data)
3948 {
3949 next = cmp_data->next;
3950 xfree (cmp_data);
3951 cmp_data = next;
3952 }
3953 coding->cmp_data = NULL;
3954 }
3955
3956 /* Set `char_offset' member of all memory blocks pointed by
3957 coding->cmp_data to POS. */
3958
3959 void
3960 coding_adjust_composition_offset (coding, pos)
3961 struct coding_system *coding;
3962 int pos;
3963 {
3964 struct composition_data *cmp_data;
3965
3966 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3967 cmp_data->char_offset = pos;
3968 }
3969
3970 /* Setup raw-text or one of its subsidiaries in the structure
3971 coding_system CODING according to the already setup value eol_type
3972 in CODING. CODING should be setup for some coding system in
3973 advance. */
3974
3975 void
3976 setup_raw_text_coding_system (coding)
3977 struct coding_system *coding;
3978 {
3979 if (coding->type != coding_type_raw_text)
3980 {
3981 coding->symbol = Qraw_text;
3982 coding->type = coding_type_raw_text;
3983 if (coding->eol_type != CODING_EOL_UNDECIDED)
3984 {
3985 Lisp_Object subsidiaries;
3986 subsidiaries = Fget (Qraw_text, Qeol_type);
3987
3988 if (VECTORP (subsidiaries)
3989 && XVECTOR (subsidiaries)->size == 3)
3990 coding->symbol
3991 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3992 }
3993 setup_coding_system (coding->symbol, coding);
3994 }
3995 return;
3996 }
3997
3998 /* Emacs has a mechanism to automatically detect a coding system if it
3999 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4000 it's impossible to distinguish some coding systems accurately
4001 because they use the same range of codes. So, at first, coding
4002 systems are categorized into the following categories:
4003
4004 o coding-category-emacs-mule
4005
4006 The category for a coding system which has the same code range
4007 as Emacs' internal format. Assigned the coding-system (Lisp
4008 symbol) `emacs-mule' by default.
4009
4010 o coding-category-sjis
4011
4012 The category for a coding system which has the same code range
4013 as SJIS. Assigned the coding-system (Lisp
4014 symbol) `japanese-shift-jis' by default.
4015
4016 o coding-category-iso-7
4017
4018 The category for a coding system which has the same code range
4019 as ISO2022 of 7-bit environment. This doesn't use any locking
4020 shift and single shift functions. This can encode/decode all
4021 charsets. Assigned the coding-system (Lisp symbol)
4022 `iso-2022-7bit' by default.
4023
4024 o coding-category-iso-7-tight
4025
4026 Same as coding-category-iso-7 except that this can
4027 encode/decode only the specified charsets.
4028
4029 o coding-category-iso-8-1
4030
4031 The category for a coding system which has the same code range
4032 as ISO2022 of 8-bit environment and graphic plane 1 used only
4033 for DIMENSION1 charset. This doesn't use any locking shift
4034 and single shift functions. Assigned the coding-system (Lisp
4035 symbol) `iso-latin-1' by default.
4036
4037 o coding-category-iso-8-2
4038
4039 The category for a coding system which has the same code range
4040 as ISO2022 of 8-bit environment and graphic plane 1 used only
4041 for DIMENSION2 charset. This doesn't use any locking shift
4042 and single shift functions. Assigned the coding-system (Lisp
4043 symbol) `japanese-iso-8bit' by default.
4044
4045 o coding-category-iso-7-else
4046
4047 The category for a coding system which has the same code range
4048 as ISO2022 of 7-bit environment but uses locking shift or
4049 single shift functions. Assigned the coding-system (Lisp
4050 symbol) `iso-2022-7bit-lock' by default.
4051
4052 o coding-category-iso-8-else
4053
4054 The category for a coding system which has the same code range
4055 as ISO2022 of 8-bit environment but uses locking shift or
4056 single shift functions. Assigned the coding-system (Lisp
4057 symbol) `iso-2022-8bit-ss2' by default.
4058
4059 o coding-category-big5
4060
4061 The category for a coding system which has the same code range
4062 as BIG5. Assigned the coding-system (Lisp symbol)
4063 `cn-big5' by default.
4064
4065 o coding-category-utf-8
4066
4067 The category for a coding system which has the same code range
4068 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4069 symbol) `utf-8' by default.
4070
4071 o coding-category-utf-16-be
4072
4073 The category for a coding system in which a text has a
4074 Unicode signature (cf. Unicode Standard) in the order of BIG
4075 endian at the head. Assigned the coding-system (Lisp symbol)
4076 `utf-16-be' by default.
4077
4078 o coding-category-utf-16-le
4079
4080 The category for a coding system in which a text has a
4081 Unicode signature (cf. Unicode Standard) in the order of
4082 LITTLE endian at the head. Assigned the coding-system (Lisp
4083 symbol) `utf-16-le' by default.
4084
4085 o coding-category-ccl
4086
4087 The category for a coding system of which encoder/decoder is
4088 written in CCL programs. The default value is nil, i.e., no
4089 coding system is assigned.
4090
4091 o coding-category-binary
4092
4093 The category for a coding system not categorized in any of the
4094 above. Assigned the coding-system (Lisp symbol)
4095 `no-conversion' by default.
4096
4097 Each of them is a Lisp symbol and the value is an actual
4098 `coding-system' (this is also a Lisp symbol) assigned by a user.
4099 What Emacs does actually is to detect a category of coding system.
4100 Then, it uses a `coding-system' assigned to it. If Emacs can't
4101 decide a single possible category, it selects a category of the
4102 highest priority. Priorities of categories are also specified by a
4103 user in a Lisp variable `coding-category-list'.
4104
4105 */
4106
4107 static
4108 int ascii_skip_code[256];
4109
4110 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4111 If it detects possible coding systems, return an integer in which
4112 appropriate flag bits are set. Flag bits are defined by macros
4113 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4114 it should point the table `coding_priorities'. In that case, only
4115 the flag bit for a coding system of the highest priority is set in
4116 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4117 range 0x80..0x9F are in multibyte form.
4118
4119 How many ASCII characters are at the head is returned as *SKIP. */
4120
4121 static int
4122 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4123 unsigned char *source;
4124 int src_bytes, *priorities, *skip;
4125 int multibytep;
4126 {
4127 register unsigned char c;
4128 unsigned char *src = source, *src_end = source + src_bytes;
4129 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4130 int i;
4131
4132 /* At first, skip all ASCII characters and control characters except
4133 for three ISO2022 specific control characters. */
4134 ascii_skip_code[ISO_CODE_SO] = 0;
4135 ascii_skip_code[ISO_CODE_SI] = 0;
4136 ascii_skip_code[ISO_CODE_ESC] = 0;
4137
4138 label_loop_detect_coding:
4139 while (src < src_end && ascii_skip_code[*src]) src++;
4140 *skip = src - source;
4141
4142 if (src >= src_end)
4143 /* We found nothing other than ASCII. There's nothing to do. */
4144 return 0;
4145
4146 c = *src;
4147 /* The text seems to be encoded in some multilingual coding system.
4148 Now, try to find in which coding system the text is encoded. */
4149 if (c < 0x80)
4150 {
4151 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4152 /* C is an ISO2022 specific control code of C0. */
4153 mask = detect_coding_iso2022 (src, src_end, multibytep);
4154 if (mask == 0)
4155 {
4156 /* No valid ISO2022 code follows C. Try again. */
4157 src++;
4158 if (c == ISO_CODE_ESC)
4159 ascii_skip_code[ISO_CODE_ESC] = 1;
4160 else
4161 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4162 goto label_loop_detect_coding;
4163 }
4164 if (priorities)
4165 {
4166 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4167 {
4168 if (mask & priorities[i])
4169 return priorities[i];
4170 }
4171 return CODING_CATEGORY_MASK_RAW_TEXT;
4172 }
4173 }
4174 else
4175 {
4176 int try;
4177
4178 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4179 c = src[1] - 0x20;
4180
4181 if (c < 0xA0)
4182 {
4183 /* C is the first byte of SJIS character code,
4184 or a leading-code of Emacs' internal format (emacs-mule),
4185 or the first byte of UTF-16. */
4186 try = (CODING_CATEGORY_MASK_SJIS
4187 | CODING_CATEGORY_MASK_EMACS_MULE
4188 | CODING_CATEGORY_MASK_UTF_16_BE
4189 | CODING_CATEGORY_MASK_UTF_16_LE);
4190
4191 /* Or, if C is a special latin extra code,
4192 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4193 or is an ISO2022 control-sequence-introducer (CSI),
4194 we should also consider the possibility of ISO2022 codings. */
4195 if ((VECTORP (Vlatin_extra_code_table)
4196 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4197 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4198 || (c == ISO_CODE_CSI
4199 && (src < src_end
4200 && (*src == ']'
4201 || ((*src == '0' || *src == '1' || *src == '2')
4202 && src + 1 < src_end
4203 && src[1] == ']')))))
4204 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4205 | CODING_CATEGORY_MASK_ISO_8BIT);
4206 }
4207 else
4208 /* C is a character of ISO2022 in graphic plane right,
4209 or a SJIS's 1-byte character code (i.e. JISX0201),
4210 or the first byte of BIG5's 2-byte code,
4211 or the first byte of UTF-8/16. */
4212 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4213 | CODING_CATEGORY_MASK_ISO_8BIT
4214 | CODING_CATEGORY_MASK_SJIS
4215 | CODING_CATEGORY_MASK_BIG5
4216 | CODING_CATEGORY_MASK_UTF_8
4217 | CODING_CATEGORY_MASK_UTF_16_BE
4218 | CODING_CATEGORY_MASK_UTF_16_LE);
4219
4220 /* Or, we may have to consider the possibility of CCL. */
4221 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4222 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4223 ->spec.ccl.valid_codes)[c])
4224 try |= CODING_CATEGORY_MASK_CCL;
4225
4226 mask = 0;
4227 utf16_examined_p = iso2022_examined_p = 0;
4228 if (priorities)
4229 {
4230 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4231 {
4232 if (!iso2022_examined_p
4233 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4234 {
4235 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4236 iso2022_examined_p = 1;
4237 }
4238 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4239 mask |= detect_coding_sjis (src, src_end, multibytep);
4240 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4241 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4242 else if (!utf16_examined_p
4243 && (priorities[i] & try &
4244 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4245 {
4246 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4247 utf16_examined_p = 1;
4248 }
4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4250 mask |= detect_coding_big5 (src, src_end, multibytep);
4251 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4252 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4253 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4254 mask |= detect_coding_ccl (src, src_end, multibytep);
4255 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4256 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4257 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4258 mask |= CODING_CATEGORY_MASK_BINARY;
4259 if (mask & priorities[i])
4260 return priorities[i];
4261 }
4262 return CODING_CATEGORY_MASK_RAW_TEXT;
4263 }
4264 if (try & CODING_CATEGORY_MASK_ISO)
4265 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4266 if (try & CODING_CATEGORY_MASK_SJIS)
4267 mask |= detect_coding_sjis (src, src_end, multibytep);
4268 if (try & CODING_CATEGORY_MASK_BIG5)
4269 mask |= detect_coding_big5 (src, src_end, multibytep);
4270 if (try & CODING_CATEGORY_MASK_UTF_8)
4271 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4272 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4273 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4274 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4275 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4276 if (try & CODING_CATEGORY_MASK_CCL)
4277 mask |= detect_coding_ccl (src, src_end, multibytep);
4278 }
4279 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4280 }
4281
4282 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4283 The information of the detected coding system is set in CODING. */
4284
4285 void
4286 detect_coding (coding, src, src_bytes)
4287 struct coding_system *coding;
4288 const unsigned char *src;
4289 int src_bytes;
4290 {
4291 unsigned int idx;
4292 int skip, mask;
4293 Lisp_Object val;
4294
4295 val = Vcoding_category_list;
4296 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4297 coding->src_multibyte);
4298 coding->heading_ascii = skip;
4299
4300 if (!mask) return;
4301
4302 /* We found a single coding system of the highest priority in MASK. */
4303 idx = 0;
4304 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4305 if (! mask)
4306 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4307
4308 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4309
4310 if (coding->eol_type != CODING_EOL_UNDECIDED)
4311 {
4312 Lisp_Object tmp;
4313
4314 tmp = Fget (val, Qeol_type);
4315 if (VECTORP (tmp))
4316 val = XVECTOR (tmp)->contents[coding->eol_type];
4317 }
4318
4319 /* Setup this new coding system while preserving some slots. */
4320 {
4321 int src_multibyte = coding->src_multibyte;
4322 int dst_multibyte = coding->dst_multibyte;
4323
4324 setup_coding_system (val, coding);
4325 coding->src_multibyte = src_multibyte;
4326 coding->dst_multibyte = dst_multibyte;
4327 coding->heading_ascii = skip;
4328 }
4329 }
4330
4331 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4332 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4333 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4334
4335 How many non-eol characters are at the head is returned as *SKIP. */
4336
4337 #define MAX_EOL_CHECK_COUNT 3
4338
4339 static int
4340 detect_eol_type (source, src_bytes, skip)
4341 unsigned char *source;
4342 int src_bytes, *skip;
4343 {
4344 unsigned char *src = source, *src_end = src + src_bytes;
4345 unsigned char c;
4346 int total = 0; /* How many end-of-lines are found so far. */
4347 int eol_type = CODING_EOL_UNDECIDED;
4348 int this_eol_type;
4349
4350 *skip = 0;
4351
4352 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4353 {
4354 c = *src++;
4355 if (c == '\n' || c == '\r')
4356 {
4357 if (*skip == 0)
4358 *skip = src - 1 - source;
4359 total++;
4360 if (c == '\n')
4361 this_eol_type = CODING_EOL_LF;
4362 else if (src >= src_end || *src != '\n')
4363 this_eol_type = CODING_EOL_CR;
4364 else
4365 this_eol_type = CODING_EOL_CRLF, src++;
4366
4367 if (eol_type == CODING_EOL_UNDECIDED)
4368 /* This is the first end-of-line. */
4369 eol_type = this_eol_type;
4370 else if (eol_type != this_eol_type)
4371 {
4372 /* The found type is different from what found before. */
4373 eol_type = CODING_EOL_INCONSISTENT;
4374 break;
4375 }
4376 }
4377 }
4378
4379 if (*skip == 0)
4380 *skip = src_end - source;
4381 return eol_type;
4382 }
4383
4384 /* Like detect_eol_type, but detect EOL type in 2-octet
4385 big-endian/little-endian format for coding systems utf-16-be and
4386 utf-16-le. */
4387
4388 static int
4389 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4390 unsigned char *source;
4391 int src_bytes, *skip, big_endian_p;
4392 {
4393 unsigned char *src = source, *src_end = src + src_bytes;
4394 unsigned int c1, c2;
4395 int total = 0; /* How many end-of-lines are found so far. */
4396 int eol_type = CODING_EOL_UNDECIDED;
4397 int this_eol_type;
4398 int msb, lsb;
4399
4400 if (big_endian_p)
4401 msb = 0, lsb = 1;
4402 else
4403 msb = 1, lsb = 0;
4404
4405 *skip = 0;
4406
4407 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4408 {
4409 c1 = (src[msb] << 8) | (src[lsb]);
4410 src += 2;
4411
4412 if (c1 == '\n' || c1 == '\r')
4413 {
4414 if (*skip == 0)
4415 *skip = src - 2 - source;
4416 total++;
4417 if (c1 == '\n')
4418 {
4419 this_eol_type = CODING_EOL_LF;
4420 }
4421 else
4422 {
4423 if ((src + 1) >= src_end)
4424 {
4425 this_eol_type = CODING_EOL_CR;
4426 }
4427 else
4428 {
4429 c2 = (src[msb] << 8) | (src[lsb]);
4430 if (c2 == '\n')
4431 this_eol_type = CODING_EOL_CRLF, src += 2;
4432 else
4433 this_eol_type = CODING_EOL_CR;
4434 }
4435 }
4436
4437 if (eol_type == CODING_EOL_UNDECIDED)
4438 /* This is the first end-of-line. */
4439 eol_type = this_eol_type;
4440 else if (eol_type != this_eol_type)
4441 {
4442 /* The found type is different from what found before. */
4443 eol_type = CODING_EOL_INCONSISTENT;
4444 break;
4445 }
4446 }
4447 }
4448
4449 if (*skip == 0)
4450 *skip = src_end - source;
4451 return eol_type;
4452 }
4453
4454 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4455 is encoded. If it detects an appropriate format of end-of-line, it
4456 sets the information in *CODING. */
4457
4458 void
4459 detect_eol (coding, src, src_bytes)
4460 struct coding_system *coding;
4461 const unsigned char *src;
4462 int src_bytes;
4463 {
4464 Lisp_Object val;
4465 int skip;
4466 int eol_type;
4467
4468 switch (coding->category_idx)
4469 {
4470 case CODING_CATEGORY_IDX_UTF_16_BE:
4471 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4472 break;
4473 case CODING_CATEGORY_IDX_UTF_16_LE:
4474 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4475 break;
4476 default:
4477 eol_type = detect_eol_type (src, src_bytes, &skip);
4478 break;
4479 }
4480
4481 if (coding->heading_ascii > skip)
4482 coding->heading_ascii = skip;
4483 else
4484 skip = coding->heading_ascii;
4485
4486 if (eol_type == CODING_EOL_UNDECIDED)
4487 return;
4488 if (eol_type == CODING_EOL_INCONSISTENT)
4489 {
4490 #if 0
4491 /* This code is suppressed until we find a better way to
4492 distinguish raw text file and binary file. */
4493
4494 /* If we have already detected that the coding is raw-text, the
4495 coding should actually be no-conversion. */
4496 if (coding->type == coding_type_raw_text)
4497 {
4498 setup_coding_system (Qno_conversion, coding);
4499 return;
4500 }
4501 /* Else, let's decode only text code anyway. */
4502 #endif /* 0 */
4503 eol_type = CODING_EOL_LF;
4504 }
4505
4506 val = Fget (coding->symbol, Qeol_type);
4507 if (VECTORP (val) && XVECTOR (val)->size == 3)
4508 {
4509 int src_multibyte = coding->src_multibyte;
4510 int dst_multibyte = coding->dst_multibyte;
4511 struct composition_data *cmp_data = coding->cmp_data;
4512
4513 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4514 coding->src_multibyte = src_multibyte;
4515 coding->dst_multibyte = dst_multibyte;
4516 coding->heading_ascii = skip;
4517 coding->cmp_data = cmp_data;
4518 }
4519 }
4520
/* Number of bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to estimate the size
   of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO2022, the magnification recorded in the
   decoder program for CCL, and 2 otherwise.  CODING is parenthesized
   so that any pointer expression may be passed; it may be evaluated
   more than once.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
4529
4530 /* Return maximum size (bytes) of a buffer enough for decoding
4531 SRC_BYTES of text encoded in CODING. */
4532
4533 int
4534 decoding_buffer_size (coding, src_bytes)
4535 struct coding_system *coding;
4536 int src_bytes;
4537 {
4538 return (src_bytes * DECODING_BUFFER_MAG (coding)
4539 + CONVERSION_BUFFER_EXTRA_ROOM);
4540 }
4541
4542 /* Return maximum size (bytes) of a buffer enough for encoding
4543 SRC_BYTES of text to CODING. */
4544
4545 int
4546 encoding_buffer_size (coding, src_bytes)
4547 struct coding_system *coding;
4548 int src_bytes;
4549 {
4550 int magnification;
4551
4552 if (coding->type == coding_type_ccl)
4553 {
4554 magnification = coding->spec.ccl.encoder.buf_magnification;
4555 if (coding->eol_type == CODING_EOL_CRLF)
4556 magnification *= 2;
4557 }
4558 else if (CODING_REQUIRE_ENCODING (coding))
4559 magnification = 3;
4560 else
4561 magnification = 1;
4562
4563 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4564 }
4565
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;			/* size of data.  */
  int on_stack;			/* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, so this
   macro must be expanded directly in the function that uses BUF;
   larger ones are served by xmalloc.  BUF and LEN are parenthesized
   so that arbitrary expressions may be passed, but both are evaluated
   more than once and thus must be free of side effects.  */
#define allocate_conversion_buffer(buf, len)		\
  do {							\
    if ((len) < MAX_ALLOCA)				\
      {							\
	(buf).data = (unsigned char *) alloca (len);	\
	(buf).on_stack = 1;				\
      }							\
    else						\
      {							\
	(buf).data = (unsigned char *) xmalloc (len);	\
	(buf).on_stack = 0;				\
      }							\
    (buf).size = (len);					\
  } while (0)
4589
4590 /* Double the allocated memory for *BUF. */
4591 static void
4592 extend_conversion_buffer (buf)
4593 struct conversion_buffer *buf;
4594 {
4595 if (buf->on_stack)
4596 {
4597 unsigned char *save = buf->data;
4598 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4599 bcopy (save, buf->data, buf->size);
4600 buf->on_stack = 0;
4601 }
4602 else
4603 {
4604 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4605 }
4606 buf->size *= 2;
4607 }
4608
4609 /* Free the allocated memory for BUF if it is not on stack. */
4610 static void
4611 free_conversion_buffer (buf)
4612 struct conversion_buffer *buf;
4613 {
4614 if (!buf->on_stack)
4615 xfree (buf->data);
4616 }
4617
4618 int
4619 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4620 struct coding_system *coding;
4621 unsigned char *source, *destination;
4622 int src_bytes, dst_bytes, encodep;
4623 {
4624 struct ccl_program *ccl
4625 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4626 unsigned char *dst = destination;
4627
4628 ccl->suppress_error = coding->suppress_error;
4629 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4630 if (encodep)
4631 {
4632 /* On encoding, EOL format is converted within ccl_driver. For
4633 that, setup proper information in the structure CCL. */
4634 ccl->eol_type = coding->eol_type;
4635 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4636 ccl->eol_type = CODING_EOL_LF;
4637 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4638 ccl->eight_bit_control = coding->dst_multibyte;
4639 }
4640 else
4641 ccl->eight_bit_control = 1;
4642 ccl->multibyte = coding->src_multibyte;
4643 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4644 {
4645 /* Move carryover bytes to DESTINATION. */
4646 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4647 while (*p)
4648 *dst++ = *p++;
4649 coding->spec.ccl.eight_bit_carryover[0] = 0;
4650 if (dst_bytes)
4651 dst_bytes -= dst - destination;
4652 }
4653
4654 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4655 &(coding->consumed))
4656 + dst - destination);
4657
4658 if (encodep)
4659 {
4660 coding->produced_char = coding->produced;
4661 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4662 }
4663 else if (!ccl->eight_bit_control)
4664 {
4665 /* The produced bytes forms a valid multibyte sequence. */
4666 coding->produced_char
4667 = multibyte_chars_in_text (destination, coding->produced);
4668 coding->spec.ccl.eight_bit_carryover[0] = 0;
4669 }
4670 else
4671 {
4672 /* On decoding, the destination should always multibyte. But,
4673 CCL program might have been generated an invalid multibyte
4674 sequence. Here we make such a sequence valid as
4675 multibyte. */
4676 int bytes
4677 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4678
4679 if ((coding->consumed < src_bytes
4680 || !ccl->last_block)
4681 && coding->produced >= 1
4682 && destination[coding->produced - 1] >= 0x80)
4683 {
4684 /* We should not convert the tailing 8-bit codes to
4685 multibyte form even if they doesn't form a valid
4686 multibyte sequence. They may form a valid sequence in
4687 the next call. */
4688 int carryover = 0;
4689
4690 if (destination[coding->produced - 1] < 0xA0)
4691 carryover = 1;
4692 else if (coding->produced >= 2)
4693 {
4694 if (destination[coding->produced - 2] >= 0x80)
4695 {
4696 if (destination[coding->produced - 2] < 0xA0)
4697 carryover = 2;
4698 else if (coding->produced >= 3
4699 && destination[coding->produced - 3] >= 0x80
4700 && destination[coding->produced - 3] < 0xA0)
4701 carryover = 3;
4702 }
4703 }
4704 if (carryover > 0)
4705 {
4706 BCOPY_SHORT (destination + coding->produced - carryover,
4707 coding->spec.ccl.eight_bit_carryover,
4708 carryover);
4709 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4710 coding->produced -= carryover;
4711 }
4712 }
4713 coding->produced = str_as_multibyte (destination, bytes,
4714 coding->produced,
4715 &(coding->produced_char));
4716 }
4717
4718 switch (ccl->status)
4719 {
4720 case CCL_STAT_SUSPEND_BY_SRC:
4721 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4722 break;
4723 case CCL_STAT_SUSPEND_BY_DST:
4724 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4725 break;
4726 case CCL_STAT_QUIT:
4727 case CCL_STAT_INVALID_CMD:
4728 coding->result = CODING_FINISH_INTERRUPT;
4729 break;
4730 default:
4731 coding->result = CODING_FINISH_NORMAL;
4732 break;
4733 }
4734 return coding->result;
4735 }
4736
4737 /* Decode EOL format of the text at PTR of BYTES length destructively
4738 according to CODING->eol_type. This is called after the CCL
4739 program produced a decoded text at PTR. If we do CRLF->LF
4740 conversion, update CODING->produced and CODING->produced_char. */
4741
4742 static void
4743 decode_eol_post_ccl (coding, ptr, bytes)
4744 struct coding_system *coding;
4745 unsigned char *ptr;
4746 int bytes;
4747 {
4748 Lisp_Object val, saved_coding_symbol;
4749 unsigned char *pend = ptr + bytes;
4750 int dummy;
4751
4752 /* Remember the current coding system symbol. We set it back when
4753 an inconsistent EOL is found so that `last-coding-system-used' is
4754 set to the coding system that doesn't specify EOL conversion. */
4755 saved_coding_symbol = coding->symbol;
4756
4757 coding->spec.ccl.cr_carryover = 0;
4758 if (coding->eol_type == CODING_EOL_UNDECIDED)
4759 {
4760 /* Here, to avoid the call of setup_coding_system, we directly
4761 call detect_eol_type. */
4762 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4763 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4764 coding->eol_type = CODING_EOL_LF;
4765 if (coding->eol_type != CODING_EOL_UNDECIDED)
4766 {
4767 val = Fget (coding->symbol, Qeol_type);
4768 if (VECTORP (val) && XVECTOR (val)->size == 3)
4769 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4770 }
4771 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4772 }
4773
4774 if (coding->eol_type == CODING_EOL_LF
4775 || coding->eol_type == CODING_EOL_UNDECIDED)
4776 {
4777 /* We have nothing to do. */
4778 ptr = pend;
4779 }
4780 else if (coding->eol_type == CODING_EOL_CRLF)
4781 {
4782 unsigned char *pstart = ptr, *p = ptr;
4783
4784 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4785 && *(pend - 1) == '\r')
4786 {
4787 /* If the last character is CR, we can't handle it here
4788 because LF will be in the not-yet-decoded source text.
4789 Record that the CR is not yet processed. */
4790 coding->spec.ccl.cr_carryover = 1;
4791 coding->produced--;
4792 coding->produced_char--;
4793 pend--;
4794 }
4795 while (ptr < pend)
4796 {
4797 if (*ptr == '\r')
4798 {
4799 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4800 {
4801 *p++ = '\n';
4802 ptr += 2;
4803 }
4804 else
4805 {
4806 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4807 goto undo_eol_conversion;
4808 *p++ = *ptr++;
4809 }
4810 }
4811 else if (*ptr == '\n'
4812 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4813 goto undo_eol_conversion;
4814 else
4815 *p++ = *ptr++;
4816 continue;
4817
4818 undo_eol_conversion:
4819 /* We have faced with inconsistent EOL format at PTR.
4820 Convert all LFs before PTR back to CRLFs. */
4821 for (p--, ptr--; p >= pstart; p--)
4822 {
4823 if (*p == '\n')
4824 *ptr-- = '\n', *ptr-- = '\r';
4825 else
4826 *ptr-- = *p;
4827 }
4828 /* If carryover is recorded, cancel it because we don't
4829 convert CRLF anymore. */
4830 if (coding->spec.ccl.cr_carryover)
4831 {
4832 coding->spec.ccl.cr_carryover = 0;
4833 coding->produced++;
4834 coding->produced_char++;
4835 pend++;
4836 }
4837 p = ptr = pend;
4838 coding->eol_type = CODING_EOL_LF;
4839 coding->symbol = saved_coding_symbol;
4840 }
4841 if (p < pend)
4842 {
4843 /* As each two-byte sequence CRLF was converted to LF, (PEND
4844 - P) is the number of deleted characters. */
4845 coding->produced -= pend - p;
4846 coding->produced_char -= pend - p;
4847 }
4848 }
4849 else /* i.e. coding->eol_type == CODING_EOL_CR */
4850 {
4851 unsigned char *p = ptr;
4852
4853 for (; ptr < pend; ptr++)
4854 {
4855 if (*ptr == '\r')
4856 *ptr = '\n';
4857 else if (*ptr == '\n'
4858 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4859 {
4860 for (; p < ptr; p++)
4861 {
4862 if (*p == '\n')
4863 *p = '\r';
4864 }
4865 ptr = pend;
4866 coding->eol_type = CODING_EOL_LF;
4867 coding->symbol = saved_coding_symbol;
4868 }
4869 }
4870 }
4871 }
4872
4873 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4874 decoding, it may detect coding system and format of end-of-line if
4875 those are not yet decided. The source should be unibyte, the
4876 result is multibyte if CODING->dst_multibyte is nonzero, else
4877 unibyte. */
4878
4879 int
4880 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4881 struct coding_system *coding;
4882 const unsigned char *source;
4883 unsigned char *destination;
4884 int src_bytes, dst_bytes;
4885 {
4886 int extra = 0;
4887
4888 if (coding->type == coding_type_undecided)
4889 detect_coding (coding, source, src_bytes);
4890
4891 if (coding->eol_type == CODING_EOL_UNDECIDED
4892 && coding->type != coding_type_ccl)
4893 {
4894 detect_eol (coding, source, src_bytes);
4895 /* We had better recover the original eol format if we
4896 encounter an inconsistent eol format while decoding. */
4897 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4898 }
4899
4900 coding->produced = coding->produced_char = 0;
4901 coding->consumed = coding->consumed_char = 0;
4902 coding->errors = 0;
4903 coding->result = CODING_FINISH_NORMAL;
4904
4905 switch (coding->type)
4906 {
4907 case coding_type_sjis:
4908 decode_coding_sjis_big5 (coding, source, destination,
4909 src_bytes, dst_bytes, 1);
4910 break;
4911
4912 case coding_type_iso2022:
4913 decode_coding_iso2022 (coding, source, destination,
4914 src_bytes, dst_bytes);
4915 break;
4916
4917 case coding_type_big5:
4918 decode_coding_sjis_big5 (coding, source, destination,
4919 src_bytes, dst_bytes, 0);
4920 break;
4921
4922 case coding_type_emacs_mule:
4923 decode_coding_emacs_mule (coding, source, destination,
4924 src_bytes, dst_bytes);
4925 break;
4926
4927 case coding_type_ccl:
4928 if (coding->spec.ccl.cr_carryover)
4929 {
4930 /* Put the CR which was not processed by the previous call
4931 of decode_eol_post_ccl in DESTINATION. It will be
4932 decoded together with the following LF by the call to
4933 decode_eol_post_ccl below. */
4934 *destination = '\r';
4935 coding->produced++;
4936 coding->produced_char++;
4937 dst_bytes--;
4938 extra = coding->spec.ccl.cr_carryover;
4939 }
4940 ccl_coding_driver (coding, source, destination + extra,
4941 src_bytes, dst_bytes, 0);
4942 if (coding->eol_type != CODING_EOL_LF)
4943 {
4944 coding->produced += extra;
4945 coding->produced_char += extra;
4946 decode_eol_post_ccl (coding, destination, coding->produced);
4947 }
4948 break;
4949
4950 default:
4951 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4952 }
4953
4954 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4955 && coding->mode & CODING_MODE_LAST_BLOCK
4956 && coding->consumed == src_bytes)
4957 coding->result = CODING_FINISH_NORMAL;
4958
4959 if (coding->mode & CODING_MODE_LAST_BLOCK
4960 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4961 {
4962 const unsigned char *src = source + coding->consumed;
4963 unsigned char *dst = destination + coding->produced;
4964
4965 src_bytes -= coding->consumed;
4966 coding->errors++;
4967 if (COMPOSING_P (coding))
4968 DECODE_COMPOSITION_END ('1');
4969 while (src_bytes--)
4970 {
4971 int c = *src++;
4972 dst += CHAR_STRING (c, dst);
4973 coding->produced_char++;
4974 }
4975 coding->consumed = coding->consumed_char = src - source;
4976 coding->produced = dst - destination;
4977 coding->result = CODING_FINISH_NORMAL;
4978 }
4979
4980 if (!coding->dst_multibyte)
4981 {
4982 coding->produced = str_as_unibyte (destination, coding->produced);
4983 coding->produced_char = coding->produced;
4984 }
4985
4986 return coding->result;
4987 }
4988
4989 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4990 multibyteness of the source is CODING->src_multibyte, the
4991 multibyteness of the result is always unibyte. */
4992
4993 int
4994 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4995 struct coding_system *coding;
4996 const unsigned char *source;
4997 unsigned char *destination;
4998 int src_bytes, dst_bytes;
4999 {
5000 coding->produced = coding->produced_char = 0;
5001 coding->consumed = coding->consumed_char = 0;
5002 coding->errors = 0;
5003 coding->result = CODING_FINISH_NORMAL;
5004 if (coding->eol_type == CODING_EOL_UNDECIDED)
5005 coding->eol_type = CODING_EOL_LF;
5006
5007 switch (coding->type)
5008 {
5009 case coding_type_sjis:
5010 encode_coding_sjis_big5 (coding, source, destination,
5011 src_bytes, dst_bytes, 1);
5012 break;
5013
5014 case coding_type_iso2022:
5015 encode_coding_iso2022 (coding, source, destination,
5016 src_bytes, dst_bytes);
5017 break;
5018
5019 case coding_type_big5:
5020 encode_coding_sjis_big5 (coding, source, destination,
5021 src_bytes, dst_bytes, 0);
5022 break;
5023
5024 case coding_type_emacs_mule:
5025 encode_coding_emacs_mule (coding, source, destination,
5026 src_bytes, dst_bytes);
5027 break;
5028
5029 case coding_type_ccl:
5030 ccl_coding_driver (coding, source, destination,
5031 src_bytes, dst_bytes, 1);
5032 break;
5033
5034 default:
5035 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5036 }
5037
5038 if (coding->mode & CODING_MODE_LAST_BLOCK
5039 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5040 {
5041 const unsigned char *src = source + coding->consumed;
5042 unsigned char *dst = destination + coding->produced;
5043
5044 if (coding->type == coding_type_iso2022)
5045 ENCODE_RESET_PLANE_AND_REGISTER;
5046 if (COMPOSING_P (coding))
5047 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5048 if (coding->consumed < src_bytes)
5049 {
5050 int len = src_bytes - coding->consumed;
5051
5052 BCOPY_SHORT (src, dst, len);
5053 if (coding->src_multibyte)
5054 len = str_as_unibyte (dst, len);
5055 dst += len;
5056 coding->consumed = src_bytes;
5057 }
5058 coding->produced = coding->produced_char = dst - destination;
5059 coding->result = CODING_FINISH_NORMAL;
5060 }
5061
5062 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5063 && coding->consumed == src_bytes)
5064 coding->result = CODING_FINISH_NORMAL;
5065
5066 return coding->result;
5067 }
5068
5069 /* Scan text in the region between *BEG and *END (byte positions),
5070 skip characters which we don't have to decode by coding system
5071 CODING at the head and tail, then set *BEG and *END to the region
5072 of the text we actually have to convert. The caller should move
5073 the gap out of the region in advance if the region is from a
5074 buffer.
5075
5076 If STR is not NULL, *BEG and *END are indices into STR. */
5077
5078 static void
5079 shrink_decoding_region (beg, end, coding, str)
5080 int *beg, *end;
5081 struct coding_system *coding;
5082 unsigned char *str;
5083 {
5084 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5085 int eol_conversion;
5086 Lisp_Object translation_table;
5087
5088 if (coding->type == coding_type_ccl
5089 || coding->type == coding_type_undecided
5090 || coding->eol_type != CODING_EOL_LF
5091 || !NILP (coding->post_read_conversion)
5092 || coding->composing != COMPOSITION_DISABLED)
5093 {
5094 /* We can't skip any data. */
5095 return;
5096 }
5097 if (coding->type == coding_type_no_conversion
5098 || coding->type == coding_type_raw_text
5099 || coding->type == coding_type_emacs_mule)
5100 {
5101 /* We need no conversion, but don't have to skip any data here.
5102 Decoding routine handles them effectively anyway. */
5103 return;
5104 }
5105
5106 translation_table = coding->translation_table_for_decode;
5107 if (NILP (translation_table) && !NILP (Venable_character_translation))
5108 translation_table = Vstandard_translation_table_for_decode;
5109 if (CHAR_TABLE_P (translation_table))
5110 {
5111 int i;
5112 for (i = 0; i < 128; i++)
5113 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5114 break;
5115 if (i < 128)
5116 /* Some ASCII character should be translated. We give up
5117 shrinking. */
5118 return;
5119 }
5120
5121 if (coding->heading_ascii >= 0)
5122 /* Detection routine has already found how much we can skip at the
5123 head. */
5124 *beg += coding->heading_ascii;
5125
5126 if (str)
5127 {
5128 begp_orig = begp = str + *beg;
5129 endp_orig = endp = str + *end;
5130 }
5131 else
5132 {
5133 begp_orig = begp = BYTE_POS_ADDR (*beg);
5134 endp_orig = endp = begp + *end - *beg;
5135 }
5136
5137 eol_conversion = (coding->eol_type == CODING_EOL_CR
5138 || coding->eol_type == CODING_EOL_CRLF);
5139
5140 switch (coding->type)
5141 {
5142 case coding_type_sjis:
5143 case coding_type_big5:
5144 /* We can skip all ASCII characters at the head. */
5145 if (coding->heading_ascii < 0)
5146 {
5147 if (eol_conversion)
5148 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5149 else
5150 while (begp < endp && *begp < 0x80) begp++;
5151 }
5152 /* We can skip all ASCII characters at the tail except for the
5153 second byte of SJIS or BIG5 code. */
5154 if (eol_conversion)
5155 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5156 else
5157 while (begp < endp && endp[-1] < 0x80) endp--;
5158 /* Do not consider LF as ascii if preceded by CR, since that
5159 confuses eol decoding. */
5160 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5161 endp++;
5162 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5163 endp++;
5164 break;
5165
5166 case coding_type_iso2022:
5167 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5168 /* We can't skip any data. */
5169 break;
5170 if (coding->heading_ascii < 0)
5171 {
5172 /* We can skip all ASCII characters at the head except for a
5173 few control codes. */
5174 while (begp < endp && (c = *begp) < 0x80
5175 && c != ISO_CODE_CR && c != ISO_CODE_SO
5176 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5177 && (!eol_conversion || c != ISO_CODE_LF))
5178 begp++;
5179 }
5180 switch (coding->category_idx)
5181 {
5182 case CODING_CATEGORY_IDX_ISO_8_1:
5183 case CODING_CATEGORY_IDX_ISO_8_2:
5184 /* We can skip all ASCII characters at the tail. */
5185 if (eol_conversion)
5186 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5187 else
5188 while (begp < endp && endp[-1] < 0x80) endp--;
5189 /* Do not consider LF as ascii if preceded by CR, since that
5190 confuses eol decoding. */
5191 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5192 endp++;
5193 break;
5194
5195 case CODING_CATEGORY_IDX_ISO_7:
5196 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5197 {
5198 /* We can skip all characters at the tail except for 8-bit
5199 codes and ESC and the following 2-byte at the tail. */
5200 unsigned char *eight_bit = NULL;
5201
5202 if (eol_conversion)
5203 while (begp < endp
5204 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5205 {
5206 if (!eight_bit && c & 0x80) eight_bit = endp;
5207 endp--;
5208 }
5209 else
5210 while (begp < endp
5211 && (c = endp[-1]) != ISO_CODE_ESC)
5212 {
5213 if (!eight_bit && c & 0x80) eight_bit = endp;
5214 endp--;
5215 }
5216 /* Do not consider LF as ascii if preceded by CR, since that
5217 confuses eol decoding. */
5218 if (begp < endp && endp < endp_orig
5219 && endp[-1] == '\r' && endp[0] == '\n')
5220 endp++;
5221 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5222 {
5223 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5224 /* This is an ASCII designation sequence. We can
5225 surely skip the tail. But, if we have
5226 encountered an 8-bit code, skip only the codes
5227 after that. */
5228 endp = eight_bit ? eight_bit : endp + 2;
5229 else
5230 /* Hmmm, we can't skip the tail. */
5231 endp = endp_orig;
5232 }
5233 else if (eight_bit)
5234 endp = eight_bit;
5235 }
5236 }
5237 break;
5238
5239 default:
5240 abort ();
5241 }
5242 *beg += begp - begp_orig;
5243 *end += endp - endp_orig;
5244 return;
5245 }
5246
5247 /* Like shrink_decoding_region but for encoding. */
5248
5249 static void
5250 shrink_encoding_region (beg, end, coding, str)
5251 int *beg, *end;
5252 struct coding_system *coding;
5253 unsigned char *str;
5254 {
5255 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5256 int eol_conversion;
5257 Lisp_Object translation_table;
5258
5259 if (coding->type == coding_type_ccl
5260 || coding->eol_type == CODING_EOL_CRLF
5261 || coding->eol_type == CODING_EOL_CR
5262 || (coding->cmp_data && coding->cmp_data->used > 0))
5263 {
5264 /* We can't skip any data. */
5265 return;
5266 }
5267 if (coding->type == coding_type_no_conversion
5268 || coding->type == coding_type_raw_text
5269 || coding->type == coding_type_emacs_mule
5270 || coding->type == coding_type_undecided)
5271 {
5272 /* We need no conversion, but don't have to skip any data here.
5273 Encoding routine handles them effectively anyway. */
5274 return;
5275 }
5276
5277 translation_table = coding->translation_table_for_encode;
5278 if (NILP (translation_table) && !NILP (Venable_character_translation))
5279 translation_table = Vstandard_translation_table_for_encode;
5280 if (CHAR_TABLE_P (translation_table))
5281 {
5282 int i;
5283 for (i = 0; i < 128; i++)
5284 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5285 break;
5286 if (i < 128)
5287 /* Some ASCII character should be translated. We give up
5288 shrinking. */
5289 return;
5290 }
5291
5292 if (str)
5293 {
5294 begp_orig = begp = str + *beg;
5295 endp_orig = endp = str + *end;
5296 }
5297 else
5298 {
5299 begp_orig = begp = BYTE_POS_ADDR (*beg);
5300 endp_orig = endp = begp + *end - *beg;
5301 }
5302
5303 eol_conversion = (coding->eol_type == CODING_EOL_CR
5304 || coding->eol_type == CODING_EOL_CRLF);
5305
5306 /* Here, we don't have to check coding->pre_write_conversion because
5307 the caller is expected to have handled it already. */
5308 switch (coding->type)
5309 {
5310 case coding_type_iso2022:
5311 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5312 /* We can't skip any data. */
5313 break;
5314 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5315 {
5316 unsigned char *bol = begp;
5317 while (begp < endp && *begp < 0x80)
5318 {
5319 begp++;
5320 if (begp[-1] == '\n')
5321 bol = begp;
5322 }
5323 begp = bol;
5324 goto label_skip_tail;
5325 }
5326 /* fall down ... */
5327
5328 case coding_type_sjis:
5329 case coding_type_big5:
5330 /* We can skip all ASCII characters at the head and tail. */
5331 if (eol_conversion)
5332 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5333 else
5334 while (begp < endp && *begp < 0x80) begp++;
5335 label_skip_tail:
5336 if (eol_conversion)
5337 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5338 else
5339 while (begp < endp && *(endp - 1) < 0x80) endp--;
5340 break;
5341
5342 default:
5343 abort ();
5344 }
5345
5346 *beg += begp - begp_orig;
5347 *end += endp - endp_orig;
5348 return;
5349 }
5350
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END with
   shrink_encoding_region if ENCODEP is nonzero, else with
   shrink_decoding_region, provided that the region is longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5364
5365 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5366 Vlast_coding_system_used and the remaining elements are buffers to
5367 kill. */
5368 static Lisp_Object
5369 code_convert_region_unwind (arg)
5370 Lisp_Object arg;
5371 {
5372 struct gcpro gcpro1;
5373 GCPRO1 (arg);
5374
5375 inhibit_pre_post_conversion = 0;
5376 Vlast_coding_system_used = XCAR (arg);
5377 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5378 Fkill_buffer (XCAR (arg));
5379
5380 UNGCPRO;
5381 return Qnil;
5382 }
5383
5384 /* Store information about all compositions in the range FROM and TO
5385 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5386 buffer or a string, defaults to the current buffer. */
5387
5388 void
5389 coding_save_composition (coding, from, to, obj)
5390 struct coding_system *coding;
5391 int from, to;
5392 Lisp_Object obj;
5393 {
5394 Lisp_Object prop;
5395 int start, end;
5396
5397 if (coding->composing == COMPOSITION_DISABLED)
5398 return;
5399 if (!coding->cmp_data)
5400 coding_allocate_composition_data (coding, from);
5401 if (!find_composition (from, to, &start, &end, &prop, obj)
5402 || end > to)
5403 return;
5404 if (start < from
5405 && (!find_composition (end, to, &start, &end, &prop, obj)
5406 || end > to))
5407 return;
5408 coding->composing = COMPOSITION_NO;
5409 do
5410 {
5411 if (COMPOSITION_VALID_P (start, end, prop))
5412 {
5413 enum composition_method method = COMPOSITION_METHOD (prop);
5414 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5415 >= COMPOSITION_DATA_SIZE)
5416 coding_allocate_composition_data (coding, from);
5417 /* For relative composition, we remember start and end
5418 positions, for the other compositions, we also remember
5419 components. */
5420 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5421 if (method != COMPOSITION_RELATIVE)
5422 {
5423 /* We must store a*/
5424 Lisp_Object val, ch;
5425
5426 val = COMPOSITION_COMPONENTS (prop);
5427 if (CONSP (val))
5428 while (CONSP (val))
5429 {
5430 ch = XCAR (val), val = XCDR (val);
5431 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5432 }
5433 else if (VECTORP (val) || STRINGP (val))
5434 {
5435 int len = (VECTORP (val)
5436 ? XVECTOR (val)->size : SCHARS (val));
5437 int i;
5438 for (i = 0; i < len; i++)
5439 {
5440 ch = (STRINGP (val)
5441 ? Faref (val, make_number (i))
5442 : XVECTOR (val)->contents[i]);
5443 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5444 }
5445 }
5446 else /* INTEGERP (val) */
5447 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5448 }
5449 CODING_ADD_COMPOSITION_END (coding, end - from);
5450 }
5451 start = end;
5452 }
5453 while (start < to
5454 && find_composition (start, to, &start, &end, &prop, obj)
5455 && end <= to);
5456
5457 /* Make coding->cmp_data point to the first memory block. */
5458 while (coding->cmp_data->prev)
5459 coding->cmp_data = coding->cmp_data->prev;
5460 coding->cmp_data_start = 0;
5461 }
5462
5463 /* Reflect the saved information about compositions to OBJ.
5464 CODING->cmp_data points to a memory block for the information. OBJ
5465 is a buffer or a string, defaults to the current buffer. */
5466
5467 void
5468 coding_restore_composition (coding, obj)
5469 struct coding_system *coding;
5470 Lisp_Object obj;
5471 {
5472 struct composition_data *cmp_data = coding->cmp_data;
5473
5474 if (!cmp_data)
5475 return;
5476
5477 while (cmp_data->prev)
5478 cmp_data = cmp_data->prev;
5479
5480 while (cmp_data)
5481 {
5482 int i;
5483
5484 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5485 i += cmp_data->data[i])
5486 {
5487 int *data = cmp_data->data + i;
5488 enum composition_method method = (enum composition_method) data[3];
5489 Lisp_Object components;
5490
5491 if (data[0] < 0 || i + data[0] > cmp_data->used)
5492 /* Invalid composition data. */
5493 break;
5494
5495 if (method == COMPOSITION_RELATIVE)
5496 components = Qnil;
5497 else
5498 {
5499 int len = data[0] - 4, j;
5500 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5501
5502 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5503 && len % 2 == 0)
5504 len --;
5505 if (len < 1)
5506 /* Invalid composition data. */
5507 break;
5508 for (j = 0; j < len; j++)
5509 args[j] = make_number (data[4 + j]);
5510 components = (method == COMPOSITION_WITH_ALTCHARS
5511 ? Fstring (len, args)
5512 : Fvector (len, args));
5513 }
5514 compose_text (data[1], data[2], components, Qnil, obj);
5515 }
5516 cmp_data = cmp_data->next;
5517 }
5518 }
5519
5520 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5521 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5522 coding system CODING, and return the status code of code conversion
5523 (currently, this value has no meaning).
5524
5525 How many characters (and bytes) are converted to how many
5526 characters (and bytes) are recorded in members of the structure
5527 CODING.
5528
5529 If REPLACE is nonzero, we do various things as if the original text
5530 is deleted and a new text is inserted. See the comments in
5531 replace_range (insdel.c) to know what we are doing.
5532
5533 If REPLACE is zero, it is assumed that the source text is unibyte.
5534 Otherwise, it is assumed that the source text is multibyte. */
5535
5536 int
5537 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5538 int from, from_byte, to, to_byte, encodep, replace;
5539 struct coding_system *coding;
5540 {
5541 int len = to - from, len_byte = to_byte - from_byte;
5542 int nchars_del = 0, nbytes_del = 0;
5543 int require, inserted, inserted_byte;
5544 int head_skip, tail_skip, total_skip = 0;
5545 Lisp_Object saved_coding_symbol;
5546 int first = 1;
5547 unsigned char *src, *dst;
5548 Lisp_Object deletion;
5549 int orig_point = PT, orig_len = len;
5550 int prev_Z;
5551 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5552
5553 deletion = Qnil;
5554 saved_coding_symbol = coding->symbol;
5555
5556 if (from < PT && PT < to)
5557 {
5558 TEMP_SET_PT_BOTH (from, from_byte);
5559 orig_point = from;
5560 }
5561
5562 if (replace)
5563 {
5564 int saved_from = from;
5565 int saved_inhibit_modification_hooks;
5566
5567 prepare_to_modify_buffer (from, to, &from);
5568 if (saved_from != from)
5569 {
5570 to = from + len;
5571 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5572 len_byte = to_byte - from_byte;
5573 }
5574
5575 /* The code conversion routine can not preserve text properties
5576 for now. So, we must remove all text properties in the
5577 region. Here, we must suppress all modification hooks. */
5578 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5579 inhibit_modification_hooks = 1;
5580 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5581 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5582 }
5583
5584 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5585 {
5586 /* We must detect encoding of text and eol format. */
5587
5588 if (from < GPT && to > GPT)
5589 move_gap_both (from, from_byte);
5590 if (coding->type == coding_type_undecided)
5591 {
5592 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5593 if (coding->type == coding_type_undecided)
5594 {
5595 /* It seems that the text contains only ASCII, but we
5596 should not leave it undecided because the deeper
5597 decoding routine (decode_coding) tries to detect the
5598 encodings again in vain. */
5599 coding->type = coding_type_emacs_mule;
5600 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5601 /* As emacs-mule decoder will handle composition, we
5602 need this setting to allocate coding->cmp_data
5603 later. */
5604 coding->composing = COMPOSITION_NO;
5605 }
5606 }
5607 if (coding->eol_type == CODING_EOL_UNDECIDED
5608 && coding->type != coding_type_ccl)
5609 {
5610 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5611 if (coding->eol_type == CODING_EOL_UNDECIDED)
5612 coding->eol_type = CODING_EOL_LF;
5613 /* We had better recover the original eol format if we
5614 encounter an inconsistent eol format while decoding. */
5615 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5616 }
5617 }
5618
5619 /* Now we convert the text. */
5620
5621 /* For encoding, we must process pre-write-conversion in advance. */
5622 if (! inhibit_pre_post_conversion
5623 && encodep
5624 && SYMBOLP (coding->pre_write_conversion)
5625 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5626 {
5627 /* The function in pre-write-conversion may put a new text in a
5628 new buffer. */
5629 struct buffer *prev = current_buffer;
5630 Lisp_Object new;
5631
5632 record_unwind_protect (code_convert_region_unwind,
5633 Fcons (Vlast_coding_system_used, Qnil));
5634 /* We should not call any more pre-write/post-read-conversion
5635 functions while this pre-write-conversion is running. */
5636 inhibit_pre_post_conversion = 1;
5637 call2 (coding->pre_write_conversion,
5638 make_number (from), make_number (to));
5639 inhibit_pre_post_conversion = 0;
5640 /* Discard the unwind protect. */
5641 specpdl_ptr--;
5642
5643 if (current_buffer != prev)
5644 {
5645 len = ZV - BEGV;
5646 new = Fcurrent_buffer ();
5647 set_buffer_internal_1 (prev);
5648 del_range_2 (from, from_byte, to, to_byte, 0);
5649 TEMP_SET_PT_BOTH (from, from_byte);
5650 insert_from_buffer (XBUFFER (new), 1, len, 0);
5651 Fkill_buffer (new);
5652 if (orig_point >= to)
5653 orig_point += len - orig_len;
5654 else if (orig_point > from)
5655 orig_point = from;
5656 orig_len = len;
5657 to = from + len;
5658 from_byte = CHAR_TO_BYTE (from);
5659 to_byte = CHAR_TO_BYTE (to);
5660 len_byte = to_byte - from_byte;
5661 TEMP_SET_PT_BOTH (from, from_byte);
5662 }
5663 }
5664
5665 if (replace)
5666 {
5667 if (! EQ (current_buffer->undo_list, Qt))
5668 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5669 else
5670 {
5671 nchars_del = to - from;
5672 nbytes_del = to_byte - from_byte;
5673 }
5674 }
5675
5676 if (coding->composing != COMPOSITION_DISABLED)
5677 {
5678 if (encodep)
5679 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5680 else
5681 coding_allocate_composition_data (coding, from);
5682 }
5683
5684 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5685 if we must run CCL program or there are compositions to
5686 encode. */
5687 if (coding->type != coding_type_ccl
5688 && (! coding->cmp_data || coding->cmp_data->used == 0))
5689 {
5690 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5691
5692 if (from < GPT && GPT < to)
5693 move_gap_both (from, from_byte);
5694 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5695 if (from_byte == to_byte
5696 && (encodep || NILP (coding->post_read_conversion))
5697 && ! CODING_REQUIRE_FLUSHING (coding))
5698 {
5699 coding->produced = len_byte;
5700 coding->produced_char = len;
5701 if (!replace)
5702 /* We must record and adjust for this new text now. */
5703 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5704 coding_free_composition_data (coding);
5705 return 0;
5706 }
5707
5708 head_skip = from_byte - from_byte_orig;
5709 tail_skip = to_byte_orig - to_byte;
5710 total_skip = head_skip + tail_skip;
5711 from += head_skip;
5712 to -= tail_skip;
5713 len -= total_skip; len_byte -= total_skip;
5714 }
5715
5716 /* For conversion, we must put the gap before the text in addition to
5717 making the gap larger for efficient decoding. The required gap
5718 size starts from 2000 which is the magic number used in make_gap.
5719 But, after one batch of conversion, it will be incremented if we
5720 find that it is not enough . */
5721 require = 2000;
5722
5723 if (GAP_SIZE < require)
5724 make_gap (require - GAP_SIZE);
5725 move_gap_both (from, from_byte);
5726
5727 inserted = inserted_byte = 0;
5728
5729 GAP_SIZE += len_byte;
5730 ZV -= len;
5731 Z -= len;
5732 ZV_BYTE -= len_byte;
5733 Z_BYTE -= len_byte;
5734
5735 if (GPT - BEG < BEG_UNCHANGED)
5736 BEG_UNCHANGED = GPT - BEG;
5737 if (Z - GPT < END_UNCHANGED)
5738 END_UNCHANGED = Z - GPT;
5739
5740 if (!encodep && coding->src_multibyte)
5741 {
5742 /* Decoding routines expects that the source text is unibyte.
5743 We must convert 8-bit characters of multibyte form to
5744 unibyte. */
5745 int len_byte_orig = len_byte;
5746 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5747 if (len_byte < len_byte_orig)
5748 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5749 len_byte);
5750 coding->src_multibyte = 0;
5751 }
5752
5753 for (;;)
5754 {
5755 int result;
5756
5757 /* The buffer memory is now:
5758 +--------+converted-text+---------+-------original-text-------+---+
5759 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5760 |<---------------------- GAP ----------------------->| */
5761 src = GAP_END_ADDR - len_byte;
5762 dst = GPT_ADDR + inserted_byte;
5763
5764 if (encodep)
5765 result = encode_coding (coding, src, dst, len_byte, 0);
5766 else
5767 {
5768 if (coding->composing != COMPOSITION_DISABLED)
5769 coding->cmp_data->char_offset = from + inserted;
5770 result = decode_coding (coding, src, dst, len_byte, 0);
5771 }
5772
5773 /* The buffer memory is now:
5774 +--------+-------converted-text----+--+------original-text----+---+
5775 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5776 |<---------------------- GAP ----------------------->| */
5777
5778 inserted += coding->produced_char;
5779 inserted_byte += coding->produced;
5780 len_byte -= coding->consumed;
5781
5782 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5783 {
5784 coding_allocate_composition_data (coding, from + inserted);
5785 continue;
5786 }
5787
5788 src += coding->consumed;
5789 dst += coding->produced;
5790
5791 if (result == CODING_FINISH_NORMAL)
5792 {
5793 src += len_byte;
5794 break;
5795 }
5796 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5797 {
5798 unsigned char *pend = dst, *p = pend - inserted_byte;
5799 Lisp_Object eol_type;
5800
5801 /* Encode LFs back to the original eol format (CR or CRLF). */
5802 if (coding->eol_type == CODING_EOL_CR)
5803 {
5804 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5805 }
5806 else
5807 {
5808 int count = 0;
5809
5810 while (p < pend) if (*p++ == '\n') count++;
5811 if (src - dst < count)
5812 {
5813 /* We don't have sufficient room for encoding LFs
5814 back to CRLF. We must record converted and
5815 not-yet-converted text back to the buffer
5816 content, enlarge the gap, then record them out of
5817 the buffer contents again. */
5818 int add = len_byte + inserted_byte;
5819
5820 GAP_SIZE -= add;
5821 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5822 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5823 make_gap (count - GAP_SIZE);
5824 GAP_SIZE += add;
5825 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5826 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5827 /* Don't forget to update SRC, DST, and PEND. */
5828 src = GAP_END_ADDR - len_byte;
5829 dst = GPT_ADDR + inserted_byte;
5830 pend = dst;
5831 }
5832 inserted += count;
5833 inserted_byte += count;
5834 coding->produced += count;
5835 p = dst = pend + count;
5836 while (count)
5837 {
5838 *--p = *--pend;
5839 if (*p == '\n') count--, *--p = '\r';
5840 }
5841 }
5842
5843 /* Suppress eol-format conversion in the further conversion. */
5844 coding->eol_type = CODING_EOL_LF;
5845
5846 /* Set the coding system symbol to that for Unix-like EOL. */
5847 eol_type = Fget (saved_coding_symbol, Qeol_type);
5848 if (VECTORP (eol_type)
5849 && XVECTOR (eol_type)->size == 3
5850 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5851 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5852 else
5853 coding->symbol = saved_coding_symbol;
5854
5855 continue;
5856 }
5857 if (len_byte <= 0)
5858 {
5859 if (coding->type != coding_type_ccl
5860 || coding->mode & CODING_MODE_LAST_BLOCK)
5861 break;
5862 coding->mode |= CODING_MODE_LAST_BLOCK;
5863 continue;
5864 }
5865 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5866 {
5867 /* The source text ends in invalid codes. Let's just
5868 make them valid buffer contents, and finish conversion. */
5869 if (multibyte_p)
5870 {
5871 unsigned char *start = dst;
5872
5873 inserted += len_byte;
5874 while (len_byte--)
5875 {
5876 int c = *src++;
5877 dst += CHAR_STRING (c, dst);
5878 }
5879
5880 inserted_byte += dst - start;
5881 }
5882 else
5883 {
5884 inserted += len_byte;
5885 inserted_byte += len_byte;
5886 while (len_byte--)
5887 *dst++ = *src++;
5888 }
5889 break;
5890 }
5891 if (result == CODING_FINISH_INTERRUPT)
5892 {
5893 /* The conversion procedure was interrupted by a user. */
5894 break;
5895 }
5896 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5897 if (coding->consumed < 1)
5898 {
5899 /* It's quite strange to require more memory without
5900 consuming any bytes. Perhaps CCL program bug. */
5901 break;
5902 }
5903 if (first)
5904 {
5905 /* We have just done the first batch of conversion which was
5906 stopped because of insufficient gap. Let's reconsider the
5907 required gap size (i.e. SRT - DST) now.
5908
5909 We have converted ORIG bytes (== coding->consumed) into
5910 NEW bytes (coding->produced). To convert the remaining
5911 LEN bytes, we may need REQUIRE bytes of gap, where:
5912 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5913 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5914 Here, we are sure that NEW >= ORIG. */
5915
5916 if (coding->produced <= coding->consumed)
5917 {
5918 /* This happens because of CCL-based coding system with
5919 eol-type CRLF. */
5920 require = 0;
5921 }
5922 else
5923 {
5924 float ratio = coding->produced - coding->consumed;
5925 ratio /= coding->consumed;
5926 require = len_byte * ratio;
5927 }
5928 first = 0;
5929 }
5930 if ((src - dst) < (require + 2000))
5931 {
5932 /* See the comment above the previous call of make_gap. */
5933 int add = len_byte + inserted_byte;
5934
5935 GAP_SIZE -= add;
5936 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5937 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5938 make_gap (require + 2000);
5939 GAP_SIZE += add;
5940 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5941 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5942 }
5943 }
5944 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5945
5946 if (encodep && coding->dst_multibyte)
5947 {
5948 /* The output is unibyte. We must convert 8-bit characters to
5949 multibyte form. */
5950 if (inserted_byte * 2 > GAP_SIZE)
5951 {
5952 GAP_SIZE -= inserted_byte;
5953 ZV += inserted_byte; Z += inserted_byte;
5954 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5955 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5956 make_gap (inserted_byte - GAP_SIZE);
5957 GAP_SIZE += inserted_byte;
5958 ZV -= inserted_byte; Z -= inserted_byte;
5959 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5960 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5961 }
5962 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5963 }
5964
5965 /* If we shrank the conversion area, adjust it now. */
5966 if (total_skip > 0)
5967 {
5968 if (tail_skip > 0)
5969 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5970 inserted += total_skip; inserted_byte += total_skip;
5971 GAP_SIZE += total_skip;
5972 GPT -= head_skip; GPT_BYTE -= head_skip;
5973 ZV -= total_skip; ZV_BYTE -= total_skip;
5974 Z -= total_skip; Z_BYTE -= total_skip;
5975 from -= head_skip; from_byte -= head_skip;
5976 to += tail_skip; to_byte += tail_skip;
5977 }
5978
5979 prev_Z = Z;
5980 if (! EQ (current_buffer->undo_list, Qt))
5981 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5982 else
5983 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5984 inserted, inserted_byte);
5985 inserted = Z - prev_Z;
5986
5987 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5988 coding_restore_composition (coding, Fcurrent_buffer ());
5989 coding_free_composition_data (coding);
5990
5991 if (! inhibit_pre_post_conversion
5992 && ! encodep && ! NILP (coding->post_read_conversion))
5993 {
5994 Lisp_Object val;
5995 Lisp_Object saved_coding_system;
5996
5997 if (from != PT)
5998 TEMP_SET_PT_BOTH (from, from_byte);
5999 prev_Z = Z;
6000 record_unwind_protect (code_convert_region_unwind,
6001 Fcons (Vlast_coding_system_used, Qnil));
6002 saved_coding_system = Vlast_coding_system_used;
6003 Vlast_coding_system_used = coding->symbol;
6004 /* We should not call any more pre-write/post-read-conversion
6005 functions while this post-read-conversion is running. */
6006 inhibit_pre_post_conversion = 1;
6007 val = call1 (coding->post_read_conversion, make_number (inserted));
6008 inhibit_pre_post_conversion = 0;
6009 coding->symbol = Vlast_coding_system_used;
6010 Vlast_coding_system_used = saved_coding_system;
6011 /* Discard the unwind protect. */
6012 specpdl_ptr--;
6013 CHECK_NUMBER (val);
6014 inserted += Z - prev_Z;
6015 }
6016
6017 if (orig_point >= from)
6018 {
6019 if (orig_point >= from + orig_len)
6020 orig_point += inserted - orig_len;
6021 else
6022 orig_point = from;
6023 TEMP_SET_PT (orig_point);
6024 }
6025
6026 if (replace)
6027 {
6028 signal_after_change (from, to - from, inserted);
6029 update_compositions (from, from + inserted, CHECK_BORDER);
6030 }
6031
6032 {
6033 coding->consumed = to_byte - from_byte;
6034 coding->consumed_char = to - from;
6035 coding->produced = inserted_byte;
6036 coding->produced_char = inserted;
6037 }
6038
6039 return 0;
6040 }
6041
6042 /* Name (or base name) of work buffer for code conversion. */
6043 static Lisp_Object Vcode_conversion_workbuf_name;
6044
6045 /* Set the current buffer to the working buffer prepared for
6046 code-conversion. MULTIBYTE specifies the multibyteness of the
6047 buffer. Return the buffer we set if it must be killed after use.
6048 Otherwise return Qnil. */
6049
6050 static Lisp_Object
6051 set_conversion_work_buffer (multibyte)
6052 int multibyte;
6053 {
6054 Lisp_Object buffer, buffer_to_kill;
6055 struct buffer *buf;
6056
6057 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6058 buf = XBUFFER (buffer);
6059 if (buf == current_buffer)
6060 {
6061 /* As we are already in the work buffer, we must generate a new
6062 buffer for the work. */
6063 Lisp_Object name;
6064
6065 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6066 buffer = buffer_to_kill = Fget_buffer_create (name);
6067 buf = XBUFFER (buffer);
6068 }
6069 else
6070 buffer_to_kill = Qnil;
6071
6072 delete_all_overlays (buf);
6073 buf->directory = current_buffer->directory;
6074 buf->read_only = Qnil;
6075 buf->filename = Qnil;
6076 buf->undo_list = Qt;
6077 eassert (buf->overlays_before == NULL);
6078 eassert (buf->overlays_after == NULL);
6079 set_buffer_internal (buf);
6080 if (BEG != BEGV || Z != ZV)
6081 Fwiden ();
6082 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6083 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6084 return buffer_to_kill;
6085 }
6086
6087 Lisp_Object
6088 run_pre_post_conversion_on_str (str, coding, encodep)
6089 Lisp_Object str;
6090 struct coding_system *coding;
6091 int encodep;
6092 {
6093 int count = SPECPDL_INDEX ();
6094 struct gcpro gcpro1, gcpro2;
6095 int multibyte = STRING_MULTIBYTE (str);
6096 Lisp_Object old_deactivate_mark;
6097 Lisp_Object buffer_to_kill;
6098 Lisp_Object unwind_arg;
6099
6100 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6101 /* It is not crucial to specbind this. */
6102 old_deactivate_mark = Vdeactivate_mark;
6103 GCPRO2 (str, old_deactivate_mark);
6104
6105 /* We must insert the contents of STR as is without
6106 unibyte<->multibyte conversion. For that, we adjust the
6107 multibyteness of the working buffer to that of STR. */
6108 buffer_to_kill = set_conversion_work_buffer (multibyte);
6109 if (NILP (buffer_to_kill))
6110 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6111 else
6112 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6113 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6114
6115 insert_from_string (str, 0, 0,
6116 SCHARS (str), SBYTES (str), 0);
6117 UNGCPRO;
6118 inhibit_pre_post_conversion = 1;
6119 if (encodep)
6120 {
6121 struct buffer *prev = current_buffer;
6122
6123 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6124 if (prev != current_buffer)
6125 /* We must kill the current buffer too. */
6126 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6127 }
6128 else
6129 {
6130 Vlast_coding_system_used = coding->symbol;
6131 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6132 call1 (coding->post_read_conversion, make_number (Z - BEG));
6133 coding->symbol = Vlast_coding_system_used;
6134 }
6135 inhibit_pre_post_conversion = 0;
6136 Vdeactivate_mark = old_deactivate_mark;
6137 str = make_buffer_string (BEG, Z, 1);
6138 return unbind_to (count, str);
6139 }
6140
6141
6142 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6143 text in *STR. *SIZE is the allocated bytes for STR. As it
6144 is intended that this function is called from encode_terminal_code,
6145 the pre-write-conversion function is run by safe_call and thus
6146 "Error during redisplay: ..." is logged when an error occurs.
6147
6148 Store the resulting text in *STR and set CODING->produced_char and
6149 CODING->produced to the number of characters and bytes
6150 respectively. If the size of *STR is too small, enlarge it by
6151 xrealloc and update *STR and *SIZE. */
6152
6153 void
6154 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6155 unsigned char **str;
6156 int *size, nchars, nbytes;
6157 struct coding_system *coding;
6158 {
6159 struct gcpro gcpro1, gcpro2;
6160 struct buffer *cur = current_buffer;
6161 struct buffer *prev;
6162 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6163 Lisp_Object args[3];
6164 Lisp_Object buffer_to_kill;
6165
6166 /* It is not crucial to specbind this. */
6167 old_deactivate_mark = Vdeactivate_mark;
6168 old_last_coding_system_used = Vlast_coding_system_used;
6169 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6170
6171 /* We must insert the contents of STR as is without
6172 unibyte<->multibyte conversion. For that, we adjust the
6173 multibyteness of the working buffer to that of STR. */
6174 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6175 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6176 UNGCPRO;
6177 inhibit_pre_post_conversion = 1;
6178 prev = current_buffer;
6179 args[0] = coding->pre_write_conversion;
6180 args[1] = make_number (BEG);
6181 args[2] = make_number (Z);
6182 safe_call (3, args);
6183 inhibit_pre_post_conversion = 0;
6184 Vdeactivate_mark = old_deactivate_mark;
6185 Vlast_coding_system_used = old_last_coding_system_used;
6186 coding->produced_char = Z - BEG;
6187 coding->produced = Z_BYTE - BEG_BYTE;
6188 if (coding->produced > *size)
6189 {
6190 *size = coding->produced;
6191 *str = xrealloc (*str, *size);
6192 }
6193 if (BEG < GPT && GPT < Z)
6194 move_gap (BEG);
6195 bcopy (BEG_ADDR, *str, coding->produced);
6196 coding->src_multibyte
6197 = ! NILP (current_buffer->enable_multibyte_characters);
6198 if (prev != current_buffer)
6199 Fkill_buffer (Fcurrent_buffer ());
6200 set_buffer_internal (cur);
6201 if (! NILP (buffer_to_kill))
6202 Fkill_buffer (buffer_to_kill);
6203 }
6204
6205
6206 Lisp_Object
6207 decode_coding_string (str, coding, nocopy)
6208 Lisp_Object str;
6209 struct coding_system *coding;
6210 int nocopy;
6211 {
6212 int len;
6213 struct conversion_buffer buf;
6214 int from, to_byte;
6215 Lisp_Object saved_coding_symbol;
6216 int result;
6217 int require_decoding;
6218 int shrinked_bytes = 0;
6219 Lisp_Object newstr;
6220 int consumed, consumed_char, produced, produced_char;
6221
6222 from = 0;
6223 to_byte = SBYTES (str);
6224
6225 saved_coding_symbol = coding->symbol;
6226 coding->src_multibyte = STRING_MULTIBYTE (str);
6227 coding->dst_multibyte = 1;
6228 if (CODING_REQUIRE_DETECTION (coding))
6229 {
6230 /* See the comments in code_convert_region. */
6231 if (coding->type == coding_type_undecided)
6232 {
6233 detect_coding (coding, SDATA (str), to_byte);
6234 if (coding->type == coding_type_undecided)
6235 {
6236 coding->type = coding_type_emacs_mule;
6237 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6238 /* As emacs-mule decoder will handle composition, we
6239 need this setting to allocate coding->cmp_data
6240 later. */
6241 coding->composing = COMPOSITION_NO;
6242 }
6243 }
6244 if (coding->eol_type == CODING_EOL_UNDECIDED
6245 && coding->type != coding_type_ccl)
6246 {
6247 saved_coding_symbol = coding->symbol;
6248 detect_eol (coding, SDATA (str), to_byte);
6249 if (coding->eol_type == CODING_EOL_UNDECIDED)
6250 coding->eol_type = CODING_EOL_LF;
6251 /* We had better recover the original eol format if we
6252 encounter an inconsistent eol format while decoding. */
6253 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6254 }
6255 }
6256
6257 if (coding->type == coding_type_no_conversion
6258 || coding->type == coding_type_raw_text)
6259 coding->dst_multibyte = 0;
6260
6261 require_decoding = CODING_REQUIRE_DECODING (coding);
6262
6263 if (STRING_MULTIBYTE (str))
6264 {
6265 /* Decoding routines expect the source text to be unibyte. */
6266 str = Fstring_as_unibyte (str);
6267 to_byte = SBYTES (str);
6268 nocopy = 1;
6269 coding->src_multibyte = 0;
6270 }
6271
6272 /* Try to skip the heading and tailing ASCIIs. */
6273 if (require_decoding && coding->type != coding_type_ccl)
6274 {
6275 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6276 0);
6277 if (from == to_byte)
6278 require_decoding = 0;
6279 shrinked_bytes = from + (SBYTES (str) - to_byte);
6280 }
6281
6282 if (!require_decoding
6283 && !(SYMBOLP (coding->post_read_conversion)
6284 && !NILP (Ffboundp (coding->post_read_conversion))))
6285 {
6286 coding->consumed = SBYTES (str);
6287 coding->consumed_char = SCHARS (str);
6288 if (coding->dst_multibyte)
6289 {
6290 str = Fstring_as_multibyte (str);
6291 nocopy = 1;
6292 }
6293 coding->produced = SBYTES (str);
6294 coding->produced_char = SCHARS (str);
6295 return (nocopy ? str : Fcopy_sequence (str));
6296 }
6297
6298 if (coding->composing != COMPOSITION_DISABLED)
6299 coding_allocate_composition_data (coding, from);
6300 len = decoding_buffer_size (coding, to_byte - from);
6301 allocate_conversion_buffer (buf, len);
6302
6303 consumed = consumed_char = produced = produced_char = 0;
6304 while (1)
6305 {
6306 result = decode_coding (coding, SDATA (str) + from + consumed,
6307 buf.data + produced, to_byte - from - consumed,
6308 buf.size - produced);
6309 consumed += coding->consumed;
6310 consumed_char += coding->consumed_char;
6311 produced += coding->produced;
6312 produced_char += coding->produced_char;
6313 if (result == CODING_FINISH_NORMAL
6314 || result == CODING_FINISH_INTERRUPT
6315 || (result == CODING_FINISH_INSUFFICIENT_SRC
6316 && coding->consumed == 0))
6317 break;
6318 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6319 coding_allocate_composition_data (coding, from + produced_char);
6320 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6321 extend_conversion_buffer (&buf);
6322 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6323 {
6324 Lisp_Object eol_type;
6325
6326 /* Recover the original EOL format. */
6327 if (coding->eol_type == CODING_EOL_CR)
6328 {
6329 unsigned char *p;
6330 for (p = buf.data; p < buf.data + produced; p++)
6331 if (*p == '\n') *p = '\r';
6332 }
6333 else if (coding->eol_type == CODING_EOL_CRLF)
6334 {
6335 int num_eol = 0;
6336 unsigned char *p0, *p1;
6337 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6338 if (*p0 == '\n') num_eol++;
6339 if (produced + num_eol >= buf.size)
6340 extend_conversion_buffer (&buf);
6341 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6342 {
6343 *--p1 = *--p0;
6344 if (*p0 == '\n') *--p1 = '\r';
6345 }
6346 produced += num_eol;
6347 produced_char += num_eol;
6348 }
6349 /* Suppress eol-format conversion in the further conversion. */
6350 coding->eol_type = CODING_EOL_LF;
6351
6352 /* Set the coding system symbol to that for Unix-like EOL. */
6353 eol_type = Fget (saved_coding_symbol, Qeol_type);
6354 if (VECTORP (eol_type)
6355 && XVECTOR (eol_type)->size == 3
6356 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6357 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6358 else
6359 coding->symbol = saved_coding_symbol;
6360
6361
6362 }
6363 }
6364
6365 coding->consumed = consumed;
6366 coding->consumed_char = consumed_char;
6367 coding->produced = produced;
6368 coding->produced_char = produced_char;
6369
6370 if (coding->dst_multibyte)
6371 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6372 produced + shrinked_bytes);
6373 else
6374 newstr = make_uninit_string (produced + shrinked_bytes);
6375 if (from > 0)
6376 STRING_COPYIN (newstr, 0, SDATA (str), from);
6377 STRING_COPYIN (newstr, from, buf.data, produced);
6378 if (shrinked_bytes > from)
6379 STRING_COPYIN (newstr, from + produced,
6380 SDATA (str) + to_byte,
6381 shrinked_bytes - from);
6382 free_conversion_buffer (&buf);
6383
6384 coding->consumed += shrinked_bytes;
6385 coding->consumed_char += shrinked_bytes;
6386 coding->produced += shrinked_bytes;
6387 coding->produced_char += shrinked_bytes;
6388
6389 if (coding->cmp_data && coding->cmp_data->used)
6390 coding_restore_composition (coding, newstr);
6391 coding_free_composition_data (coding);
6392
6393 if (SYMBOLP (coding->post_read_conversion)
6394 && !NILP (Ffboundp (coding->post_read_conversion)))
6395 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6396
6397 return newstr;
6398 }
6399
6400 Lisp_Object
6401 encode_coding_string (str, coding, nocopy)
6402 Lisp_Object str;
6403 struct coding_system *coding;
6404 int nocopy;
6405 {
6406 int len;
6407 struct conversion_buffer buf;
6408 int from, to, to_byte;
6409 int result;
6410 int shrinked_bytes = 0;
6411 Lisp_Object newstr;
6412 int consumed, consumed_char, produced, produced_char;
6413
6414 if (SYMBOLP (coding->pre_write_conversion)
6415 && !NILP (Ffboundp (coding->pre_write_conversion)))
6416 {
6417 str = run_pre_post_conversion_on_str (str, coding, 1);
6418 /* As STR is just newly generated, we don't have to copy it
6419 anymore. */
6420 nocopy = 1;
6421 }
6422
6423 from = 0;
6424 to = SCHARS (str);
6425 to_byte = SBYTES (str);
6426
6427 /* Encoding routines determine the multibyteness of the source text
6428 by coding->src_multibyte. */
6429 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6430 coding->dst_multibyte = 0;
6431 if (! CODING_REQUIRE_ENCODING (coding))
6432 goto no_need_of_encoding;
6433
6434 if (coding->composing != COMPOSITION_DISABLED)
6435 coding_save_composition (coding, from, to, str);
6436
6437 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6438 if we must run CCL program or there are compositions to
6439 encode. */
6440 if (coding->type != coding_type_ccl
6441 && (! coding->cmp_data || coding->cmp_data->used == 0))
6442 {
6443 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6444 1);
6445 if (from == to_byte)
6446 {
6447 coding_free_composition_data (coding);
6448 goto no_need_of_encoding;
6449 }
6450 shrinked_bytes = from + (SBYTES (str) - to_byte);
6451 }
6452
6453 len = encoding_buffer_size (coding, to_byte - from);
6454 allocate_conversion_buffer (buf, len);
6455
6456 consumed = consumed_char = produced = produced_char = 0;
6457 while (1)
6458 {
6459 result = encode_coding (coding, SDATA (str) + from + consumed,
6460 buf.data + produced, to_byte - from - consumed,
6461 buf.size - produced);
6462 consumed += coding->consumed;
6463 consumed_char += coding->consumed_char;
6464 produced += coding->produced;
6465 produced_char += coding->produced_char;
6466 if (result == CODING_FINISH_NORMAL
6467 || result == CODING_FINISH_INTERRUPT
6468 || (result == CODING_FINISH_INSUFFICIENT_SRC
6469 && coding->consumed == 0))
6470 break;
6471 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6472 extend_conversion_buffer (&buf);
6473 }
6474
6475 coding->consumed = consumed;
6476 coding->consumed_char = consumed_char;
6477 coding->produced = produced;
6478 coding->produced_char = produced_char;
6479
6480 newstr = make_uninit_string (produced + shrinked_bytes);
6481 if (from > 0)
6482 STRING_COPYIN (newstr, 0, SDATA (str), from);
6483 STRING_COPYIN (newstr, from, buf.data, produced);
6484 if (shrinked_bytes > from)
6485 STRING_COPYIN (newstr, from + produced,
6486 SDATA (str) + to_byte,
6487 shrinked_bytes - from);
6488
6489 free_conversion_buffer (&buf);
6490 coding_free_composition_data (coding);
6491
6492 return newstr;
6493
6494 no_need_of_encoding:
6495 coding->consumed = SBYTES (str);
6496 coding->consumed_char = SCHARS (str);
6497 if (STRING_MULTIBYTE (str))
6498 {
6499 if (nocopy)
6500 /* We are sure that STR doesn't contain a multibyte
6501 character. */
6502 STRING_SET_UNIBYTE (str);
6503 else
6504 {
6505 str = Fstring_as_unibyte (str);
6506 nocopy = 1;
6507 }
6508 }
6509 coding->produced = SBYTES (str);
6510 coding->produced_char = SCHARS (str);
6511 return (nocopy ? str : Fcopy_sequence (str));
6512 }
6513
6514 \f
6515 #ifdef emacs
6516 /*** 8. Emacs Lisp library functions ***/
6517
6518 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6519 doc: /* Return t if OBJECT is nil or a coding-system.
6520 See the documentation of `make-coding-system' for information
6521 about coding-system objects. */)
6522 (obj)
6523 Lisp_Object obj;
6524 {
6525 if (NILP (obj))
6526 return Qt;
6527 if (!SYMBOLP (obj))
6528 return Qnil;
6529 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6530 return Qt;
6531 /* Get coding-spec vector for OBJ. */
6532 obj = Fget (obj, Qcoding_system);
6533 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6534 ? Qt : Qnil);
6535 }
6536
6537 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6538 Sread_non_nil_coding_system, 1, 1, 0,
6539 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6540 (prompt)
6541 Lisp_Object prompt;
6542 {
6543 Lisp_Object val;
6544 do
6545 {
6546 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6547 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6548 }
6549 while (SCHARS (val) == 0);
6550 return (Fintern (val, Qnil));
6551 }
6552
6553 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6554 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6555 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6556 (prompt, default_coding_system)
6557 Lisp_Object prompt, default_coding_system;
6558 {
6559 Lisp_Object val;
6560 if (SYMBOLP (default_coding_system))
6561 default_coding_system = SYMBOL_NAME (default_coding_system);
6562 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6563 Qt, Qnil, Qcoding_system_history,
6564 default_coding_system, Qnil);
6565 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6566 }
6567
6568 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6569 1, 1, 0,
6570 doc: /* Check validity of CODING-SYSTEM.
6571 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6572 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6573 The value of this property should be a vector of length 5. */)
6574 (coding_system)
6575 Lisp_Object coding_system;
6576 {
6577 Lisp_Object define_form;
6578
6579 define_form = Fget (coding_system, Qcoding_system_define_form);
6580 if (! NILP (define_form))
6581 {
6582 Fput (coding_system, Qcoding_system_define_form, Qnil);
6583 safe_eval (define_form);
6584 }
6585 if (!NILP (Fcoding_system_p (coding_system)))
6586 return coding_system;
6587 xsignal1 (Qcoding_system_error, coding_system);
6588 }
6589 \f
6590 Lisp_Object
6591 detect_coding_system (src, src_bytes, highest, multibytep)
6592 const unsigned char *src;
6593 int src_bytes, highest;
6594 int multibytep;
6595 {
6596 int coding_mask, eol_type;
6597 Lisp_Object val, tmp;
6598 int dummy;
6599
6600 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6601 eol_type = detect_eol_type (src, src_bytes, &dummy);
6602 if (eol_type == CODING_EOL_INCONSISTENT)
6603 eol_type = CODING_EOL_UNDECIDED;
6604
6605 if (!coding_mask)
6606 {
6607 val = Qundecided;
6608 if (eol_type != CODING_EOL_UNDECIDED)
6609 {
6610 Lisp_Object val2;
6611 val2 = Fget (Qundecided, Qeol_type);
6612 if (VECTORP (val2))
6613 val = XVECTOR (val2)->contents[eol_type];
6614 }
6615 return (highest ? val : Fcons (val, Qnil));
6616 }
6617
6618 /* At first, gather possible coding systems in VAL. */
6619 val = Qnil;
6620 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6621 {
6622 Lisp_Object category_val, category_index;
6623
6624 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6625 category_val = Fsymbol_value (XCAR (tmp));
6626 if (!NILP (category_val)
6627 && NATNUMP (category_index)
6628 && (coding_mask & (1 << XFASTINT (category_index))))
6629 {
6630 val = Fcons (category_val, val);
6631 if (highest)
6632 break;
6633 }
6634 }
6635 if (!highest)
6636 val = Fnreverse (val);
6637
6638 /* Then, replace the elements with subsidiary coding systems. */
6639 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6640 {
6641 if (eol_type != CODING_EOL_UNDECIDED
6642 && eol_type != CODING_EOL_INCONSISTENT)
6643 {
6644 Lisp_Object eol;
6645 eol = Fget (XCAR (tmp), Qeol_type);
6646 if (VECTORP (eol))
6647 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6648 }
6649 }
6650 return (highest ? XCAR (val) : val);
6651 }
6652
6653 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6654 2, 3, 0,
6655 doc: /* Detect how the byte sequence in the region is encoded.
6656 Return a list of possible coding systems used on decoding a byte
6657 sequence containing the bytes in the region between START and END when
6658 the coding system `undecided' is specified. The list is ordered by
6659 priority decided in the current language environment.
6660
6661 If only ASCII characters are found (except for such ISO-2022 control
6662 characters ISO-2022 as ESC), it returns a list of single element
6663 `undecided' or its subsidiary coding system according to a detected
6664 end-of-line format.
6665
6666 If optional argument HIGHEST is non-nil, return the coding system of
6667 highest priority. */)
6668 (start, end, highest)
6669 Lisp_Object start, end, highest;
6670 {
6671 int from, to;
6672 int from_byte, to_byte;
6673 int include_anchor_byte = 0;
6674
6675 CHECK_NUMBER_COERCE_MARKER (start);
6676 CHECK_NUMBER_COERCE_MARKER (end);
6677
6678 validate_region (&start, &end);
6679 from = XINT (start), to = XINT (end);
6680 from_byte = CHAR_TO_BYTE (from);
6681 to_byte = CHAR_TO_BYTE (to);
6682
6683 if (from < GPT && to >= GPT)
6684 move_gap_both (to, to_byte);
6685 /* If we an anchor byte `\0' follows the region, we include it in
6686 the detecting source. Then code detectors can handle the tailing
6687 byte sequence more accurately.
6688
6689 Fix me: This is not a perfect solution. It is better that we
6690 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6691 */
6692 if (to == Z || (to == GPT && GAP_SIZE > 0))
6693 include_anchor_byte = 1;
6694 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6695 to_byte - from_byte + include_anchor_byte,
6696 !NILP (highest),
6697 !NILP (current_buffer
6698 ->enable_multibyte_characters));
6699 }
6700
6701 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6702 1, 2, 0,
6703 doc: /* Detect how the byte sequence in STRING is encoded.
6704 Return a list of possible coding systems used on decoding a byte
6705 sequence containing the bytes in STRING when the coding system
6706 `undecided' is specified. The list is ordered by priority decided in
6707 the current language environment.
6708
6709 If only ASCII characters are found (except for such ISO-2022 control
6710 characters ISO-2022 as ESC), it returns a list of single element
6711 `undecided' or its subsidiary coding system according to a detected
6712 end-of-line format.
6713
6714 If optional argument HIGHEST is non-nil, return the coding system of
6715 highest priority. */)
6716 (string, highest)
6717 Lisp_Object string, highest;
6718 {
6719 CHECK_STRING (string);
6720
6721 return detect_coding_system (SDATA (string),
6722 /* "+ 1" is to include the anchor byte
6723 `\0'. With this, code detectors can
6724 handle the tailing bytes more
6725 accurately. */
6726 SBYTES (string) + 1,
6727 !NILP (highest),
6728 STRING_MULTIBYTE (string));
6729 }
6730
6731 /* Subroutine for Ffind_coding_systems_region_internal.
6732
6733 Return a list of coding systems that safely encode the multibyte
6734 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6735 possible coding systems. If it is nil, it means that we have not
6736 yet found any coding systems.
6737
6738 WORK_TABLE a char-table of which element is set to t once the
6739 element is looked up.
6740
6741 If a non-ASCII single byte char is found, set
6742 *single_byte_char_found to 1. */
6743
6744 static Lisp_Object
6745 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6746 unsigned char *p, *pend;
6747 Lisp_Object safe_codings, work_table;
6748 int *single_byte_char_found;
6749 {
6750 int c, len;
6751 Lisp_Object val, ch;
6752 Lisp_Object prev, tail;
6753
6754 if (NILP (safe_codings))
6755 goto done_safe_codings;
6756 while (p < pend)
6757 {
6758 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6759 p += len;
6760 if (ASCII_BYTE_P (c))
6761 /* We can ignore ASCII characters here. */
6762 continue;
6763 if (SINGLE_BYTE_CHAR_P (c))
6764 *single_byte_char_found = 1;
6765 /* Check the safe coding systems for C. */
6766 ch = make_number (c);
6767 val = Faref (work_table, ch);
6768 if (EQ (val, Qt))
6769 /* This element was already checked. Ignore it. */
6770 continue;
6771 /* Remember that we checked this element. */
6772 Faset (work_table, ch, Qt);
6773
6774 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6775 {
6776 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6777 int encodable;
6778
6779 elt = XCAR (tail);
6780 if (CONSP (XCDR (elt)))
6781 {
6782 /* This entry has this format now:
6783 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6784 ACCEPT-LATIN-EXTRA ) */
6785 val = XCDR (elt);
6786 encodable = ! NILP (Faref (XCAR (val), ch));
6787 if (! encodable)
6788 {
6789 val = XCDR (val);
6790 translation_table = XCAR (val);
6791 hash_table = XCAR (XCDR (val));
6792 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6793 }
6794 }
6795 else
6796 {
6797 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6798 encodable = ! NILP (Faref (XCDR (elt), ch));
6799 if (! encodable)
6800 {
6801 /* Transform the format to:
6802 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6803 ACCEPT-LATIN-EXTRA ) */
6804 val = Fget (XCAR (elt), Qcoding_system);
6805 translation_table
6806 = Fplist_get (AREF (val, 3),
6807 Qtranslation_table_for_encode);
6808 if (SYMBOLP (translation_table))
6809 translation_table = Fget (translation_table,
6810 Qtranslation_table);
6811 hash_table
6812 = (CHAR_TABLE_P (translation_table)
6813 ? XCHAR_TABLE (translation_table)->extras[1]
6814 : Qnil);
6815 accept_latin_extra
6816 = ((EQ (AREF (val, 0), make_number (2))
6817 && VECTORP (AREF (val, 4)))
6818 ? AREF (AREF (val, 4), 16)
6819 : Qnil);
6820 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6821 translation_table, hash_table,
6822 accept_latin_extra));
6823 }
6824 }
6825
6826 if (! encodable
6827 && ((CHAR_TABLE_P (translation_table)
6828 && ! NILP (Faref (translation_table, ch)))
6829 || (HASH_TABLE_P (hash_table)
6830 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6831 || (SINGLE_BYTE_CHAR_P (c)
6832 && ! NILP (accept_latin_extra)
6833 && VECTORP (Vlatin_extra_code_table)
6834 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6835 encodable = 1;
6836 if (encodable)
6837 prev = tail;
6838 else
6839 {
6840 /* Exclude this coding system from SAFE_CODINGS. */
6841 if (EQ (tail, safe_codings))
6842 {
6843 safe_codings = XCDR (safe_codings);
6844 if (NILP (safe_codings))
6845 goto done_safe_codings;
6846 }
6847 else
6848 XSETCDR (prev, XCDR (tail));
6849 }
6850 }
6851 }
6852
6853 done_safe_codings:
6854 /* If the above loop was terminated before P reaches PEND, it means
6855 SAFE_CODINGS was set to nil. If we have not yet found an
6856 non-ASCII single-byte char, check it now. */
6857 if (! *single_byte_char_found)
6858 while (p < pend)
6859 {
6860 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6861 p += len;
6862 if (! ASCII_BYTE_P (c)
6863 && SINGLE_BYTE_CHAR_P (c))
6864 {
6865 *single_byte_char_found = 1;
6866 break;
6867 }
6868 }
6869 return safe_codings;
6870 }
6871
6872 DEFUN ("find-coding-systems-region-internal",
6873 Ffind_coding_systems_region_internal,
6874 Sfind_coding_systems_region_internal, 2, 2, 0,
6875 doc: /* Internal use only. */)
6876 (start, end)
6877 Lisp_Object start, end;
6878 {
6879 Lisp_Object work_table, safe_codings;
6880 int non_ascii_p = 0;
6881 int single_byte_char_found = 0;
6882 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6883
6884 if (STRINGP (start))
6885 {
6886 if (!STRING_MULTIBYTE (start))
6887 return Qt;
6888 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6889 p2 = p2end = p1end;
6890 if (SCHARS (start) != SBYTES (start))
6891 non_ascii_p = 1;
6892 }
6893 else
6894 {
6895 int from, to, stop;
6896
6897 CHECK_NUMBER_COERCE_MARKER (start);
6898 CHECK_NUMBER_COERCE_MARKER (end);
6899 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6900 args_out_of_range (start, end);
6901 if (NILP (current_buffer->enable_multibyte_characters))
6902 return Qt;
6903 from = CHAR_TO_BYTE (XINT (start));
6904 to = CHAR_TO_BYTE (XINT (end));
6905 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6906 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6907 if (stop == to)
6908 p2 = p2end = p1end;
6909 else
6910 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6911 if (XINT (end) - XINT (start) != to - from)
6912 non_ascii_p = 1;
6913 }
6914
6915 if (!non_ascii_p)
6916 {
6917 /* We are sure that the text contains no multibyte character.
6918 Check if it contains eight-bit-graphic. */
6919 p = p1;
6920 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6921 if (p == p1end)
6922 {
6923 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6924 if (p == p2end)
6925 return Qt;
6926 }
6927 }
6928
6929 /* The text contains non-ASCII characters. */
6930
6931 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6932 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6933
6934 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6935 &single_byte_char_found);
6936 if (p2 < p2end)
6937 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6938 &single_byte_char_found);
6939 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6940 safe_codings = Qt;
6941 else
6942 {
6943 /* Turn safe_codings to a list of coding systems... */
6944 Lisp_Object val;
6945
6946 if (single_byte_char_found)
6947 /* ... and append these for eight-bit chars. */
6948 val = Fcons (Qraw_text,
6949 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6950 else
6951 /* ... and append generic coding systems. */
6952 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6953
6954 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6955 val = Fcons (XCAR (XCAR (safe_codings)), val);
6956 safe_codings = val;
6957 }
6958
6959 return safe_codings;
6960 }
6961
6962
6963 /* Search from position POS for such characters that are unencodable
6964 accoding to SAFE_CHARS, and return a list of their positions. P
6965 points where in the memory the character at POS exists. Limit the
6966 search at PEND or when Nth unencodable characters are found.
6967
6968 If SAFE_CHARS is a char table, an element for an unencodable
6969 character is nil.
6970
6971 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6972
6973 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6974 eight-bit-graphic characters are unencodable. */
6975
6976 static Lisp_Object
6977 unencodable_char_position (safe_chars, pos, p, pend, n)
6978 Lisp_Object safe_chars;
6979 int pos;
6980 unsigned char *p, *pend;
6981 int n;
6982 {
6983 Lisp_Object pos_list;
6984
6985 pos_list = Qnil;
6986 while (p < pend)
6987 {
6988 int len;
6989 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6990
6991 if (c >= 128
6992 && (CHAR_TABLE_P (safe_chars)
6993 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6994 : (NILP (safe_chars) || c < 256)))
6995 {
6996 pos_list = Fcons (make_number (pos), pos_list);
6997 if (--n <= 0)
6998 break;
6999 }
7000 pos++;
7001 p += len;
7002 }
7003 return Fnreverse (pos_list);
7004 }
7005
7006
7007 DEFUN ("unencodable-char-position", Funencodable_char_position,
7008 Sunencodable_char_position, 3, 5, 0,
7009 doc: /*
7010 Return position of first un-encodable character in a region.
7011 START and END specfiy the region and CODING-SYSTEM specifies the
7012 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7013
7014 If optional 4th argument COUNT is non-nil, it specifies at most how
7015 many un-encodable characters to search. In this case, the value is a
7016 list of positions.
7017
7018 If optional 5th argument STRING is non-nil, it is a string to search
7019 for un-encodable characters. In that case, START and END are indexes
7020 to the string. */)
7021 (start, end, coding_system, count, string)
7022 Lisp_Object start, end, coding_system, count, string;
7023 {
7024 int n;
7025 Lisp_Object safe_chars;
7026 struct coding_system coding;
7027 Lisp_Object positions;
7028 int from, to;
7029 unsigned char *p, *pend;
7030
7031 if (NILP (string))
7032 {
7033 validate_region (&start, &end);
7034 from = XINT (start);
7035 to = XINT (end);
7036 if (NILP (current_buffer->enable_multibyte_characters))
7037 return Qnil;
7038 p = CHAR_POS_ADDR (from);
7039 if (to == GPT)
7040 pend = GPT_ADDR;
7041 else
7042 pend = CHAR_POS_ADDR (to);
7043 }
7044 else
7045 {
7046 CHECK_STRING (string);
7047 CHECK_NATNUM (start);
7048 CHECK_NATNUM (end);
7049 from = XINT (start);
7050 to = XINT (end);
7051 if (from > to
7052 || to > SCHARS (string))
7053 args_out_of_range_3 (string, start, end);
7054 if (! STRING_MULTIBYTE (string))
7055 return Qnil;
7056 p = SDATA (string) + string_char_to_byte (string, from);
7057 pend = SDATA (string) + string_char_to_byte (string, to);
7058 }
7059
7060 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7061
7062 if (NILP (count))
7063 n = 1;
7064 else
7065 {
7066 CHECK_NATNUM (count);
7067 n = XINT (count);
7068 }
7069
7070 if (coding.type == coding_type_no_conversion
7071 || coding.type == coding_type_raw_text)
7072 return Qnil;
7073
7074 if (coding.type == coding_type_undecided)
7075 safe_chars = Qnil;
7076 else
7077 safe_chars = coding_safe_chars (coding_system);
7078
7079 if (STRINGP (string)
7080 || from >= GPT || to <= GPT)
7081 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7082 else
7083 {
7084 Lisp_Object args[2];
7085
7086 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7087 n -= XINT (Flength (args[0]));
7088 if (n <= 0)
7089 positions = args[0];
7090 else
7091 {
7092 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7093 pend, n);
7094 positions = Fappend (2, args);
7095 }
7096 }
7097
7098 return (NILP (count) ? Fcar (positions) : positions);
7099 }
7100
7101
7102 Lisp_Object
7103 code_convert_region1 (start, end, coding_system, encodep)
7104 Lisp_Object start, end, coding_system;
7105 int encodep;
7106 {
7107 struct coding_system coding;
7108 int from, to;
7109
7110 CHECK_NUMBER_COERCE_MARKER (start);
7111 CHECK_NUMBER_COERCE_MARKER (end);
7112 CHECK_SYMBOL (coding_system);
7113
7114 validate_region (&start, &end);
7115 from = XFASTINT (start);
7116 to = XFASTINT (end);
7117
7118 if (NILP (coding_system))
7119 return make_number (to - from);
7120
7121 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7122 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7123
7124 coding.mode |= CODING_MODE_LAST_BLOCK;
7125 coding.src_multibyte = coding.dst_multibyte
7126 = !NILP (current_buffer->enable_multibyte_characters);
7127 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7128 &coding, encodep, 1);
7129 Vlast_coding_system_used = coding.symbol;
7130 return make_number (coding.produced_char);
7131 }
7132
7133 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7134 3, 3, "r\nzCoding system: ",
7135 doc: /* Decode the current region from the specified coding system.
7136 When called from a program, takes three arguments:
7137 START, END, and CODING-SYSTEM. START and END are buffer positions.
7138 This function sets `last-coding-system-used' to the precise coding system
7139 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7140 not fully specified.)
7141 It returns the length of the decoded text. */)
7142 (start, end, coding_system)
7143 Lisp_Object start, end, coding_system;
7144 {
7145 return code_convert_region1 (start, end, coding_system, 0);
7146 }
7147
7148 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7149 3, 3, "r\nzCoding system: ",
7150 doc: /* Encode the current region into the specified coding system.
7151 When called from a program, takes three arguments:
7152 START, END, and CODING-SYSTEM. START and END are buffer positions.
7153 This function sets `last-coding-system-used' to the precise coding system
7154 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7155 not fully specified.)
7156 It returns the length of the encoded text. */)
7157 (start, end, coding_system)
7158 Lisp_Object start, end, coding_system;
7159 {
7160 return code_convert_region1 (start, end, coding_system, 1);
7161 }
7162
7163 Lisp_Object
7164 code_convert_string1 (string, coding_system, nocopy, encodep)
7165 Lisp_Object string, coding_system, nocopy;
7166 int encodep;
7167 {
7168 struct coding_system coding;
7169
7170 CHECK_STRING (string);
7171 CHECK_SYMBOL (coding_system);
7172
7173 if (NILP (coding_system))
7174 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7175
7176 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7177 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7178
7179 coding.mode |= CODING_MODE_LAST_BLOCK;
7180 string = (encodep
7181 ? encode_coding_string (string, &coding, !NILP (nocopy))
7182 : decode_coding_string (string, &coding, !NILP (nocopy)));
7183 Vlast_coding_system_used = coding.symbol;
7184
7185 return string;
7186 }
7187
7188 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7189 2, 3, 0,
7190 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7191 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7192 if the decoding operation is trivial.
7193 This function sets `last-coding-system-used' to the precise coding system
7194 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7195 not fully specified.) */)
7196 (string, coding_system, nocopy)
7197 Lisp_Object string, coding_system, nocopy;
7198 {
7199 return code_convert_string1 (string, coding_system, nocopy, 0);
7200 }
7201
7202 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7203 2, 3, 0,
7204 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7205 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7206 if the encoding operation is trivial.
7207 This function sets `last-coding-system-used' to the precise coding system
7208 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7209 not fully specified.) */)
7210 (string, coding_system, nocopy)
7211 Lisp_Object string, coding_system, nocopy;
7212 {
7213 return code_convert_string1 (string, coding_system, nocopy, 1);
7214 }
7215
7216 /* Encode or decode STRING according to CODING_SYSTEM.
7217 Do not set Vlast_coding_system_used.
7218
7219 This function is called only from macros DECODE_FILE and
7220 ENCODE_FILE, thus we ignore character composition. */
7221
7222 Lisp_Object
7223 code_convert_string_norecord (string, coding_system, encodep)
7224 Lisp_Object string, coding_system;
7225 int encodep;
7226 {
7227 struct coding_system coding;
7228
7229 CHECK_STRING (string);
7230 CHECK_SYMBOL (coding_system);
7231
7232 if (NILP (coding_system))
7233 return string;
7234
7235 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7236 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7237
7238 coding.composing = COMPOSITION_DISABLED;
7239 coding.mode |= CODING_MODE_LAST_BLOCK;
7240 return (encodep
7241 ? encode_coding_string (string, &coding, 1)
7242 : decode_coding_string (string, &coding, 1));
7243 }
7244 \f
7245 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7246 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7247 Return the corresponding character. */)
7248 (code)
7249 Lisp_Object code;
7250 {
7251 unsigned char c1, c2, s1, s2;
7252 Lisp_Object val;
7253
7254 CHECK_NUMBER (code);
7255 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7256 if (s1 == 0)
7257 {
7258 if (s2 < 0x80)
7259 XSETFASTINT (val, s2);
7260 else if (s2 >= 0xA0 || s2 <= 0xDF)
7261 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7262 else
7263 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7264 }
7265 else
7266 {
7267 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7268 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7269 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7270 DECODE_SJIS (s1, s2, c1, c2);
7271 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7272 }
7273 return val;
7274 }
7275
7276 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7277 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7278 Return the corresponding code in SJIS. */)
7279 (ch)
7280 Lisp_Object ch;
7281 {
7282 int charset, c1, c2, s1, s2;
7283 Lisp_Object val;
7284
7285 CHECK_NUMBER (ch);
7286 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7287 if (charset == CHARSET_ASCII)
7288 {
7289 val = ch;
7290 }
7291 else if (charset == charset_jisx0208
7292 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7293 {
7294 ENCODE_SJIS (c1, c2, s1, s2);
7295 XSETFASTINT (val, (s1 << 8) | s2);
7296 }
7297 else if (charset == charset_katakana_jisx0201
7298 && c1 > 0x20 && c2 < 0xE0)
7299 {
7300 XSETFASTINT (val, c1 | 0x80);
7301 }
7302 else
7303 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7304 return val;
7305 }
7306
7307 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7308 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7309 Return the corresponding character. */)
7310 (code)
7311 Lisp_Object code;
7312 {
7313 int charset;
7314 unsigned char b1, b2, c1, c2;
7315 Lisp_Object val;
7316
7317 CHECK_NUMBER (code);
7318 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7319 if (b1 == 0)
7320 {
7321 if (b2 >= 0x80)
7322 error ("Invalid BIG5 code: %x", XFASTINT (code));
7323 val = code;
7324 }
7325 else
7326 {
7327 if ((b1 < 0xA1 || b1 > 0xFE)
7328 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7329 error ("Invalid BIG5 code: %x", XFASTINT (code));
7330 DECODE_BIG5 (b1, b2, charset, c1, c2);
7331 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7332 }
7333 return val;
7334 }
7335
7336 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7337 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7338 Return the corresponding character code in Big5. */)
7339 (ch)
7340 Lisp_Object ch;
7341 {
7342 int charset, c1, c2, b1, b2;
7343 Lisp_Object val;
7344
7345 CHECK_NUMBER (ch);
7346 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7347 if (charset == CHARSET_ASCII)
7348 {
7349 val = ch;
7350 }
7351 else if ((charset == charset_big5_1
7352 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7353 || (charset == charset_big5_2
7354 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7355 {
7356 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7357 XSETFASTINT (val, (b1 << 8) | b2);
7358 }
7359 else
7360 error ("Can't encode to Big5: %d", XFASTINT (ch));
7361 return val;
7362 }
7363 \f
7364 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7365 Sset_terminal_coding_system_internal, 1, 1, 0,
7366 doc: /* Internal use only. */)
7367 (coding_system)
7368 Lisp_Object coding_system;
7369 {
7370 CHECK_SYMBOL (coding_system);
7371 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7372 /* We had better not send unsafe characters to terminal. */
7373 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7374 /* Character composition should be disabled. */
7375 terminal_coding.composing = COMPOSITION_DISABLED;
7376 /* Error notification should be suppressed. */
7377 terminal_coding.suppress_error = 1;
7378 terminal_coding.src_multibyte = 1;
7379 terminal_coding.dst_multibyte = 0;
7380 return Qnil;
7381 }
7382
7383 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7384 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7385 doc: /* Internal use only. */)
7386 (coding_system)
7387 Lisp_Object coding_system;
7388 {
7389 CHECK_SYMBOL (coding_system);
7390 setup_coding_system (Fcheck_coding_system (coding_system),
7391 &safe_terminal_coding);
7392 /* Character composition should be disabled. */
7393 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7394 /* Error notification should be suppressed. */
7395 safe_terminal_coding.suppress_error = 1;
7396 safe_terminal_coding.src_multibyte = 1;
7397 safe_terminal_coding.dst_multibyte = 0;
7398 return Qnil;
7399 }
7400
7401 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7402 Sterminal_coding_system, 0, 0, 0,
7403 doc: /* Return coding system specified for terminal output. */)
7404 ()
7405 {
7406 return terminal_coding.symbol;
7407 }
7408
7409 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7410 Sset_keyboard_coding_system_internal, 1, 1, 0,
7411 doc: /* Internal use only. */)
7412 (coding_system)
7413 Lisp_Object coding_system;
7414 {
7415 CHECK_SYMBOL (coding_system);
7416 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7417 /* Character composition should be disabled. */
7418 keyboard_coding.composing = COMPOSITION_DISABLED;
7419 return Qnil;
7420 }
7421
7422 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7423 Skeyboard_coding_system, 0, 0, 0,
7424 doc: /* Return coding system specified for decoding keyboard input. */)
7425 ()
7426 {
7427 return keyboard_coding.symbol;
7428 }
7429
7430 \f
7431 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7432 Sfind_operation_coding_system, 1, MANY, 0,
7433 doc: /* Choose a coding system for an operation based on the target name.
7434 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7435 DECODING-SYSTEM is the coding system to use for decoding
7436 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7437 for encoding (in case OPERATION does encoding).
7438
7439 The first argument OPERATION specifies an I/O primitive:
7440 For file I/O, `insert-file-contents' or `write-region'.
7441 For process I/O, `call-process', `call-process-region', or `start-process'.
7442 For network I/O, `open-network-stream'.
7443
7444 The remaining arguments should be the same arguments that were passed
7445 to the primitive. Depending on which primitive, one of those arguments
7446 is selected as the TARGET. For example, if OPERATION does file I/O,
7447 whichever argument specifies the file name is TARGET.
7448
7449 TARGET has a meaning which depends on OPERATION:
7450 For file I/O, TARGET is a file name (except for the special case below).
7451 For process I/O, TARGET is a process name.
7452 For network I/O, TARGET is a service name or a port number
7453
7454 This function looks up what specified for TARGET in,
7455 `file-coding-system-alist', `process-coding-system-alist',
7456 or `network-coding-system-alist' depending on OPERATION.
7457 They may specify a coding system, a cons of coding systems,
7458 or a function symbol to call.
7459 In the last case, we call the function with one argument,
7460 which is a list of all the arguments given to this function.
7461
7462 If OPERATION is `insert-file-contents', the argument corresponding to
7463 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7464 file name to look up, and BUFFER is a buffer that contains the file's
7465 contents (not yet decoded). If `file-coding-system-alist' specifies a
7466 function to call for FILENAME, that function should examine the
7467 contents of BUFFER instead of reading the file.
7468
7469 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7470 (nargs, args)
7471 int nargs;
7472 Lisp_Object *args;
7473 {
7474 Lisp_Object operation, target_idx, target, val;
7475 register Lisp_Object chain;
7476
7477 if (nargs < 2)
7478 error ("Too few arguments");
7479 operation = args[0];
7480 if (!SYMBOLP (operation)
7481 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7482 error ("Invalid first argument");
7483 if (nargs < 1 + XINT (target_idx))
7484 error ("Too few arguments for operation: %s",
7485 SDATA (SYMBOL_NAME (operation)));
7486 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7487 argument to write-region) is string, it must be treated as a
7488 target file name. */
7489 if (EQ (operation, Qwrite_region)
7490 && nargs > 5
7491 && STRINGP (args[5]))
7492 target_idx = make_number (4);
7493 target = args[XINT (target_idx) + 1];
7494 if (!(STRINGP (target)
7495 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7496 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7497 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7498 error ("Invalid argument %d", XINT (target_idx) + 1);
7499 if (CONSP (target))
7500 target = XCAR (target);
7501
7502 chain = ((EQ (operation, Qinsert_file_contents)
7503 || EQ (operation, Qwrite_region))
7504 ? Vfile_coding_system_alist
7505 : (EQ (operation, Qopen_network_stream)
7506 ? Vnetwork_coding_system_alist
7507 : Vprocess_coding_system_alist));
7508 if (NILP (chain))
7509 return Qnil;
7510
7511 for (; CONSP (chain); chain = XCDR (chain))
7512 {
7513 Lisp_Object elt;
7514 elt = XCAR (chain);
7515
7516 if (CONSP (elt)
7517 && ((STRINGP (target)
7518 && STRINGP (XCAR (elt))
7519 && fast_string_match (XCAR (elt), target) >= 0)
7520 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7521 {
7522 val = XCDR (elt);
7523 /* Here, if VAL is both a valid coding system and a valid
7524 function symbol, we return VAL as a coding system. */
7525 if (CONSP (val))
7526 return val;
7527 if (! SYMBOLP (val))
7528 return Qnil;
7529 if (! NILP (Fcoding_system_p (val)))
7530 return Fcons (val, val);
7531 if (! NILP (Ffboundp (val)))
7532 {
7533 /* We use call1 rather than safe_call1
7534 so as to get bug reports about functions called here
7535 which don't handle the current interface. */
7536 val = call1 (val, Flist (nargs, args));
7537 if (CONSP (val))
7538 return val;
7539 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7540 return Fcons (val, val);
7541 }
7542 return Qnil;
7543 }
7544 }
7545 return Qnil;
7546 }
7547
7548 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7549 Supdate_coding_systems_internal, 0, 0, 0,
7550 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7551 When values of any coding categories are changed, you must
7552 call this function. */)
7553 ()
7554 {
7555 int i;
7556
7557 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7558 {
7559 Lisp_Object val;
7560
7561 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7562 if (!NILP (val))
7563 {
7564 if (! coding_system_table[i])
7565 coding_system_table[i] = ((struct coding_system *)
7566 xmalloc (sizeof (struct coding_system)));
7567 setup_coding_system (val, coding_system_table[i]);
7568 }
7569 else if (coding_system_table[i])
7570 {
7571 xfree (coding_system_table[i]);
7572 coding_system_table[i] = NULL;
7573 }
7574 }
7575
7576 return Qnil;
7577 }
7578
7579 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7580 Sset_coding_priority_internal, 0, 0, 0,
7581 doc: /* Update internal database for the current value of `coding-category-list'.
7582 This function is internal use only. */)
7583 ()
7584 {
7585 int i = 0, idx;
7586 Lisp_Object val;
7587
7588 val = Vcoding_category_list;
7589
7590 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7591 {
7592 if (! SYMBOLP (XCAR (val)))
7593 break;
7594 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7595 if (idx >= CODING_CATEGORY_IDX_MAX)
7596 break;
7597 coding_priorities[i++] = (1 << idx);
7598 val = XCDR (val);
7599 }
7600 /* If coding-category-list is valid and contains all coding
7601 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7602 the following code saves Emacs from crashing. */
7603 while (i < CODING_CATEGORY_IDX_MAX)
7604 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7605
7606 return Qnil;
7607 }
7608
7609 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7610 Sdefine_coding_system_internal, 1, 1, 0,
7611 doc: /* Register CODING-SYSTEM as a base coding system.
7612 This function is internal use only. */)
7613 (coding_system)
7614 Lisp_Object coding_system;
7615 {
7616 Lisp_Object safe_chars, slot;
7617
7618 if (NILP (Fcheck_coding_system (coding_system)))
7619 xsignal1 (Qcoding_system_error, coding_system);
7620
7621 safe_chars = coding_safe_chars (coding_system);
7622 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7623 error ("No valid safe-chars property for %s",
7624 SDATA (SYMBOL_NAME (coding_system)));
7625
7626 if (EQ (safe_chars, Qt))
7627 {
7628 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7629 XSETCAR (Vcoding_system_safe_chars,
7630 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7631 }
7632 else
7633 {
7634 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7635 if (NILP (slot))
7636 XSETCDR (Vcoding_system_safe_chars,
7637 nconc2 (XCDR (Vcoding_system_safe_chars),
7638 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7639 else
7640 XSETCDR (slot, safe_chars);
7641 }
7642 return Qnil;
7643 }
7644
7645 #endif /* emacs */
7646
7647 \f
7648 /*** 9. Post-amble ***/
7649
7650 void
7651 init_coding_once ()
7652 {
7653 int i;
7654
7655 /* Emacs' internal format specific initialize routine. */
7656 for (i = 0; i <= 0x20; i++)
7657 emacs_code_class[i] = EMACS_control_code;
7658 emacs_code_class[0x0A] = EMACS_linefeed_code;
7659 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7660 for (i = 0x21 ; i < 0x7F; i++)
7661 emacs_code_class[i] = EMACS_ascii_code;
7662 emacs_code_class[0x7F] = EMACS_control_code;
7663 for (i = 0x80; i < 0xFF; i++)
7664 emacs_code_class[i] = EMACS_invalid_code;
7665 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7666 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7667 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7668 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7669
7670 /* ISO2022 specific initialize routine. */
7671 for (i = 0; i < 0x20; i++)
7672 iso_code_class[i] = ISO_control_0;
7673 for (i = 0x21; i < 0x7F; i++)
7674 iso_code_class[i] = ISO_graphic_plane_0;
7675 for (i = 0x80; i < 0xA0; i++)
7676 iso_code_class[i] = ISO_control_1;
7677 for (i = 0xA1; i < 0xFF; i++)
7678 iso_code_class[i] = ISO_graphic_plane_1;
7679 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7680 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7681 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7682 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7683 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7684 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7685 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7686 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7687 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7688 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7689
7690 setup_coding_system (Qnil, &keyboard_coding);
7691 setup_coding_system (Qnil, &terminal_coding);
7692 setup_coding_system (Qnil, &safe_terminal_coding);
7693 setup_coding_system (Qnil, &default_buffer_file_coding);
7694
7695 bzero (coding_system_table, sizeof coding_system_table);
7696
7697 bzero (ascii_skip_code, sizeof ascii_skip_code);
7698 for (i = 0; i < 128; i++)
7699 ascii_skip_code[i] = 1;
7700
7701 #if defined (MSDOS) || defined (WINDOWSNT)
7702 system_eol_type = CODING_EOL_CRLF;
7703 #else
7704 system_eol_type = CODING_EOL_LF;
7705 #endif
7706
7707 inhibit_pre_post_conversion = 0;
7708 }
7709
7710 #ifdef emacs
7711
7712 void
7713 syms_of_coding ()
7714 {
7715 staticpro (&Vcode_conversion_workbuf_name);
7716 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7717
7718 Qtarget_idx = intern ("target-idx");
7719 staticpro (&Qtarget_idx);
7720
7721 Qcoding_system_history = intern ("coding-system-history");
7722 staticpro (&Qcoding_system_history);
7723 Fset (Qcoding_system_history, Qnil);
7724
7725 /* Target FILENAME is the first argument. */
7726 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7727 /* Target FILENAME is the third argument. */
7728 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7729
7730 Qcall_process = intern ("call-process");
7731 staticpro (&Qcall_process);
7732 /* Target PROGRAM is the first argument. */
7733 Fput (Qcall_process, Qtarget_idx, make_number (0));
7734
7735 Qcall_process_region = intern ("call-process-region");
7736 staticpro (&Qcall_process_region);
7737 /* Target PROGRAM is the third argument. */
7738 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7739
7740 Qstart_process = intern ("start-process");
7741 staticpro (&Qstart_process);
7742 /* Target PROGRAM is the third argument. */
7743 Fput (Qstart_process, Qtarget_idx, make_number (2));
7744
7745 Qopen_network_stream = intern ("open-network-stream");
7746 staticpro (&Qopen_network_stream);
7747 /* Target SERVICE is the fourth argument. */
7748 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7749
7750 Qcoding_system = intern ("coding-system");
7751 staticpro (&Qcoding_system);
7752
7753 Qeol_type = intern ("eol-type");
7754 staticpro (&Qeol_type);
7755
7756 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7757 staticpro (&Qbuffer_file_coding_system);
7758
7759 Qpost_read_conversion = intern ("post-read-conversion");
7760 staticpro (&Qpost_read_conversion);
7761
7762 Qpre_write_conversion = intern ("pre-write-conversion");
7763 staticpro (&Qpre_write_conversion);
7764
7765 Qno_conversion = intern ("no-conversion");
7766 staticpro (&Qno_conversion);
7767
7768 Qundecided = intern ("undecided");
7769 staticpro (&Qundecided);
7770
7771 Qcoding_system_p = intern ("coding-system-p");
7772 staticpro (&Qcoding_system_p);
7773
7774 Qcoding_system_error = intern ("coding-system-error");
7775 staticpro (&Qcoding_system_error);
7776
7777 Fput (Qcoding_system_error, Qerror_conditions,
7778 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7779 Fput (Qcoding_system_error, Qerror_message,
7780 build_string ("Invalid coding system"));
7781
7782 Qcoding_category = intern ("coding-category");
7783 staticpro (&Qcoding_category);
7784 Qcoding_category_index = intern ("coding-category-index");
7785 staticpro (&Qcoding_category_index);
7786
7787 Vcoding_category_table
7788 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7789 staticpro (&Vcoding_category_table);
7790 {
7791 int i;
7792 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7793 {
7794 XVECTOR (Vcoding_category_table)->contents[i]
7795 = intern (coding_category_name[i]);
7796 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7797 Qcoding_category_index, make_number (i));
7798 }
7799 }
7800
7801 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7802 staticpro (&Vcoding_system_safe_chars);
7803
7804 Qtranslation_table = intern ("translation-table");
7805 staticpro (&Qtranslation_table);
7806 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7807
7808 Qtranslation_table_id = intern ("translation-table-id");
7809 staticpro (&Qtranslation_table_id);
7810
7811 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7812 staticpro (&Qtranslation_table_for_decode);
7813
7814 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7815 staticpro (&Qtranslation_table_for_encode);
7816
7817 Qsafe_chars = intern ("safe-chars");
7818 staticpro (&Qsafe_chars);
7819
7820 Qchar_coding_system = intern ("char-coding-system");
7821 staticpro (&Qchar_coding_system);
7822
7823 /* Intern this now in case it isn't already done.
7824 Setting this variable twice is harmless.
7825 But don't staticpro it here--that is done in alloc.c. */
7826 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7827 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7828 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7829
7830 Qvalid_codes = intern ("valid-codes");
7831 staticpro (&Qvalid_codes);
7832
7833 Qascii_incompatible = intern ("ascii-incompatible");
7834 staticpro (&Qascii_incompatible);
7835
7836 Qemacs_mule = intern ("emacs-mule");
7837 staticpro (&Qemacs_mule);
7838
7839 Qraw_text = intern ("raw-text");
7840 staticpro (&Qraw_text);
7841
7842 Qutf_8 = intern ("utf-8");
7843 staticpro (&Qutf_8);
7844
7845 Qcoding_system_define_form = intern ("coding-system-define-form");
7846 staticpro (&Qcoding_system_define_form);
7847
7848 defsubr (&Scoding_system_p);
7849 defsubr (&Sread_coding_system);
7850 defsubr (&Sread_non_nil_coding_system);
7851 defsubr (&Scheck_coding_system);
7852 defsubr (&Sdetect_coding_region);
7853 defsubr (&Sdetect_coding_string);
7854 defsubr (&Sfind_coding_systems_region_internal);
7855 defsubr (&Sunencodable_char_position);
7856 defsubr (&Sdecode_coding_region);
7857 defsubr (&Sencode_coding_region);
7858 defsubr (&Sdecode_coding_string);
7859 defsubr (&Sencode_coding_string);
7860 defsubr (&Sdecode_sjis_char);
7861 defsubr (&Sencode_sjis_char);
7862 defsubr (&Sdecode_big5_char);
7863 defsubr (&Sencode_big5_char);
7864 defsubr (&Sset_terminal_coding_system_internal);
7865 defsubr (&Sset_safe_terminal_coding_system_internal);
7866 defsubr (&Sterminal_coding_system);
7867 defsubr (&Sset_keyboard_coding_system_internal);
7868 defsubr (&Skeyboard_coding_system);
7869 defsubr (&Sfind_operation_coding_system);
7870 defsubr (&Supdate_coding_systems_internal);
7871 defsubr (&Sset_coding_priority_internal);
7872 defsubr (&Sdefine_coding_system_internal);
7873
7874 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7875 doc: /* List of coding systems.
7876
7877 Do not alter the value of this variable manually. This variable should be
7878 updated by the functions `make-coding-system' and
7879 `define-coding-system-alias'. */);
7880 Vcoding_system_list = Qnil;
7881
7882 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7883 doc: /* Alist of coding system names.
7884 Each element is one element list of coding system name.
7885 This variable is given to `completing-read' as TABLE argument.
7886
7887 Do not alter the value of this variable manually. This variable should be
7888 updated by the functions `make-coding-system' and
7889 `define-coding-system-alias'. */);
7890 Vcoding_system_alist = Qnil;
7891
7892 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7893 doc: /* List of coding-categories (symbols) ordered by priority.
7894
7895 On detecting a coding system, Emacs tries code detection algorithms
7896 associated with each coding-category one by one in this order. When
7897 one algorithm agrees with a byte sequence of source text, the coding
7898 system bound to the corresponding coding-category is selected.
7899
7900 Don't modify this variable directly, but use `set-coding-priority'. */);
7901 {
7902 int i;
7903
7904 Vcoding_category_list = Qnil;
7905 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7906 Vcoding_category_list
7907 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7908 Vcoding_category_list);
7909 }
7910
7911 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7912 doc: /* Specify the coding system for read operations.
7913 It is useful to bind this variable with `let', but do not set it globally.
7914 If the value is a coding system, it is used for decoding on read operation.
7915 If not, an appropriate element is used from one of the coding system alists:
7916 There are three such tables, `file-coding-system-alist',
7917 `process-coding-system-alist', and `network-coding-system-alist'. */);
7918 Vcoding_system_for_read = Qnil;
7919
7920 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7921 doc: /* Specify the coding system for write operations.
7922 Programs bind this variable with `let', but you should not set it globally.
7923 If the value is a coding system, it is used for encoding of output,
7924 when writing it to a file and when sending it to a file or subprocess.
7925
7926 If this does not specify a coding system, an appropriate element
7927 is used from one of the coding system alists:
7928 There are three such tables, `file-coding-system-alist',
7929 `process-coding-system-alist', and `network-coding-system-alist'.
7930 For output to files, if the above procedure does not specify a coding system,
7931 the value of `buffer-file-coding-system' is used. */);
7932 Vcoding_system_for_write = Qnil;
7933
7934 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7935 doc: /* Coding system used in the latest file or process I/O.
7936 Also set by `encode-coding-region', `decode-coding-region',
7937 `encode-coding-string' and `decode-coding-string'. */);
7938 Vlast_coding_system_used = Qnil;
7939
7940 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7941 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7942 See info node `Coding Systems' and info node `Text and Binary' concerning
7943 such conversion. */);
7944 inhibit_eol_conversion = 0;
7945
7946 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7947 doc: /* Non-nil means process buffer inherits coding system of process output.
7948 Bind it to t if the process output is to be treated as if it were a file
7949 read from some filesystem. */);
7950 inherit_process_coding_system = 0;
7951
7952 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7953 doc: /* Alist to decide a coding system to use for a file I/O operation.
7954 The format is ((PATTERN . VAL) ...),
7955 where PATTERN is a regular expression matching a file name,
7956 VAL is a coding system, a cons of coding systems, or a function symbol.
7957 If VAL is a coding system, it is used for both decoding and encoding
7958 the file contents.
7959 If VAL is a cons of coding systems, the car part is used for decoding,
7960 and the cdr part is used for encoding.
7961 If VAL is a function symbol, the function must return a coding system
7962 or a cons of coding systems which are used as above. The function is
7963 called with an argument that is a list of the arguments with which
7964 `find-operation-coding-system' was called.
7965
7966 See also the function `find-operation-coding-system'
7967 and the variable `auto-coding-alist'. */);
7968 Vfile_coding_system_alist = Qnil;
7969
7970 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7971 doc: /* Alist to decide a coding system to use for a process I/O operation.
7972 The format is ((PATTERN . VAL) ...),
7973 where PATTERN is a regular expression matching a program name,
7974 VAL is a coding system, a cons of coding systems, or a function symbol.
7975 If VAL is a coding system, it is used for both decoding what received
7976 from the program and encoding what sent to the program.
7977 If VAL is a cons of coding systems, the car part is used for decoding,
7978 and the cdr part is used for encoding.
7979 If VAL is a function symbol, the function must return a coding system
7980 or a cons of coding systems which are used as above.
7981
7982 See also the function `find-operation-coding-system'. */);
7983 Vprocess_coding_system_alist = Qnil;
7984
7985 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7986 doc: /* Alist to decide a coding system to use for a network I/O operation.
7987 The format is ((PATTERN . VAL) ...),
7988 where PATTERN is a regular expression matching a network service name
7989 or is a port number to connect to,
7990 VAL is a coding system, a cons of coding systems, or a function symbol.
7991 If VAL is a coding system, it is used for both decoding what received
7992 from the network stream and encoding what sent to the network stream.
7993 If VAL is a cons of coding systems, the car part is used for decoding,
7994 and the cdr part is used for encoding.
7995 If VAL is a function symbol, the function must return a coding system
7996 or a cons of coding systems which are used as above.
7997
7998 See also the function `find-operation-coding-system'. */);
7999 Vnetwork_coding_system_alist = Qnil;
8000
8001 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8002 doc: /* Coding system to use with system messages.
8003 Also used for decoding keyboard input on X Window system. */);
8004 Vlocale_coding_system = Qnil;
8005
8006 /* The eol mnemonics are reset in startup.el system-dependently. */
8007 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8008 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8009 eol_mnemonic_unix = build_string (":");
8010
8011 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8012 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8013 eol_mnemonic_dos = build_string ("\\");
8014
8015 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8016 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8017 eol_mnemonic_mac = build_string ("/");
8018
8019 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8020 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8021 eol_mnemonic_undecided = build_string (":");
8022
8023 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8024 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8025 Venable_character_translation = Qt;
8026
8027 DEFVAR_LISP ("standard-translation-table-for-decode",
8028 &Vstandard_translation_table_for_decode,
8029 doc: /* Table for translating characters while decoding. */);
8030 Vstandard_translation_table_for_decode = Qnil;
8031
8032 DEFVAR_LISP ("standard-translation-table-for-encode",
8033 &Vstandard_translation_table_for_encode,
8034 doc: /* Table for translating characters while encoding. */);
8035 Vstandard_translation_table_for_encode = Qnil;
8036
8037 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8038 doc: /* Alist of charsets vs revision numbers.
8039 While encoding, if a charset (car part of an element) is found,
8040 designate it with the escape sequence identifying revision (cdr part of the element). */);
8041 Vcharset_revision_alist = Qnil;
8042
8043 DEFVAR_LISP ("default-process-coding-system",
8044 &Vdefault_process_coding_system,
8045 doc: /* Cons of coding systems used for process I/O by default.
8046 The car part is used for decoding a process output,
8047 the cdr part is used for encoding a text to be sent to a process. */);
8048 Vdefault_process_coding_system = Qnil;
8049
8050 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8051 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8052 This is a vector of length 256.
8053 If Nth element is non-nil, the existence of code N in a file
8054 \(or output of subprocess) doesn't prevent it to be detected as
8055 a coding system of ISO 2022 variant which has a flag
8056 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8057 or reading output of a subprocess.
8058 Only 128th through 159th elements has a meaning. */);
8059 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8060
8061 DEFVAR_LISP ("select-safe-coding-system-function",
8062 &Vselect_safe_coding_system_function,
8063 doc: /* Function to call to select safe coding system for encoding a text.
8064
8065 If set, this function is called to force a user to select a proper
8066 coding system which can encode the text in the case that a default
8067 coding system used in each operation can't encode the text.
8068
8069 The default value is `select-safe-coding-system' (which see). */);
8070 Vselect_safe_coding_system_function = Qnil;
8071
8072 DEFVAR_BOOL ("coding-system-require-warning",
8073 &coding_system_require_warning,
8074 doc: /* Internal use only.
8075 If non-nil, on writing a file, `select-safe-coding-system-function' is
8076 called even if `coding-system-for-write' is non-nil. The command
8077 `universal-coding-system-argument' binds this variable to t temporarily. */);
8078 coding_system_require_warning = 0;
8079
8080
8081 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8082 &inhibit_iso_escape_detection,
8083 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8084
8085 By default, on reading a file, Emacs tries to detect how the text is
8086 encoded. This code detection is sensitive to escape sequences. If
8087 the sequence is valid as ISO2022, the code is determined as one of
8088 the ISO2022 encodings, and the file is decoded by the corresponding
8089 coding system (e.g. `iso-2022-7bit').
8090
8091 However, there may be a case that you want to read escape sequences in
8092 a file as is. In such a case, you can set this variable to non-nil.
8093 Then, as the code detection ignores any escape sequences, no file is
8094 detected as encoded in some ISO2022 encoding. The result is that all
8095 escape sequences become visible in a buffer.
8096
8097 The default value is nil, and it is strongly recommended not to change
8098 it. That is because many Emacs Lisp source files that contain
8099 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8100 in Emacs's distribution, and they won't be decoded correctly on
8101 reading if you suppress escape sequence detection.
8102
8103 The other way to read escape sequences in a file without decoding is
8104 to explicitly specify some coding system that doesn't use ISO2022's
8105 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8106 inhibit_iso_escape_detection = 0;
8107
8108 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8109 doc: /* Char table for translating self-inserting characters.
8110 This is applied to the result of input methods, not their input. See also
8111 `keyboard-translate-table'. */);
8112 Vtranslation_table_for_input = Qnil;
8113 }
8114
8115 char *
8116 emacs_strerror (error_number)
8117 int error_number;
8118 {
8119 char *str;
8120
8121 synchronize_system_messages_locale ();
8122 str = strerror (error_number);
8123
8124 if (! NILP (Vlocale_coding_system))
8125 {
8126 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8127 Vlocale_coding_system,
8128 0);
8129 str = (char *) SDATA (dec);
8130 }
8131
8132 return str;
8133 }
8134
8135 #endif /* emacs */
8136
8137 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8138 (do not change this comment) */