1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /*** TABLE OF CONTENTS ***
30 2. Emacs' internal format (emacs-utf-8) handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
36 8. Shift-JIS and BIG5 handlers
38 10. C library functions
39 11. Emacs Lisp library functions
44 /*** 0. General comments ***
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcoding_system_hash_table. The conversion
61 from coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
71 o Charset-base coding system
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
77 o Old Emacs internal format (emacs-mule)
79 The coding system adopted by old versions of Emacs (20 and 21).
81 o ISO2022-base coding system
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
117 Like raw text, but don't do end-of-line conversion.
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we set up a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152 Below is the template of these functions. */
156 detect_coding_XXX (coding
, detect_info
)
157 struct coding_system
*coding
;
158 struct coding_detection_info
*detect_info
;
160 unsigned char *src
= coding
->source
;
161 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
162 int multibytep
= coding
->src_multibyte
;
163 int consumed_chars
= 0;
169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */
173 if (! __C_conforms_to_XXX___ (c
))
175 if (__C_strongly_suggests_XXX__ (c
))
176 found
= CATEGORY_MASK_XXX
;
178 /* The byte sequence is invalid for XXX. */
179 detect_info
->rejected
|= CATEGORY_MASK_XXX
;
183 /* The source was exhausted successfully. */
184 detect_info
->found
|= found
;
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
201 Below is the template of these functions. */
205 decode_coding_XXXX (coding
)
206 struct coding_system
*coding
;
208 unsigned char *src
= coding
->source
+ coding
->consumed
;
209 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base
;
214 /* A buffer to produce decoded characters. */
215 int *charbuf
= coding
->charbuf
;
216 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
217 int multibytep
= coding
->src_multibyte
;
222 if (charbuf
>= charbuf_end
)
223 /* No more room to produce a decoded character. */
230 if (src_base
< src_end
231 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base
< src_end
&& charbuf
< charbuf_end
)
235 *charbuf
++ = *src_base
++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding
->consumed
= coding
->consumed_char
= src_base
- coding
->source
;
239 /* Remember how many characters we produced. */
240 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce encoded text until it
258 reaches the head of the not-yet-encoded source text.
260 Below is a template of these functions. */
263 encode_coding_XXX (coding
)
264 struct coding_system
*coding
;
266 int multibytep
= coding
->dst_multibyte
;
267 int *charbuf
= coding
->charbuf
;
268 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
269 unsigned char *dst
= coding
->destination
+ coding
->produced
;
270 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
271 unsigned char *adjusted_dst_end
= dst_end
- _MAX_BYTES_PRODUCED_IN_LOOP_
;
272 int produced_chars
= 0;
274 for (; charbuf
< charbuf_end
&& dst
< adjusted_dst_end
; charbuf
++)
277 /* Encode C into DST, and increment DST. */
279 label_no_more_destination
:
280 /* How many chars and bytes we produced. */
281 coding
->produced_char
+= produced_chars
;
282 coding
->produced
= dst
- coding
->destination
;
287 /*** 1. Preamble ***/
294 #include "character.h"
297 #include "composite.h"
301 Lisp_Object Vcoding_system_hash_table
;
303 Lisp_Object Qcoding_system
, Qcoding_aliases
, Qeol_type
;
304 Lisp_Object Qunix
, Qdos
;
305 extern Lisp_Object Qmac
; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system
;
307 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
308 Lisp_Object Qdefault_char
;
309 Lisp_Object Qno_conversion
, Qundecided
;
310 Lisp_Object Qcharset
, Qiso_2022
, Qutf_8
, Qutf_16
, Qshift_jis
, Qbig5
;
311 Lisp_Object Qbig
, Qlittle
;
312 Lisp_Object Qcoding_system_history
;
313 Lisp_Object Qvalid_codes
;
314 Lisp_Object QCcategory
;
316 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
317 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
318 Lisp_Object Qstart_process
, Qopen_network_stream
;
319 Lisp_Object Qtarget_idx
;
321 Lisp_Object Vselect_safe_coding_system_function
;
323 /* Mnemonic string for each format of end-of-line. */
324 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
325 /* Mnemonic string to indicate format of end-of-line is not yet
327 Lisp_Object eol_mnemonic_undecided
;
331 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
333 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
335 /* Coding system emacs-mule and raw-text are for converting only
336 end-of-line format. */
337 Lisp_Object Qemacs_mule
, Qraw_text
;
339 /* Coding-systems are handed between Emacs Lisp programs and C internal
340 routines by the following three variables. */
341 /* Coding-system for reading files and receiving data from process. */
342 Lisp_Object Vcoding_system_for_read
;
343 /* Coding-system for writing files and sending data to process. */
344 Lisp_Object Vcoding_system_for_write
;
345 /* Coding-system actually used in the latest I/O. */
346 Lisp_Object Vlast_coding_system_used
;
348 /* A vector of length 256 which contains information about special
349 Latin codes (especially for dealing with Microsoft codes). */
350 Lisp_Object Vlatin_extra_code_table
;
352 /* Flag to inhibit code conversion of end-of-line format. */
353 int inhibit_eol_conversion
;
355 /* Flag to inhibit ISO2022 escape sequence detection. */
356 int inhibit_iso_escape_detection
;
358 /* Flag to make buffer-file-coding-system inherit from process-coding. */
359 int inherit_process_coding_system
;
361 /* Coding system to be used to encode text for terminal display. */
362 struct coding_system terminal_coding
;
364 /* Coding system to be used to encode text for terminal display when
365 terminal coding system is nil. */
366 struct coding_system safe_terminal_coding
;
368 /* Coding system of what is sent from terminal keyboard. */
369 struct coding_system keyboard_coding
;
371 Lisp_Object Vfile_coding_system_alist
;
372 Lisp_Object Vprocess_coding_system_alist
;
373 Lisp_Object Vnetwork_coding_system_alist
;
375 Lisp_Object Vlocale_coding_system
;
379 /* Flag to tell if we look up translation table on character code
381 Lisp_Object Venable_character_translation
;
382 /* Standard translation table to look up on decoding (reading). */
383 Lisp_Object Vstandard_translation_table_for_decode
;
384 /* Standard translation table to look up on encoding (writing). */
385 Lisp_Object Vstandard_translation_table_for_encode
;
387 Lisp_Object Qtranslation_table
;
388 Lisp_Object Qtranslation_table_id
;
389 Lisp_Object Qtranslation_table_for_decode
;
390 Lisp_Object Qtranslation_table_for_encode
;
392 /* Alist of charsets vs revision number. */
393 static Lisp_Object Vcharset_revision_table
;
395 /* Default coding systems used for process I/O. */
396 Lisp_Object Vdefault_process_coding_system
;
398 /* Global flag to tell that we can't call post-read-conversion and
399 pre-write-conversion functions. Usually the value is zero, but it
400 is set to 1 temporarily while such functions are running. This is
401 to avoid infinite recursive call. */
402 static int inhibit_pre_post_conversion
;
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system
;
406 Lisp_Object Vbig5_coding_system
;
409 static int detect_coding_utf_8
P_ ((struct coding_system
*,
410 struct coding_detection_info
*info
));
411 static void decode_coding_utf_8
P_ ((struct coding_system
*));
412 static int encode_coding_utf_8
P_ ((struct coding_system
*));
414 static int detect_coding_utf_16
P_ ((struct coding_system
*,
415 struct coding_detection_info
*info
));
416 static void decode_coding_utf_16
P_ ((struct coding_system
*));
417 static int encode_coding_utf_16
P_ ((struct coding_system
*));
419 static int detect_coding_iso_2022
P_ ((struct coding_system
*,
420 struct coding_detection_info
*info
));
421 static void decode_coding_iso_2022
P_ ((struct coding_system
*));
422 static int encode_coding_iso_2022
P_ ((struct coding_system
*));
424 static int detect_coding_emacs_mule
P_ ((struct coding_system
*,
425 struct coding_detection_info
*info
));
426 static void decode_coding_emacs_mule
P_ ((struct coding_system
*));
427 static int encode_coding_emacs_mule
P_ ((struct coding_system
*));
429 static int detect_coding_sjis
P_ ((struct coding_system
*,
430 struct coding_detection_info
*info
));
431 static void decode_coding_sjis
P_ ((struct coding_system
*));
432 static int encode_coding_sjis
P_ ((struct coding_system
*));
434 static int detect_coding_big5
P_ ((struct coding_system
*,
435 struct coding_detection_info
*info
));
436 static void decode_coding_big5
P_ ((struct coding_system
*));
437 static int encode_coding_big5
P_ ((struct coding_system
*));
439 static int detect_coding_ccl
P_ ((struct coding_system
*,
440 struct coding_detection_info
*info
));
441 static void decode_coding_ccl
P_ ((struct coding_system
*));
442 static int encode_coding_ccl
P_ ((struct coding_system
*));
444 static void decode_coding_raw_text
P_ ((struct coding_system
*));
445 static int encode_coding_raw_text
P_ ((struct coding_system
*));
448 /* ISO2022 section */
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
/* Accessors for the ISO-2022 specific members of a coding system, all
   of which live in (coding)->spec.iso_2022.  CODING must be a pointer;
   it is dereferenced with `->'.  */

/* The `flags' member.  */
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
/* Element REG of the `current_designation' array.  */
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
/* Element PLANE of the `current_invocation' array.  */
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
/* The `single_shifting' member.  */
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
/* The `bol' member.  */
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
/* The `current_designation' element selected by the value stored in
   `current_invocation' for PLANE, i.e. CODING_ISO_DESIGNATION applied
   to the result of CODING_ISO_INVOCATION.  */
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
487 /* All code (1-byte) of ISO2022 is classified into one of the
489 enum iso_code_class_type
491 ISO_control_0
, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return
, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out
, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in
, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7
, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape
, /* ISO_CODE_ESC (0x1B) */
499 ISO_control_1
, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2
, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3
, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer
, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F
, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0
, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF
, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1
/* Graphic codes in the range 0xA1..0xFE. */
511 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
518 /* If set, reset graphic planes and registers at end-of-line to the
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
532 /* If set, use single-shift function. Overrides
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
545 /* If set, assume designation states are reset at beginning of line on
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
549 /* If set, designation sequence should be placed at beginning of line
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
553 /* If set, do not encode unsafe characters on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
556 /* If set, extra latin codes (128..159) are accepted as a valid code
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
/* Accessors for the UTF-16 specific members of a coding system, all of
   which live in (coding)->spec.utf_16.  CODING must be a pointer; it is
   dereferenced with `->'.  */

/* The `bom' member.  */
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
/* The `endian' member.  */
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
/* The `surrogate' member.  */
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
/* Fetch the `coding_attr_ccl_decoder' element of the object that
   CODING_ID_ATTRS yields for (coding)->id.  Note that the expansion is
   not wrapped in its own parentheses.  */
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
/* Likewise, but fetch the `coding_attr_ccl_encoder' element.  */
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
595 /* Index for each coding category in `coding_categories' */
599 coding_category_iso_7
,
600 coding_category_iso_7_tight
,
601 coding_category_iso_8_1
,
602 coding_category_iso_8_2
,
603 coding_category_iso_7_else
,
604 coding_category_iso_8_else
,
605 coding_category_utf_8
,
606 coding_category_utf_16_auto
,
607 coding_category_utf_16_be
,
608 coding_category_utf_16_le
,
609 coding_category_utf_16_be_nosig
,
610 coding_category_utf_16_le_nosig
,
611 coding_category_charset
,
612 coding_category_sjis
,
613 coding_category_big5
,
615 coding_category_emacs_mule
,
616 /* All above are targets of code detection. */
617 coding_category_raw_text
,
618 coding_category_undecided
,
622 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask
   is the single bit (1 << N), where N is the coding_category_XXX index
   of the same name, so masks can be combined with `|' and tested with
   `&'.  */
623 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
624 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
625 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
626 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
627 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
628 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
629 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
630 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
631 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
632 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
633 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
634 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
635 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
636 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
637 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
638 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
639 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
640 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
642 /* This value is returned if detect_coding_mask () finds nothing other
643 than ASCII characters.  It is the union of the category masks
   listed below.  CATEGORY_MASK_UTF_16_AUTO and CATEGORY_MASK_RAW_TEXT
   are not part of it.  */
644 #define CATEGORY_MASK_ANY \
645 (CATEGORY_MASK_ISO_7 \
646 | CATEGORY_MASK_ISO_7_TIGHT \
647 | CATEGORY_MASK_ISO_8_1 \
648 | CATEGORY_MASK_ISO_8_2 \
649 | CATEGORY_MASK_ISO_7_ELSE \
650 | CATEGORY_MASK_ISO_8_ELSE \
651 | CATEGORY_MASK_UTF_8 \
652 | CATEGORY_MASK_UTF_16_BE \
653 | CATEGORY_MASK_UTF_16_LE \
654 | CATEGORY_MASK_UTF_16_BE_NOSIG \
655 | CATEGORY_MASK_UTF_16_LE_NOSIG \
656 | CATEGORY_MASK_CHARSET \
657 | CATEGORY_MASK_SJIS \
658 | CATEGORY_MASK_BIG5 \
659 | CATEGORY_MASK_CCL \
660 | CATEGORY_MASK_EMACS_MULE)
/* Composite masks, each the bitwise OR of several single-category
   masks.  */

/* ISO_7 and ISO_7_TIGHT.  */
663 #define CATEGORY_MASK_ISO_7BIT \
664 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
/* ISO_8_1 and ISO_8_2.  */
666 #define CATEGORY_MASK_ISO_8BIT \
667 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
/* ISO_7_ELSE and ISO_8_ELSE.  */
669 #define CATEGORY_MASK_ISO_ELSE \
670 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
/* ISO_7, ISO_7_TIGHT, ISO_7_ELSE and ISO_8_ELSE, i.e. every ISO
   category except ISO_8_1 and ISO_8_2.
   NOTE(review): presumably the categories that can be recognized by
   escape sequences -- confirm against detect_coding_iso_2022.  */
672 #define CATEGORY_MASK_ISO_ESCAPE \
673 (CATEGORY_MASK_ISO_7 \
674 | CATEGORY_MASK_ISO_7_TIGHT \
675 | CATEGORY_MASK_ISO_7_ELSE \
676 | CATEGORY_MASK_ISO_8_ELSE)
/* All six ISO categories: the 7BIT, 8BIT and ELSE groups together.  */
678 #define CATEGORY_MASK_ISO \
679 ( CATEGORY_MASK_ISO_7BIT \
680 | CATEGORY_MASK_ISO_8BIT \
681 | CATEGORY_MASK_ISO_ELSE)
/* The four UTF-16 categories with explicit byte order (BE, LE and
   their NOSIG variants).  CATEGORY_MASK_UTF_16_AUTO is not part of
   this mask.  */
683 #define CATEGORY_MASK_UTF_16 \
684 (CATEGORY_MASK_UTF_16_BE \
685 | CATEGORY_MASK_UTF_16_LE \
686 | CATEGORY_MASK_UTF_16_BE_NOSIG \
687 | CATEGORY_MASK_UTF_16_LE_NOSIG)
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list
;
694 /* Table of coding categories (Lisp symbols). This variable is for
696 static Lisp_Object Vcoding_category_table
;
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities
[coding_category_max
];
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories
[coding_category_max
];
705 /*** Commonly used macros and functions ***/
/* Yield the smaller of A and B.  Each argument appears twice in the
   expansion and may therefore be evaluated twice; do not pass
   expressions with side effects.  */
708 #define min(a, b) ((a) < (b) ? (a) : (b))
/* Yield the larger of A and B.  The same double-evaluation caveat
   applies.  */
711 #define max(a, b) ((a) > (b) ? (a) : (b))
714 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
716 attrs = CODING_ID_ATTRS (coding->id); \
717 eol_type = CODING_ID_EOL_TYPE (coding->id); \
718 if (VECTORP (eol_type)) \
720 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. The caller
727 should declare and set these variables appropriately in advance:
728 src, src_end, multibytep
731 #define ONE_MORE_BYTE(c) \
733 if (src == src_end) \
735 if (src_base < src) \
736 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
737 goto no_more_source; \
740 if (multibytep && (c & 0x80)) \
742 if ((c & 0xFE) != 0xC0) \
743 error ("Undecodable char found"); \
744 c = ((c & 1) << 6) | *src++; \
750 #define ONE_MORE_BYTE_NO_CHECK(c) \
753 if (multibytep && (c & 0x80)) \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
763 /* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
770 #define EMIT_ONE_ASCII_BYTE(c) \
777 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
779 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
786 /* Store a byte C in the place pointed by DST and increment DST to the
787 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
788 nonzero, store in an appropriate multibyte form. The caller should
789 declare and set the variables `dst' and `multibytep' appropriately
792 #define EMIT_ONE_BYTE(c) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
809 #define EMIT_TWO_BYTES(c1, c2) \
811 produced_chars += 2; \
818 ch = BYTE8_TO_CHAR (ch); \
819 CHAR_STRING_ADVANCE (ch, dst); \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
833 #define EMIT_THREE_BYTES(c1, c2, c3) \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
840 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
842 EMIT_TWO_BYTES (c1, c2); \
843 EMIT_TWO_BYTES (c3, c4); \
847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
849 charset_map_loaded = 0; \
850 c = DECODE_CHAR (charset, code); \
851 if (charset_map_loaded) \
853 unsigned char *orig = coding->source; \
856 coding_set_source (coding); \
857 offset = coding->source - orig; \
859 src_base += offset; \
865 #define ASSURE_DESTINATION(bytes) \
867 if (dst + (bytes) >= dst_end) \
869 int more_bytes = charbuf_end - charbuf + (bytes); \
871 dst = alloc_destination (coding, more_bytes, dst); \
872 dst_end = coding->destination + coding->dst_bytes; \
879 coding_set_source (coding
)
880 struct coding_system
*coding
;
882 if (BUFFERP (coding
->src_object
))
884 struct buffer
*buf
= XBUFFER (coding
->src_object
);
886 if (coding
->src_pos
< 0)
887 coding
->source
= BUF_GAP_END_ADDR (buf
) + coding
->src_pos_byte
;
889 coding
->source
= BUF_BYTE_ADDRESS (buf
, coding
->src_pos_byte
);
891 else if (STRINGP (coding
->src_object
))
893 coding
->source
= (XSTRING (coding
->src_object
)->data
894 + coding
->src_pos_byte
);
897 /* Otherwise, the source is C string and is never relocated
898 automatically. Thus we don't have to update anything. */
903 coding_set_destination (coding
)
904 struct coding_system
*coding
;
906 if (BUFFERP (coding
->dst_object
))
908 if (coding
->src_pos
< 0)
910 coding
->destination
= BEG_ADDR
+ coding
->dst_pos_byte
- 1;
911 coding
->dst_bytes
= (GAP_END_ADDR
912 - (coding
->src_bytes
- coding
->consumed
)
913 - coding
->destination
);
917 /* We are sure that coding->dst_pos_byte is before the gap
919 coding
->destination
= (BUF_BEG_ADDR (XBUFFER (coding
->dst_object
))
920 + coding
->dst_pos_byte
- 1);
921 coding
->dst_bytes
= (BUF_GAP_END_ADDR (XBUFFER (coding
->dst_object
))
922 - coding
->destination
);
926 /* Otherwise, the destination is C string and is never relocated
927 automatically. Thus we don't have to update anything. */
933 coding_alloc_by_realloc (coding
, bytes
)
934 struct coding_system
*coding
;
937 coding
->destination
= (unsigned char *) xrealloc (coding
->destination
,
938 coding
->dst_bytes
+ bytes
);
939 coding
->dst_bytes
+= bytes
;
943 coding_alloc_by_making_gap (coding
, bytes
)
944 struct coding_system
*coding
;
947 if (BUFFERP (coding
->dst_object
)
948 && EQ (coding
->src_object
, coding
->dst_object
))
950 EMACS_INT add
= coding
->src_bytes
- coding
->consumed
;
952 GAP_SIZE
-= add
; ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
954 GAP_SIZE
+= add
; ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
958 Lisp_Object this_buffer
;
960 this_buffer
= Fcurrent_buffer ();
961 set_buffer_internal (XBUFFER (coding
->dst_object
));
963 set_buffer_internal (XBUFFER (this_buffer
));
968 static unsigned char *
969 alloc_destination (coding
, nbytes
, dst
)
970 struct coding_system
*coding
;
974 EMACS_INT offset
= dst
- coding
->destination
;
976 if (BUFFERP (coding
->dst_object
))
977 coding_alloc_by_making_gap (coding
, nbytes
);
979 coding_alloc_by_realloc (coding
, nbytes
);
980 coding
->result
= CODING_RESULT_SUCCESS
;
981 coding_set_destination (coding
);
982 dst
= coding
->destination
+ offset
;
986 /** Macros for annotations. */
988 /* Maximum length of annotation data (sum of annotations for
989 composition and charset). */
990 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
992 /* An annotation data is stored in the array coding->charbuf in this
994 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
995 LENGTH is the number of elements in the annotation.
996 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
997 FROM and TO specify the range of text annotated. They are relative
998 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1000 The format of the following elements depends on ANNOTATION_MASK.
1002 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1004 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1005 METHOD is one of enum composition_method.
1006 Optional COMPOSITION-COMPONENTS are characters and composition
1009 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1012 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1014 *(buf)++ = -(len); \
1015 *(buf)++ = (mask); \
1016 *(buf)++ = (from); \
1018 coding->annotated = 1; \
1021 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1023 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1028 #define ADD_CHARSET_DATA(buf, from, to, id) \
1030 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1035 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1043 Check if a text is encoded in UTF-8. If it is, return 1, else
/* Predicates that classify a single octet C of a UTF-8 byte sequence
   by its high-order bits.  Each yields nonzero when C matches.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
1046 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* The top two bits of C are 10 (a trailing octet).  */
1047 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
/* The top three bits of C are 110 (leads a two-octet sequence).  */
1048 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* The top four bits of C are 1110 (leads a three-octet sequence).  */
1049 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* The top five bits of C are 11110 (leads a four-octet sequence).  */
1050 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* The top six bits of C are 111110 (leads a five-octet sequence).  */
1051 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1054 detect_coding_utf_8 (coding
, detect_info
)
1055 struct coding_system
*coding
;
1056 struct coding_detection_info
*detect_info
;
1058 unsigned char *src
= coding
->source
, *src_base
= src
;
1059 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1060 int multibytep
= coding
->src_multibyte
;
1061 int consumed_chars
= 0;
1065 detect_info
->checked
|= CATEGORY_MASK_UTF_8
;
1066 /* A coding system of this category is always ASCII compatible. */
1067 src
+= coding
->head_ascii
;
1071 int c
, c1
, c2
, c3
, c4
;
1075 if (UTF_8_1_OCTET_P (c
))
1079 if (! UTF_8_EXTRA_OCTET_P (c1
))
1081 if (UTF_8_2_OCTET_LEADING_P (c
))
1083 found
= CATEGORY_MASK_UTF_8
;
1087 if (! UTF_8_EXTRA_OCTET_P (c2
))
1089 if (UTF_8_3_OCTET_LEADING_P (c
))
1091 found
= CATEGORY_MASK_UTF_8
;
1095 if (! UTF_8_EXTRA_OCTET_P (c3
))
1097 if (UTF_8_4_OCTET_LEADING_P (c
))
1099 found
= CATEGORY_MASK_UTF_8
;
1103 if (! UTF_8_EXTRA_OCTET_P (c4
))
1105 if (UTF_8_5_OCTET_LEADING_P (c
))
1107 found
= CATEGORY_MASK_UTF_8
;
1112 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1116 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1118 detect_info
->rejected
|= CATEGORY_MASK_UTF_8
;
1121 detect_info
->found
|= found
;
1127 decode_coding_utf_8 (coding
)
1128 struct coding_system
*coding
;
1130 unsigned char *src
= coding
->source
+ coding
->consumed
;
1131 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1132 unsigned char *src_base
;
1133 int *charbuf
= coding
->charbuf
;
1134 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1135 int consumed_chars
= 0, consumed_chars_base
;
1136 int multibytep
= coding
->src_multibyte
;
1137 Lisp_Object attr
, eol_type
, charset_list
;
1139 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1143 int c
, c1
, c2
, c3
, c4
, c5
;
1146 consumed_chars_base
= consumed_chars
;
1148 if (charbuf
>= charbuf_end
)
1152 if (UTF_8_1_OCTET_P(c1
))
1157 if (EQ (eol_type
, Qdos
))
1161 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1162 goto no_more_source
;
1167 else if (EQ (eol_type
, Qmac
))
1174 if (! UTF_8_EXTRA_OCTET_P (c2
))
1176 if (UTF_8_2_OCTET_LEADING_P (c1
))
1178 c
= ((c1
& 0x1F) << 6) | (c2
& 0x3F);
1179 /* Reject overlong sequences here and below. Encoders
1180 producing them are incorrect, they can be misleading,
1181 and they mess up read/write invariance. */
1188 if (! UTF_8_EXTRA_OCTET_P (c3
))
1190 if (UTF_8_3_OCTET_LEADING_P (c1
))
1192 c
= (((c1
& 0xF) << 12)
1193 | ((c2
& 0x3F) << 6) | (c3
& 0x3F));
1195 || (c
>= 0xd800 && c
< 0xe000)) /* surrogates (invalid) */
1201 if (! UTF_8_EXTRA_OCTET_P (c4
))
1203 if (UTF_8_4_OCTET_LEADING_P (c1
))
1205 c
= (((c1
& 0x7) << 18) | ((c2
& 0x3F) << 12)
1206 | ((c3
& 0x3F) << 6) | (c4
& 0x3F));
1213 if (! UTF_8_EXTRA_OCTET_P (c5
))
1215 if (UTF_8_5_OCTET_LEADING_P (c1
))
1217 c
= (((c1
& 0x3) << 24) | ((c2
& 0x3F) << 18)
1218 | ((c3
& 0x3F) << 12) | ((c4
& 0x3F) << 6)
1220 if ((c
> MAX_CHAR
) || (c
< 0x200000))
1235 consumed_chars
= consumed_chars_base
;
1237 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
1242 coding
->consumed_char
+= consumed_chars_base
;
1243 coding
->consumed
= src_base
- coding
->source
;
1244 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1249 encode_coding_utf_8 (coding
)
1250 struct coding_system
*coding
;
1252 int multibytep
= coding
->dst_multibyte
;
1253 int *charbuf
= coding
->charbuf
;
1254 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1255 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1256 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1257 int produced_chars
= 0;
1262 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
1264 while (charbuf
< charbuf_end
)
1266 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p
, *pend
= str
;
1268 ASSURE_DESTINATION (safe_room
);
1270 if (CHAR_BYTE8_P (c
))
1272 c
= CHAR_TO_BYTE8 (c
);
1277 CHAR_STRING_ADVANCE (c
, pend
);
1278 for (p
= str
; p
< pend
; p
++)
1285 int safe_room
= MAX_MULTIBYTE_LENGTH
;
1287 while (charbuf
< charbuf_end
)
1289 ASSURE_DESTINATION (safe_room
);
1291 dst
+= CHAR_STRING (c
, dst
);
1295 coding
->result
= CODING_RESULT_SUCCESS
;
1296 coding
->produced_char
+= produced_chars
;
1297 coding
->produced
= dst
- coding
->destination
;
1302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1303 Check if a text is encoded in one of UTF-16 based coding systems.
1304 If it is, return 1, else return 0. */
/* Nonzero if the code unit VAL is a high (leading) surrogate: its
   bits selected by the mask 0xFC00 equal 0xD800, i.e. VAL is in
   0xD800..0xDBFF for a 16-bit value.  */
1306 #define UTF_16_HIGH_SURROGATE_P(val) \
1307 (((val) & 0xFC00) == 0xD800)
/* Nonzero if the code unit VAL is a low (trailing) surrogate: its
   bits selected by the mask 0xFC00 equal 0xDC00, i.e. VAL is in
   0xDC00..0xDFFF for a 16-bit value.  */
1309 #define UTF_16_LOW_SURROGATE_P(val) \
1310 (((val) & 0xFC00) == 0xDC00)
/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL may be
   evaluated up to three times, so pass an expression without side
   effects.  */
1312 #define UTF_16_INVALID_P(val) \
1313 (((val) == 0xFFFE) \
1314 || ((val) == 0xFFFF) \
1315 || UTF_16_LOW_SURROGATE_P (val))
1319 detect_coding_utf_16 (coding
, detect_info
)
1320 struct coding_system
*coding
;
1321 struct coding_detection_info
*detect_info
;
1323 unsigned char *src
= coding
->source
, *src_base
= src
;
1324 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1325 int multibytep
= coding
->src_multibyte
;
1326 int consumed_chars
= 0;
1329 detect_info
->checked
|= CATEGORY_MASK_UTF_16
;
1331 if (coding
->mode
& CODING_MODE_LAST_BLOCK
1332 && (coding
->src_bytes
& 1))
1334 detect_info
->rejected
|= CATEGORY_MASK_UTF_16
;
1340 if ((c1
== 0xFF) && (c2
== 0xFE))
1342 detect_info
->found
|= (CATEGORY_MASK_UTF_16_LE
1343 | CATEGORY_MASK_UTF_16_AUTO
);
1344 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_BE
;
1346 else if ((c1
== 0xFE) && (c2
== 0xFF))
1348 detect_info
->found
|= (CATEGORY_MASK_UTF_16_BE
1349 | CATEGORY_MASK_UTF_16_AUTO
);
1350 detect_info
->rejected
|= CATEGORY_MASK_UTF_16_LE
;
1357 decode_coding_utf_16 (coding
)
1358 struct coding_system
*coding
;
1360 unsigned char *src
= coding
->source
+ coding
->consumed
;
1361 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1362 unsigned char *src_base
;
1363 int *charbuf
= coding
->charbuf
;
1364 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
1365 int consumed_chars
= 0, consumed_chars_base
;
1366 int multibytep
= coding
->src_multibyte
;
1367 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1368 enum utf_16_endian_type endian
= CODING_UTF_16_ENDIAN (coding
);
1369 int surrogate
= CODING_UTF_16_SURROGATE (coding
);
1370 Lisp_Object attr
, eol_type
, charset_list
;
1372 CODING_GET_INFO (coding
, attr
, eol_type
, charset_list
);
1374 if (bom
== utf_16_with_bom
)
1383 if (endian
== utf_16_big_endian
1384 ? c
!= 0xFEFF : c
!= 0xFFFE)
1386 /* The first two bytes are not BOM. Treat them as bytes
1387 for a normal character. */
1391 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1393 else if (bom
== utf_16_detect_bom
)
1395 /* We have already tried to detect BOM and failed in
1397 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1405 consumed_chars_base
= consumed_chars
;
1407 if (charbuf
+ 2 >= charbuf_end
)
1412 c
= (endian
== utf_16_big_endian
1413 ? ((c1
<< 8) | c2
) : ((c2
<< 8) | c1
));
1416 if (! UTF_16_LOW_SURROGATE_P (c
))
1418 if (endian
== utf_16_big_endian
)
1419 c1
= surrogate
>> 8, c2
= surrogate
& 0xFF;
1421 c1
= surrogate
& 0xFF, c2
= surrogate
>> 8;
1425 if (UTF_16_HIGH_SURROGATE_P (c
))
1426 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1432 c
= ((surrogate
- 0xD800) << 10) | (c
- 0xDC00);
1433 CODING_UTF_16_SURROGATE (coding
) = surrogate
= 0;
1439 if (UTF_16_HIGH_SURROGATE_P (c
))
1440 CODING_UTF_16_SURROGATE (coding
) = surrogate
= c
;
1447 coding
->consumed_char
+= consumed_chars_base
;
1448 coding
->consumed
= src_base
- coding
->source
;
1449 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
1453 encode_coding_utf_16 (coding
)
1454 struct coding_system
*coding
;
1456 int multibytep
= coding
->dst_multibyte
;
1457 int *charbuf
= coding
->charbuf
;
1458 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
1459 unsigned char *dst
= coding
->destination
+ coding
->produced
;
1460 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
1462 enum utf_16_bom_type bom
= CODING_UTF_16_BOM (coding
);
1463 int big_endian
= CODING_UTF_16_ENDIAN (coding
) == utf_16_big_endian
;
1464 int produced_chars
= 0;
1465 Lisp_Object attrs
, eol_type
, charset_list
;
1468 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1470 if (bom
!= utf_16_without_bom
)
1472 ASSURE_DESTINATION (safe_room
);
1474 EMIT_TWO_BYTES (0xFE, 0xFF);
1476 EMIT_TWO_BYTES (0xFF, 0xFE);
1477 CODING_UTF_16_BOM (coding
) = utf_16_without_bom
;
1480 while (charbuf
< charbuf_end
)
1482 ASSURE_DESTINATION (safe_room
);
1484 if (c
>= MAX_UNICODE_CHAR
)
1485 c
= coding
->default_char
;
1490 EMIT_TWO_BYTES (c
>> 8, c
& 0xFF);
1492 EMIT_TWO_BYTES (c
& 0xFF, c
>> 8);
1499 c1
= (c
>> 10) + 0xD800;
1500 c2
= (c
& 0x3FF) + 0xDC00;
1502 EMIT_FOUR_BYTES (c1
>> 8, c1
& 0xFF, c2
>> 8, c2
& 0xFF);
1504 EMIT_FOUR_BYTES (c1
& 0xFF, c1
>> 8, c2
& 0xFF, c2
>> 8);
1507 coding
->result
= CODING_RESULT_SUCCESS
;
1508 coding
->produced
= dst
- coding
->destination
;
1509 coding
->produced_char
+= produced_chars
;
1514 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1516 /* Emacs' internal format for representation of multiple character
1517 sets is a kind of multi-byte encoding, i.e. characters are
1518 represented by variable-length sequences of one-byte codes.
1520 ASCII characters and control characters (e.g. `tab', `newline') are
1521 represented by one-byte sequences which are their ASCII codes, in
1522 the range 0x00 through 0x7F.
1524 8-bit characters of the range 0x80..0x9F are represented by
1525 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1528 8-bit characters of the range 0xA0..0xFF are represented by
1529 one-byte sequences which are their 8-bit code.
1531 The other characters are represented by a sequence of `base
1532 leading-code', optional `extended leading-code', and one or two
1533 `position-code's. The length of the sequence is determined by the
1534 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1535 whereas extended leading-code and position-code take the range 0xA0
1536 through 0xFF. See `charset.h' for more details about leading-code
1539 --- CODE RANGE of Emacs' internal format ---
1543 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1544 eight-bit-graphic 0xA0..0xFF
1545 ELSE 0x81..0x9D + [0xA0..0xFF]+
1546 ---------------------------------------------
1548 As this is the internal character representation, the format is
1549 usually not used externally (i.e. in a file or in data sent to a
1550 process). But, it is possible to have a text externally in this
1551 format (i.e. by encoding by the coding system `emacs-mule').
1553 In that case, a sequence of one-byte codes has a slightly different
1556 At first, all characters in eight-bit-control are represented by
1557 one-byte sequences which are their 8-bit code.
1559 Next, character composition data are represented by the byte
1560 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1562 METHOD is 0xF2 plus one of the composition methods (enum
1563 composition_method),
1565 BYTES is 0xA0 plus a byte length of this composition data,
1567 CHARS is 0xA0 plus a number of characters composed by this
1570 COMPONENTs are characters of multibyte form or composition
1571 rules encoded by two bytes of ASCII codes.
1573 In addition, for backward compatibility, the following formats are
1574 also recognized as composition data on decoding.
1577 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1580 MSEQ is a multibyte form but in these special formats:
1581 ASCII: 0xA0 ASCII_CODE+0x80,
1582 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1583 RULE is a one byte code of the range 0xA0..0xF0 that
1584 represents a composition rule.
1587 char emacs_mule_bytes
[256];
1590 emacs_mule_char (coding
, src
, nbytes
, nchars
, id
)
1591 struct coding_system
*coding
;
1593 int *nbytes
, *nchars
, *id
;
1595 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1596 int multibytep
= coding
->src_multibyte
;
1597 unsigned char *src_base
= src
;
1598 struct charset
*charset
;
1601 int consumed_chars
= 0;
1604 switch (emacs_mule_bytes
[c
])
1607 if (! (charset
= emacs_mule_charset
[c
]))
1614 if (c
== EMACS_MULE_LEADING_CODE_PRIVATE_11
1615 || c
== EMACS_MULE_LEADING_CODE_PRIVATE_12
)
1618 if (! (charset
= emacs_mule_charset
[c
]))
1625 if (! (charset
= emacs_mule_charset
[c
]))
1628 code
= (c
& 0x7F) << 8;
1636 if (! (charset
= emacs_mule_charset
[c
]))
1639 code
= (c
& 0x7F) << 8;
1646 charset
= CHARSET_FROM_ID (ASCII_BYTE_P (code
)
1647 ? charset_ascii
: charset_eight_bit
);
1653 c
= DECODE_CHAR (charset
, code
);
1656 *nbytes
= src
- src_base
;
1657 *nchars
= consumed_chars
;
1670 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1671 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1675 detect_coding_emacs_mule (coding
, detect_info
)
1676 struct coding_system
*coding
;
1677 struct coding_detection_info
*detect_info
;
1679 unsigned char *src
= coding
->source
, *src_base
= src
;
1680 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1681 int multibytep
= coding
->src_multibyte
;
1682 int consumed_chars
= 0;
1687 detect_info
->checked
|= CATEGORY_MASK_EMACS_MULE
;
1688 /* A coding system of this category is always ASCII compatible. */
1689 src
+= coding
->head_ascii
;
1699 /* Perhaps the start of a composite character. We simply skip
1700 it because analyzing it is too heavy for detection. But,
1701 at least, we check that the composite character
1702 consists of more than 4 bytes. */
1703 unsigned char *src_base
;
1713 if (src
- src_base
<= 4)
1715 found
= CATEGORY_MASK_EMACS_MULE
;
1723 && (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
1728 unsigned char *src_base
= src
- 1;
1735 if (src
- src_base
!= emacs_mule_bytes
[*src_base
])
1737 found
= CATEGORY_MASK_EMACS_MULE
;
1740 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1744 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
1746 detect_info
->rejected
|= CATEGORY_MASK_EMACS_MULE
;
1749 detect_info
->found
|= found
;
1754 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1756 /* Decode a character represented as a component of composition
1757 sequence of Emacs 20/21 style at SRC. Set C to that character and
1758 update SRC to the head of next character (or an encoded composition
1759 rule). If SRC doesn't point to a composition component, set C to -1.
1760 If SRC points to an invalid byte sequence, global exit by a return
1763 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1767 int nbytes, nchars; \
1769 if (src == src_end) \
1771 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776 goto invalid_code; \
1780 consumed_chars += nchars; \
1785 /* Decode a composition rule represented as a component of composition
1786 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1787 and increment BUF. If SRC points to an invalid byte sequence, set C
1790 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1792 int c, gref, nref; \
1794 if (src >= src_end) \
1795 goto invalid_code; \
1796 ONE_MORE_BYTE_NO_CHECK (c); \
1798 if (c < 0 || c >= 81) \
1799 goto invalid_code; \
1801 gref = c / 9, nref = c % 9; \
1802 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1806 /* Decode a composition rule represented as a component of composition
1807 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1808 and increment BUF. If SRC points to an invalid byte sequence, set C
1811 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1815 if (src + 1>= src_end) \
1816 goto invalid_code; \
1817 ONE_MORE_BYTE_NO_CHECK (gref); \
1819 ONE_MORE_BYTE_NO_CHECK (nref); \
1821 if (gref < 0 || gref >= 81 \
1822 || nref < 0 || nref >= 81) \
1823 goto invalid_code; \
1824 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1828 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1830 /* Emacs 21 style format. The first three bytes at SRC are \
1831 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1832 the byte length of this composition information, CHARS is the \
1833 number of characters composed by this composition. */ \
1834 enum composition_method method = c - 0xF2; \
1835 int *charbuf_base = charbuf; \
1837 int consumed_chars_limit; \
1838 int nbytes, nchars; \
1840 ONE_MORE_BYTE (c); \
1841 nbytes = c - 0xA0; \
1843 goto invalid_code; \
1844 ONE_MORE_BYTE (c); \
1845 nchars = c - 0xA0; \
1846 from = coding->produced + char_offset; \
1847 to = from + nchars; \
1848 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1849 consumed_chars_limit = consumed_chars_base + nbytes; \
1850 if (method != COMPOSITION_RELATIVE) \
1853 while (consumed_chars < consumed_chars_limit) \
1855 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1856 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1858 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1861 if (consumed_chars < consumed_chars_limit) \
1862 goto invalid_code; \
1863 charbuf_base[0] -= i; \
1868 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1870 /* Emacs 20 style format for relative composition. */ \
1871 /* Store multibyte form of characters to be composed. */ \
1872 enum composition_method method = COMPOSITION_RELATIVE; \
1873 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1874 int *buf = components; \
1879 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1880 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1881 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1883 goto invalid_code; \
1884 from = coding->produced_char + char_offset; \
1886 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1887 for (j = 0; j < i; j++) \
1888 *charbuf++ = components[j]; \
1892 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1894 /* Emacs 20 style format for rule-base composition. */ \
1895 /* Store multibyte form of characters to be composed. */ \
1896 enum composition_method method = COMPOSITION_WITH_RULE; \
1897 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1898 int *buf = components; \
1902 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1903 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1905 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1908 if (i < 1 || (buf - components) % 2 == 0) \
1909 goto invalid_code; \
1910 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1911 goto no_more_source; \
1912 from = coding->produced_char + char_offset; \
1914 ADD_COMPOSITION_DATA (buf, from, to, method); \
1915 for (j = 0; j < i; j++) \
1916 *charbuf++ = components[j]; \
1917 for (j = 0; j < i; j += 2) \
1918 *charbuf++ = components[j]; \
1923 decode_coding_emacs_mule (coding
)
1924 struct coding_system
*coding
;
1926 unsigned char *src
= coding
->source
+ coding
->consumed
;
1927 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
1928 unsigned char *src_base
;
1929 int *charbuf
= coding
->charbuf
;
1930 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
1931 int consumed_chars
= 0, consumed_chars_base
;
1932 int multibytep
= coding
->src_multibyte
;
1933 Lisp_Object attrs
, eol_type
, charset_list
;
1934 int char_offset
= coding
->produced_char
;
1935 int last_offset
= char_offset
;
1936 int last_id
= charset_ascii
;
1938 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
1945 consumed_chars_base
= consumed_chars
;
1947 if (charbuf
>= charbuf_end
)
1956 if (EQ (eol_type
, Qdos
))
1960 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
1961 goto no_more_source
;
1966 else if (EQ (eol_type
, Qmac
))
1975 if (c
- 0xF2 >= COMPOSITION_RELATIVE
1976 && c
- 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS
)
1977 DECODE_EMACS_MULE_21_COMPOSITION (c
);
1979 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c
);
1981 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c
);
1985 else if (c
< 0xA0 && emacs_mule_bytes
[c
] > 1)
1991 consumed_chars
= consumed_chars_base
;
1992 c
= emacs_mule_char (coding
, src
, &nbytes
, &nchars
, &id
);
2001 if (last_id
!= charset_ascii
)
2002 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2004 last_offset
= char_offset
;
2008 consumed_chars
+= nchars
;
2015 consumed_chars
= consumed_chars_base
;
2017 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
2023 if (last_id
!= charset_ascii
)
2024 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
2025 coding
->consumed_char
+= consumed_chars_base
;
2026 coding
->consumed
= src_base
- coding
->source
;
2027 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
2031 #define EMACS_MULE_LEADING_CODES(id, codes) \
2034 codes[0] = id, codes[1] = 0; \
2035 else if (id < 0xE0) \
2036 codes[0] = 0x9A, codes[1] = id; \
2037 else if (id < 0xF0) \
2038 codes[0] = 0x9B, codes[1] = id; \
2039 else if (id < 0xF5) \
2040 codes[0] = 0x9C, codes[1] = id; \
2042 codes[0] = 0x9D, codes[1] = id; \
2047 encode_coding_emacs_mule (coding
)
2048 struct coding_system
*coding
;
2050 int multibytep
= coding
->dst_multibyte
;
2051 int *charbuf
= coding
->charbuf
;
2052 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
2053 unsigned char *dst
= coding
->destination
+ coding
->produced
;
2054 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
2056 int produced_chars
= 0;
2057 Lisp_Object attrs
, eol_type
, charset_list
;
2059 int preferred_charset_id
= -1;
2061 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2063 while (charbuf
< charbuf_end
)
2065 ASSURE_DESTINATION (safe_room
);
2070 /* Handle an annotation. */
2073 case CODING_ANNOTATE_COMPOSITION_MASK
:
2074 /* Not yet implemented. */
2076 case CODING_ANNOTATE_CHARSET_MASK
:
2077 preferred_charset_id
= charbuf
[3];
2078 if (preferred_charset_id
>= 0
2079 && NILP (Fmemq (make_number (preferred_charset_id
),
2081 preferred_charset_id
= -1;
2090 if (ASCII_CHAR_P (c
))
2091 EMIT_ONE_ASCII_BYTE (c
);
2092 else if (CHAR_BYTE8_P (c
))
2094 c
= CHAR_TO_BYTE8 (c
);
2099 struct charset
*charset
;
2103 unsigned char leading_codes
[2];
2105 if (preferred_charset_id
>= 0)
2107 charset
= CHARSET_FROM_ID (preferred_charset_id
);
2108 if (! CHAR_CHARSET_P (c
, charset
))
2109 charset
= char_charset (c
, charset_list
, NULL
);
2112 charset
= char_charset (c
, charset_list
, &code
);
2115 c
= coding
->default_char
;
2116 if (ASCII_CHAR_P (c
))
2118 EMIT_ONE_ASCII_BYTE (c
);
2121 charset
= char_charset (c
, charset_list
, &code
);
2123 dimension
= CHARSET_DIMENSION (charset
);
2124 emacs_mule_id
= CHARSET_EMACS_MULE_ID (charset
);
2125 EMACS_MULE_LEADING_CODES (emacs_mule_id
, leading_codes
);
2126 EMIT_ONE_BYTE (leading_codes
[0]);
2127 if (leading_codes
[1])
2128 EMIT_ONE_BYTE (leading_codes
[1]);
2130 EMIT_ONE_BYTE (code
);
2133 EMIT_ONE_BYTE (code
>> 8);
2134 EMIT_ONE_BYTE (code
& 0xFF);
2138 coding
->result
= CODING_RESULT_SUCCESS
;
2139 coding
->produced_char
+= produced_chars
;
2140 coding
->produced
= dst
- coding
->destination
;
2145 /*** 7. ISO2022 handlers ***/
2147 /* The following note describes the coding system ISO2022 briefly.
2148 Since the intention of this note is to help understand the
2149 functions in this file, some parts are NOT ACCURATE or are OVERLY
2150 SIMPLIFIED. For thorough understanding, please refer to the
2151 original document of ISO2022. This is equivalent to the standard
2152 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2154 ISO2022 provides many mechanisms to encode several character sets
2155 in 7-bit and 8-bit environments. For 7-bit environments, all text
2156 is encoded using bytes less than 128. This may make the encoded
2157 text a little bit longer, but the text passes more easily through
2158 several types of gateway, some of which strip off the MSB (Most
2161 There are two kinds of character sets: control character sets and
2162 graphic character sets. The former contain control characters such
2163 as `newline' and `escape' to provide control functions (control
2164 functions are also provided by escape sequences). The latter
2165 contain graphic characters such as 'A' and '-'. Emacs recognizes
2166 two control character sets and many graphic character sets.
2168 Graphic character sets are classified into one of the following
2169 four classes, according to the number of bytes (DIMENSION) and
2170 number of characters in one dimension (CHARS) of the set:
2171 - DIMENSION1_CHARS94
2172 - DIMENSION1_CHARS96
2173 - DIMENSION2_CHARS94
2174 - DIMENSION2_CHARS96
2176 In addition, each character set is assigned an identification tag,
2177 unique for each set, called the "final character" (denoted as <F>
2178 hereafter). The <F> of each character set is decided by ECMA(*)
2179 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2180 (0x30..0x3F are for private use only).
2182 Note (*): ECMA = European Computer Manufacturers Association
2184 Here are examples of graphic character sets [NAME(<F>)]:
2185 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2186 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2187 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2188 o DIMENSION2_CHARS96 -- none for the moment
2190 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2191 C0 [0x00..0x1F] -- control character plane 0
2192 GL [0x20..0x7F] -- graphic character plane 0
2193 C1 [0x80..0x9F] -- control character plane 1
2194 GR [0xA0..0xFF] -- graphic character plane 1
2196 A control character set is directly designated and invoked to C0 or
2197 C1 by an escape sequence. The most common case is that:
2198 - ISO646's control character set is designated/invoked to C0, and
2199 - ISO6429's control character set is designated/invoked to C1,
2200 and usually these designations/invocations are omitted in encoded
2201 text. In a 7-bit environment, only C0 can be used, and a control
2202 character for C1 is encoded by an appropriate escape sequence to
2203 fit into the environment. All control characters for C1 are
2204 defined to have corresponding escape sequences.
2206 A graphic character set is at first designated to one of four
2207 graphic registers (G0 through G3), then these graphic registers are
2208 invoked to GL or GR. These designations and invocations can be
2209 done independently. The most common case is that G0 is invoked to
2210 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2211 these invocations and designations are omitted in encoded text.
2212 In a 7-bit environment, only GL can be used.
2214 When a graphic character set of CHARS94 is invoked to GL, codes
2215 0x20 and 0x7F of the GL area work as control characters SPACE and
2216 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2219 There are two ways of invocation: locking-shift and single-shift.
2220 With locking-shift, the invocation lasts until the next different
2221 invocation, whereas with single-shift, the invocation affects the
2222 following character only and doesn't affect the locking-shift
2223 state. Invocations are done by the following control characters or
2226 ----------------------------------------------------------------------
2227 abbrev function cntrl escape seq description
2228 ----------------------------------------------------------------------
2229 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2230 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2231 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2232 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2233 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2234 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2235 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2236 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2237 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2238 ----------------------------------------------------------------------
2239 (*) These are not used by any known coding system.
2241 Control characters for these functions are defined by macros
2242 ISO_CODE_XXX in `coding.h'.
2244 Designations are done by the following escape sequences:
2245 ----------------------------------------------------------------------
2246 escape sequence description
2247 ----------------------------------------------------------------------
2248 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2249 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2250 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2251 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2252 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2253 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2254 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2255 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2256 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2257 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2258 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2259 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2260 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2261 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2262 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2263 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2264 ----------------------------------------------------------------------
2266 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2267 of dimension 1, chars 94, and final character <F>, etc...
2269 Note (*): Although these designations are not allowed in ISO2022,
2270 Emacs accepts them on decoding, and produces them on encoding
2271 CHARS96 character sets in a coding system which is characterized as
2272 7-bit environment, non-locking-shift, and non-single-shift.
2274 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2275 '(' must be omitted. We refer to this as "short-form" hereafter.
2277 Now you may notice that there are a lot of ways of encoding the
2278 same multilingual text in ISO2022. Actually, there exist many
2279 coding systems such as Compound Text (used in X11's inter-client
2280 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2281 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2282 localized platforms), and all of these are variants of ISO2022.
2284 In addition to the above, Emacs handles two more kinds of escape
2285 sequences: ISO6429's direction specification and Emacs' private
2286 sequence for specifying character composition.
2288 ISO6429's direction specification takes the following form:
2289 o CSI ']' -- end of the current direction
2290 o CSI '0' ']' -- end of the current direction
2291 o CSI '1' ']' -- start of left-to-right text
2292 o CSI '2' ']' -- start of right-to-left text
2293 The control character CSI (0x9B: control sequence introducer) is
2294 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2296 Character composition specification takes the following form:
2297 o ESC '0' -- start relative composition
2298 o ESC '1' -- end composition
2299 o ESC '2' -- start rule-base composition (*)
2300 o ESC '3' -- start relative composition with alternate chars (**)
2301 o ESC '4' -- start rule-base composition with alternate chars (**)
2302 Since these are not standard escape sequences of any ISO standard,
2303 the use of them with these meanings is restricted to Emacs only.
2305 (*) This form is used only in Emacs 20.7 and older versions,
2306 but newer versions can safely decode it.
2307 (**) This form is used only in Emacs 21.1 and newer versions,
2308 and older versions can't decode it.
2310 Here's a list of example usages of these composition escape
2311 sequences (categorized by `enum composition_method').
2313 COMPOSITION_RELATIVE:
2314 ESC 0 CHAR [ CHAR ] ESC 1
2315 COMPOSITION_WITH_RULE:
2316 ESC 2 CHAR [ RULE CHAR ] ESC 1
2317 COMPOSITION_WITH_ALTCHARS:
2318 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE_ALTCHARS:
2320 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2322 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if charset ID does not exceed CODING->max_charset_id and
   its entry in CODING->safe_charsets is non-negative.  ID is
   evaluated twice and is not parenthesized in the subscript, so pass
   a simple expression without side effects.  */
2324 #define SAFE_CHARSET_P(coding, id) \
2325 ((id) <= (coding)->max_charset_id \
2326 && (coding)->safe_charsets[id] >= 0)
/* Nonzero if CODING_ISO_INITIAL, applied to the coding system stored
   in coding_categories[CATEGORY] with second argument 1, is
   non-negative.  NOTE(review): presumably the second argument selects
   graphic register G1, the one invoked by shift-out -- confirm
   against the definition of CODING_ISO_INITIAL.  */
2329 #define SHIFT_OUT_OK(category) \
2330 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2333 setup_iso_safe_charsets (attrs
)
2336 Lisp_Object charset_list
, safe_charsets
;
2337 Lisp_Object request
;
2338 Lisp_Object reg_usage
;
2341 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
2344 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
2345 if ((flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
2346 && ! EQ (charset_list
, Viso_2022_charset_list
))
2348 CODING_ATTR_CHARSET_LIST (attrs
)
2349 = charset_list
= Viso_2022_charset_list
;
2350 ASET (attrs
, coding_attr_safe_charsets
, Qnil
);
2353 if (STRINGP (AREF (attrs
, coding_attr_safe_charsets
)))
2357 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2359 int id
= XINT (XCAR (tail
));
2360 if (max_charset_id
< id
)
2361 max_charset_id
= id
;
2364 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
2366 request
= AREF (attrs
, coding_attr_iso_request
);
2367 reg_usage
= AREF (attrs
, coding_attr_iso_usage
);
2368 reg94
= XINT (XCAR (reg_usage
));
2369 reg96
= XINT (XCDR (reg_usage
));
2371 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
2375 struct charset
*charset
;
2378 charset
= CHARSET_FROM_ID (XINT (id
));
2379 reg
= Fcdr (Fassq (id
, request
));
2381 XSTRING (safe_charsets
)->data
[XINT (id
)] = XINT (reg
);
2382 else if (charset
->iso_chars_96
)
2385 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg96
;
2390 XSTRING (safe_charsets
)->data
[XINT (id
)] = reg94
;
2393 ASET (attrs
, coding_attr_safe_charsets
, safe_charsets
);
2397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2398 Check if a text is encoded in one of ISO-2022 based coding systems.
2399 If it is, return 1, else return 0. */
2402 detect_coding_iso_2022 (coding
, detect_info
)
2403 struct coding_system
*coding
;
2404 struct coding_detection_info
*detect_info
;
2406 unsigned char *src
= coding
->source
, *src_base
= src
;
2407 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2408 int multibytep
= coding
->src_multibyte
;
2409 int single_shifting
= 0;
2412 int consumed_chars
= 0;
2417 detect_info
->checked
|= CATEGORY_MASK_ISO
;
2419 for (i
= coding_category_iso_7
; i
<= coding_category_iso_8_else
; i
++)
2421 struct coding_system
*this = &(coding_categories
[i
]);
2422 Lisp_Object attrs
, val
;
2424 attrs
= CODING_ID_ATTRS (this->id
);
2425 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2426 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs
), Viso_2022_charset_list
))
2427 setup_iso_safe_charsets (attrs
);
2428 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
2429 this->max_charset_id
= XSTRING (val
)->size
- 1;
2430 this->safe_charsets
= (char *) XSTRING (val
)->data
;
2433 /* A coding system of this category is always ASCII compatible. */
2434 src
+= coding
->head_ascii
;
2436 while (rejected
!= CATEGORY_MASK_ISO
)
2442 if (inhibit_iso_escape_detection
)
2444 single_shifting
= 0;
2446 if (c
>= '(' && c
<= '/')
2448 /* Designation sequence for a charset of dimension 1. */
2450 if (c1
< ' ' || c1
>= 0x80
2451 || (id
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
2452 /* Invalid designation sequence. Just ignore. */
2457 /* Designation sequence for a charset of dimension 2. */
2459 if (c
>= '@' && c
<= 'B')
2460 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2461 id
= iso_charset_table
[1][0][c
];
2462 else if (c
>= '(' && c
<= '/')
2465 if (c1
< ' ' || c1
>= 0x80
2466 || (id
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
2467 /* Invalid designation sequence. Just ignore. */
2471 /* Invalid designation sequence. Just ignore it. */
2474 else if (c
== 'N' || c
== 'O')
2476 /* ESC <Fe> for SS2 or SS3. */
2477 single_shifting
= 1;
2478 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2481 else if (c
>= '0' && c
<= '4')
2483 /* ESC <Fp> for start/end composition. */
2484 found
|= CATEGORY_MASK_ISO
;
2489 /* Invalid escape sequence. Just ignore it. */
2493 /* We found a valid designation sequence for CHARSET. */
2494 rejected
|= CATEGORY_MASK_ISO_8BIT
;
2495 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7
],
2497 found
|= CATEGORY_MASK_ISO_7
;
2499 rejected
|= CATEGORY_MASK_ISO_7
;
2500 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_tight
],
2502 found
|= CATEGORY_MASK_ISO_7_TIGHT
;
2504 rejected
|= CATEGORY_MASK_ISO_7_TIGHT
;
2505 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_7_else
],
2507 found
|= CATEGORY_MASK_ISO_7_ELSE
;
2509 rejected
|= CATEGORY_MASK_ISO_7_ELSE
;
2510 if (SAFE_CHARSET_P (&coding_categories
[coding_category_iso_8_else
],
2512 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2514 rejected
|= CATEGORY_MASK_ISO_8_ELSE
;
2519 /* Locking shift out/in. */
2520 if (inhibit_iso_escape_detection
)
2522 single_shifting
= 0;
2523 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_8BIT
;
2524 found
|= CATEGORY_MASK_ISO_ELSE
;
2528 /* Control sequence introducer. */
2529 single_shifting
= 0;
2530 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2531 found
|= CATEGORY_MASK_ISO_8_ELSE
;
2532 goto check_extra_latin
;
2538 if (inhibit_iso_escape_detection
)
2540 single_shifting
= 1;
2541 rejected
|= CATEGORY_MASK_ISO_7BIT
;
2542 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2543 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2544 found
|= CATEGORY_MASK_ISO_8_1
;
2545 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2546 & CODING_ISO_FLAG_SINGLE_SHIFT
)
2547 found
|= CATEGORY_MASK_ISO_8_2
;
2548 goto check_extra_latin
;
2553 single_shifting
= 0;
2558 rejected
|= CATEGORY_MASK_ISO_7BIT
| CATEGORY_MASK_ISO_7_ELSE
;
2559 found
|= CATEGORY_MASK_ISO_8_1
;
2560 /* Check the length of succeeding codes of the range
2561 0xA0..0xFF. If the byte length is even, we include
2562 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2563 only when we are not single shifting. */
2564 if (! single_shifting
2565 && ! (rejected
& CATEGORY_MASK_ISO_8_2
))
2568 while (src
< src_end
)
2576 if (i
& 1 && src
< src_end
)
2577 rejected
|= CATEGORY_MASK_ISO_8_2
;
2579 found
|= CATEGORY_MASK_ISO_8_2
;
2584 single_shifting
= 0;
2585 if (! VECTORP (Vlatin_extra_code_table
)
2586 || NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2588 rejected
= CATEGORY_MASK_ISO
;
2591 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_1
])
2592 & CODING_ISO_FLAG_LATIN_EXTRA
)
2593 found
|= CATEGORY_MASK_ISO_8_1
;
2595 rejected
|= CATEGORY_MASK_ISO_8_1
;
2596 if (CODING_ISO_FLAGS (&coding_categories
[coding_category_iso_8_2
])
2597 & CODING_ISO_FLAG_LATIN_EXTRA
)
2598 found
|= CATEGORY_MASK_ISO_8_2
;
2600 rejected
|= CATEGORY_MASK_ISO_8_2
;
2603 detect_info
->rejected
|= CATEGORY_MASK_ISO
;
2607 detect_info
->rejected
|= rejected
;
2608 detect_info
->found
|= (found
& ~rejected
);
2613 /* Set designation state into CODING. */
2614 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2618 if (final < '0' || final >= 128 \
2619 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2620 || !SAFE_CHARSET_P (coding, id)) \
2622 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2623 goto invalid_code; \
2625 prev = CODING_ISO_DESIGNATION (coding, reg); \
2626 if (id == charset_jisx0201_roman) \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2629 id = charset_ascii; \
2631 else if (id == charset_jisx0208_1978) \
2633 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2634 id = charset_jisx0208; \
2636 CODING_ISO_DESIGNATION (coding, reg) = id; \
2637 /* If there was an invalid designation to REG previously, and this \
2638 designation is ASCII to REG, we should keep this designation \
2640 if (prev == -2 && id == charset_ascii) \
2641 goto invalid_code; \
2645 #define MAYBE_FINISH_COMPOSITION() \
2648 if (composition_state == COMPOSING_NO) \
2650 /* It is assured that we have enough room for producing \
2651 characters stored in the table `components'. */ \
2652 if (charbuf + component_idx > charbuf_end) \
2653 goto no_more_source; \
2654 composition_state = COMPOSING_NO; \
2655 if (method == COMPOSITION_RELATIVE \
2656 || method == COMPOSITION_WITH_ALTCHARS) \
2658 for (i = 0; i < component_idx; i++) \
2659 *charbuf++ = components[i]; \
2660 char_offset += component_idx; \
2664 for (i = 0; i < component_idx; i += 2) \
2665 *charbuf++ = components[i]; \
2666 char_offset += (component_idx / 2) + 1; \
2671 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2672 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2673 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2674 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2675 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2678 #define DECODE_COMPOSITION_START(c1) \
2681 && composition_state == COMPOSING_COMPONENT_RULE) \
2683 component_len = component_idx; \
2684 composition_state = COMPOSING_CHAR; \
2690 MAYBE_FINISH_COMPOSITION (); \
2691 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2692 goto no_more_source; \
2693 for (p = src; p < src_end - 1; p++) \
2694 if (*p == ISO_CODE_ESC && p[1] == '1') \
2696 if (p == src_end - 1) \
2698 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2699 goto invalid_code; \
2700 goto no_more_source; \
2703 /* This is surely the start of a composition. */ \
2704 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2705 : c1 == '2' ? COMPOSITION_WITH_RULE \
2706 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2707 : COMPOSITION_WITH_RULE_ALTCHARS); \
2708 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2709 : COMPOSING_COMPONENT_CHAR); \
2710 component_idx = component_len = 0; \
2715 /* Handle composition end sequence ESC 1. */
2717 #define DECODE_COMPOSITION_END() \
2719 int nchars = (component_len > 0 ? component_idx - component_len \
2720 : method == COMPOSITION_RELATIVE ? component_idx \
2721 : (component_idx + 1) / 2); \
2723 int *saved_charbuf = charbuf; \
2724 int from = coding->produced_char + char_offset; \
2725 int to = from + nchars; \
2727 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2728 if (method != COMPOSITION_RELATIVE) \
2730 if (component_len == 0) \
2731 for (i = 0; i < component_idx; i++) \
2732 *charbuf++ = components[i]; \
2734 for (i = 0; i < component_len; i++) \
2735 *charbuf++ = components[i]; \
2736 *saved_charbuf = saved_charbuf - charbuf; \
2738 if (method == COMPOSITION_WITH_RULE) \
2739 for (i = 0; i < component_idx; i += 2, char_offset++) \
2740 *charbuf++ = components[i]; \
2742 for (i = component_len; i < component_idx; i++, char_offset++) \
2743 *charbuf++ = components[i]; \
2744 coding->annotated = 1; \
2745 composition_state = COMPOSING_NO; \
2749 /* Decode a composition rule from the byte C1 (and maybe one more byte
2750 from SRC) and store one encoded composition rule in
2751 coding->cmp_data. */
2753 #define DECODE_COMPOSITION_RULE(c1) \
2756 if (c1 < 81) /* old format (before ver.21) */ \
2758 int gref = (c1) / 9; \
2759 int nref = (c1) % 9; \
2760 if (gref == 4) gref = 10; \
2761 if (nref == 4) nref = 10; \
2762 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2764 else if (c1 < 93) /* new format (after ver.21) */ \
2766 ONE_MORE_BYTE (c2); \
2767 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2777 decode_coding_iso_2022 (coding
)
2778 struct coding_system
*coding
;
2780 unsigned char *src
= coding
->source
+ coding
->consumed
;
2781 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
2782 unsigned char *src_base
;
2783 int *charbuf
= coding
->charbuf
;
2785 = charbuf
+ coding
->charbuf_size
- 4 - MAX_ANNOTATION_LENGTH
;
2786 int consumed_chars
= 0, consumed_chars_base
;
2787 int multibytep
= coding
->src_multibyte
;
2788 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2789 int charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2790 int charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2791 struct charset
*charset
;
2793 /* For handling composition sequence. */
2794 #define COMPOSING_NO 0
2795 #define COMPOSING_CHAR 1
2796 #define COMPOSING_RULE 2
2797 #define COMPOSING_COMPONENT_CHAR 3
2798 #define COMPOSING_COMPONENT_RULE 4
2800 int composition_state
= COMPOSING_NO
;
2801 enum composition_method method
;
2802 int components
[MAX_COMPOSITION_COMPONENTS
* 2 + 1];
2805 Lisp_Object attrs
, eol_type
, charset_list
;
2806 int char_offset
= coding
->produced_char
;
2807 int last_offset
= char_offset
;
2808 int last_id
= charset_ascii
;
2810 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
2811 setup_iso_safe_charsets (attrs
);
2818 consumed_chars_base
= consumed_chars
;
2820 if (charbuf
>= charbuf_end
)
2825 /* We produce at most one character. */
2826 switch (iso_code_class
[c1
])
2828 case ISO_0x20_or_0x7F
:
2829 if (composition_state
!= COMPOSING_NO
)
2831 if (composition_state
== COMPOSING_RULE
2832 || composition_state
== COMPOSING_COMPONENT_RULE
)
2834 DECODE_COMPOSITION_RULE (c1
);
2835 components
[component_idx
++] = c1
;
2836 composition_state
--;
2840 if (charset_id_0
< 0
2841 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0
)))
2842 /* This is SPACE or DEL. */
2843 charset
= CHARSET_FROM_ID (charset_ascii
);
2845 charset
= CHARSET_FROM_ID (charset_id_0
);
2848 case ISO_graphic_plane_0
:
2849 if (composition_state
!= COMPOSING_NO
)
2851 if (composition_state
== COMPOSING_RULE
2852 || composition_state
== COMPOSING_COMPONENT_RULE
)
2854 DECODE_COMPOSITION_RULE (c1
);
2855 components
[component_idx
++] = c1
;
2856 composition_state
--;
2860 charset
= CHARSET_FROM_ID (charset_id_0
);
2863 case ISO_0xA0_or_0xFF
:
2864 if (charset_id_1
< 0
2865 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1
))
2866 || CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SEVEN_BITS
)
2868 /* This is a graphic character, we fall down ... */
2870 case ISO_graphic_plane_1
:
2871 if (charset_id_1
< 0)
2873 charset
= CHARSET_FROM_ID (charset_id_1
);
2876 case ISO_carriage_return
:
2879 if (EQ (eol_type
, Qdos
))
2883 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
2884 goto no_more_source
;
2889 else if (EQ (eol_type
, Qmac
))
2895 MAYBE_FINISH_COMPOSITION ();
2896 charset
= CHARSET_FROM_ID (charset_ascii
);
2900 MAYBE_FINISH_COMPOSITION ();
2904 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2905 || CODING_ISO_DESIGNATION (coding
, 1) < 0)
2907 CODING_ISO_INVOCATION (coding
, 0) = 1;
2908 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2912 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
))
2914 CODING_ISO_INVOCATION (coding
, 0) = 0;
2915 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2918 case ISO_single_shift_2_7
:
2919 case ISO_single_shift_2
:
2920 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2922 /* SS2 is handled as an escape sequence of ESC 'N' */
2924 goto label_escape_sequence
;
2926 case ISO_single_shift_3
:
2927 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
))
2929 /* SS3 is handled as an escape sequence of ESC 'O' */
2931 goto label_escape_sequence
;
2933 case ISO_control_sequence_introducer
:
2934 /* CSI is handled as an escape sequence of ESC '[' ... */
2936 goto label_escape_sequence
;
2940 label_escape_sequence
:
2941 /* Escape sequences handled here are invocation,
2942 designation, direction specification, and character
2943 composition specification. */
2946 case '&': /* revision of following character set */
2948 if (!(c1
>= '@' && c1
<= '~'))
2951 if (c1
!= ISO_CODE_ESC
)
2954 goto label_escape_sequence
;
2956 case '$': /* designation of 2-byte character set */
2957 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
2960 if (c1
>= '@' && c1
<= 'B')
2961 { /* designation of JISX0208.1978, GB2312.1980,
2963 DECODE_DESIGNATION (0, 2, 0, c1
);
2965 else if (c1
>= 0x28 && c1
<= 0x2B)
2966 { /* designation of DIMENSION2_CHARS94 character set */
2968 DECODE_DESIGNATION (c1
- 0x28, 2, 0, c2
);
2970 else if (c1
>= 0x2C && c1
<= 0x2F)
2971 { /* designation of DIMENSION2_CHARS96 character set */
2973 DECODE_DESIGNATION (c1
- 0x2C, 2, 1, c2
);
2977 /* We must update these variables now. */
2978 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2979 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
2982 case 'n': /* invocation of locking-shift-2 */
2983 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2984 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
2986 CODING_ISO_INVOCATION (coding
, 0) = 2;
2987 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2990 case 'o': /* invocation of locking-shift-3 */
2991 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_LOCKING_SHIFT
)
2992 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
2994 CODING_ISO_INVOCATION (coding
, 0) = 3;
2995 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
2998 case 'N': /* invocation of single-shift-2 */
2999 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3000 || CODING_ISO_DESIGNATION (coding
, 2) < 0)
3002 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 2));
3004 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3008 case 'O': /* invocation of single-shift-3 */
3009 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3010 || CODING_ISO_DESIGNATION (coding
, 3) < 0)
3012 charset
= CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding
, 3));
3014 if (c1
< 0x20 || (c1
>= 0x80 && c1
< 0xA0))
3018 case '0': case '2': case '3': case '4': /* start composition */
3019 if (! (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
))
3021 DECODE_COMPOSITION_START (c1
);
3024 case '1': /* end composition */
3025 if (composition_state
== COMPOSING_NO
)
3027 DECODE_COMPOSITION_END ();
3030 case '[': /* specification of direction */
3031 if (! CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DIRECTION
)
3033 /* For the moment, nested direction is not supported.
3034 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3035 left-to-right, and nonzero means right-to-left. */
3039 case ']': /* end of the current direction */
3040 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3042 case '0': /* end of the current direction */
3043 case '1': /* start of left-to-right direction */
3046 coding
->mode
&= ~CODING_MODE_DIRECTION
;
3051 case '2': /* start of right-to-left direction */
3054 coding
->mode
|= CODING_MODE_DIRECTION
;
3068 /* CTEXT extended segment:
3069 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3070 We keep these bytes as is for the moment.
3071 They may be decoded by post-read-conversion. */
3075 ONE_MORE_BYTE (dim
);
3078 size
= ((M
- 128) * 128) + (L
- 128);
3079 if (charbuf
+ 8 + size
> charbuf_end
)
3081 *charbuf
++ = ISO_CODE_ESC
;
3085 *charbuf
++ = BYTE8_TO_CHAR (M
);
3086 *charbuf
++ = BYTE8_TO_CHAR (L
);
3090 *charbuf
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3095 /* XFree86 extension for embedding UTF-8 in CTEXT:
3096 ESC % G --UTF-8-BYTES-- ESC % @
3097 We keep these bytes as is for the moment.
3098 They may be decoded by post-read-conversion. */
3101 if (p
+ 6 > charbuf_end
)
3103 *p
++ = ISO_CODE_ESC
;
3106 while (p
< charbuf_end
)
3109 if (c1
== ISO_CODE_ESC
3110 && src
+ 1 < src_end
3114 *p
++ = ASCII_BYTE_P (c1
) ? c1
: BYTE8_TO_CHAR (c1
);
3116 if (p
+ 3 > charbuf_end
)
3118 *p
++ = ISO_CODE_ESC
;
3129 if (! (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATION
))
3131 if (c1
>= 0x28 && c1
<= 0x2B)
3132 { /* designation of DIMENSION1_CHARS94 character set */
3134 DECODE_DESIGNATION (c1
- 0x28, 1, 0, c2
);
3136 else if (c1
>= 0x2C && c1
<= 0x2F)
3137 { /* designation of DIMENSION1_CHARS96 character set */
3139 DECODE_DESIGNATION (c1
- 0x2C, 1, 1, c2
);
3143 /* We must update these variables now. */
3144 charset_id_0
= CODING_ISO_INVOKED_CHARSET (coding
, 0);
3145 charset_id_1
= CODING_ISO_INVOKED_CHARSET (coding
, 1);
3150 if (charset
->id
!= charset_ascii
3151 && last_id
!= charset
->id
)
3153 if (last_id
!= charset_ascii
)
3154 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3155 last_id
= charset
->id
;
3156 last_offset
= char_offset
;
3159 /* Now we know CHARSET and 1st position code C1 of a character.
3160 Produce a decoded character while getting 2nd position code
3163 if (CHARSET_DIMENSION (charset
) > 1)
3166 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3167 /* C2 is not in a valid range. */
3169 c1
= (c1
<< 8) | (c2
& 0x7F);
3170 if (CHARSET_DIMENSION (charset
) > 2)
3173 if (c2
< 0x20 || (c2
>= 0x80 && c2
< 0xA0))
3174 /* C2 is not in a valid range. */
3176 c1
= (c1
<< 8) | (c2
& 0x7F);
3180 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c1
, c
);
3183 MAYBE_FINISH_COMPOSITION ();
3184 for (; src_base
< src
; src_base
++, char_offset
++)
3186 if (ASCII_BYTE_P (*src_base
))
3187 *charbuf
++ = *src_base
;
3189 *charbuf
++ = BYTE8_TO_CHAR (*src_base
);
3192 else if (composition_state
== COMPOSING_NO
)
3199 components
[component_idx
++] = c
;
3200 if (method
== COMPOSITION_WITH_RULE
3201 || (method
== COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state
== COMPOSING_COMPONENT_CHAR
))
3203 composition_state
++;
3208 MAYBE_FINISH_COMPOSITION ();
3210 consumed_chars
= consumed_chars_base
;
3212 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
3222 if (last_id
!= charset_ascii
)
3223 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
3224 coding
->consumed_char
+= consumed_chars_base
;
3225 coding
->consumed
= src_base
- coding
->source
;
3226 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
3230 /* ISO2022 encoding stuff. */
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JIS0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3267 if (revision >= 0) \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
3304 #define ENCODE_SINGLE_SHIFT_2 \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3314 #define ENCODE_SINGLE_SHIFT_3 \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
/* The following four macros produce the code (control character or
   escape sequence) for an ISO2022 locking-shift function and record
   in CODING which graphic register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN		SI	 register 0
     ENCODE_SHIFT_OUT		SO	 register 1
     ENCODE_LOCKING_SHIFT_2	ESC 'n'	 register 2
     ENCODE_LOCKING_SHIFT_3	ESC 'o'	 register 3

   They expand to a single statement and expect `coding', plus
   whatever EMIT_ONE_ASCII_BYTE and EMIT_TWO_ASCII_BYTES refer to, to
   be in scope at the point of use.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);			\
    CODING_ISO_INVOCATION (coding, 0) = 0;		\
  } while (0)


#define ENCODE_SHIFT_OUT				\
  do {							\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);			\
    CODING_ISO_INVOCATION (coding, 0) = 1;		\
  } while (0)


#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');		\
    CODING_ISO_INVOCATION (coding, 0) = 2;		\
  } while (0)


/* The final byte must be 'o'.  ESC 'n' is locking-shift-2, and the
   decoder in this file invokes register 3 only on ESC 'o'; emitting
   'n' here would make the decoded text use register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');		\
    CODING_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3362 int id = CHARSET_ID (charset); \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3406 int id = CHARSET_ID (charset); \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3439 dst = encode_invocation_designation (charset, coding, dst, \
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3460 encode_invocation_designation (charset
, coding
, dst
, p_nchars
)
3461 struct charset
*charset
;
3462 struct coding_system
*coding
;
3466 int multibytep
= coding
->dst_multibyte
;
3467 int produced_chars
= *p_nchars
;
3468 int reg
; /* graphic register number */
3469 int id
= CHARSET_ID (charset
);
3471 /* At first, check designations. */
3472 for (reg
= 0; reg
< 4; reg
++)
3473 if (id
== CODING_ISO_DESIGNATION (coding
, reg
))
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg
= CODING_ISO_REQUEST (coding
, id
);
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3486 ENCODE_DESIGNATION (charset
, reg
, coding
);
3489 if (CODING_ISO_INVOCATION (coding
, 0) != reg
3490 && CODING_ISO_INVOCATION (coding
, 1) != reg
)
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3496 case 0: /* graphic register 0 */
3500 case 1: /* graphic register 1 */
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3506 ENCODE_SINGLE_SHIFT_2
;
3508 ENCODE_LOCKING_SHIFT_2
;
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_SINGLE_SHIFT
)
3513 ENCODE_SINGLE_SHIFT_3
;
3515 ENCODE_LOCKING_SHIFT_3
;
3520 *p_nchars
= produced_chars
;
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: the two bytes ESC '[' when CODING has the flag
   CODING_ISO_FLAG_SEVEN_BITS, otherwise the single byte CSI.  The
   flag is tested with `&' like every other flag test in this file;
   comparing the whole flag word with `==' would select the 8-bit
   form whenever any other flag is also set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by '2' ']' and '0' ']' respectively.  The introducer macro
   takes no argument, so it is used here without one.

   All three expand to a single statement and expect `coding', plus
   whatever EMIT_ONE_BYTE and EMIT_TWO_ASCII_BYTES refer to, to be in
   scope at the point of use.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)		\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)


#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)


#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3554 struct charset *charset; \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
3575 static unsigned char *
3576 encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
)
3577 struct coding_system
*coding
;
3578 int *charbuf
, *charbuf_end
;
3581 struct charset
*charset
;
3582 /* Table of charsets to be designated to each graphic register. */
3584 int c
, found
= 0, reg
;
3585 int produced_chars
= 0;
3586 int multibytep
= coding
->dst_multibyte
;
3588 Lisp_Object charset_list
;
3590 attrs
= CODING_ID_ATTRS (coding
->id
);
3591 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
3592 if (EQ (charset_list
, Qiso_2022
))
3593 charset_list
= Viso_2022_charset_list
;
3595 for (reg
= 0; reg
< 4; reg
++)
3605 charset
= char_charset (c
, charset_list
, NULL
);
3606 id
= CHARSET_ID (charset
);
3607 reg
= CODING_ISO_REQUEST (coding
, id
);
3608 if (reg
>= 0 && r
[reg
] < 0)
3617 for (reg
= 0; reg
< 4; reg
++)
3619 && CODING_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r
[reg
]), reg
, coding
);
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3629 encode_coding_iso_2022 (coding
)
3630 struct coding_system
*coding
;
3632 int multibytep
= coding
->dst_multibyte
;
3633 int *charbuf
= coding
->charbuf
;
3634 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
3635 unsigned char *dst
= coding
->destination
+ coding
->produced
;
3636 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
3639 = (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding
));
3641 int produced_chars
= 0;
3642 Lisp_Object attrs
, eol_type
, charset_list
;
3643 int ascii_compatible
;
3645 int preferred_charset_id
= -1;
3647 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3648 setup_iso_safe_charsets (attrs
);
3649 /* Charset list may have been changed. */
3650 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
); \
3651 coding
->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs
))->data
;
3654 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
3656 while (charbuf
< charbuf_end
)
3658 ASSURE_DESTINATION (safe_room
);
3660 if (bol_designation
)
3662 unsigned char *dst_prev
= dst
;
3664 /* We have to produce designation sequences if any now. */
3665 dst
= encode_designation_at_bol (coding
, charbuf
, charbuf_end
, dst
);
3666 bol_designation
= 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars
+= dst
- dst_prev
;
3675 /* Handle an annotation. */
3678 case CODING_ANNOTATE_COMPOSITION_MASK
:
3679 /* Not yet implemented. */
3681 case CODING_ANNOTATE_CHARSET_MASK
:
3682 preferred_charset_id
= charbuf
[3];
3683 if (preferred_charset_id
>= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id
),
3686 preferred_charset_id
= -1;
3695 /* Now encode the character C. */
3696 if (c
< 0x20 || c
== 0x7F)
3699 || (c
== '\r' && EQ (eol_type
, Qmac
)))
3701 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_INIT_AT_BOL
)
3707 for (i
= 0; i
< 4; i
++)
3708 CODING_ISO_DESIGNATION (coding
, i
)
3709 = CODING_ISO_INITIAL (coding
, i
);
3712 = CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
;
3714 else if (CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_CNTL
)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c
);
3718 else if (ASCII_CHAR_P (c
))
3720 if (ascii_compatible
)
3721 EMIT_ONE_ASCII_BYTE (c
);
3724 struct charset
*charset
= CHARSET_FROM_ID (charset_ascii
);
3725 ENCODE_ISO_CHARACTER (charset
, c
);
3728 else if (CHAR_BYTE8_P (c
))
3730 c
= CHAR_TO_BYTE8 (c
);
3735 struct charset
*charset
;
3737 if (preferred_charset_id
>= 0)
3739 charset
= CHARSET_FROM_ID (preferred_charset_id
);
3740 if (! CHAR_CHARSET_P (c
, charset
))
3741 charset
= char_charset (c
, charset_list
, NULL
);
3744 charset
= char_charset (c
, charset_list
, NULL
);
3747 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
3749 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
3750 charset
= CHARSET_FROM_ID (charset_ascii
);
3754 c
= coding
->default_char
;
3755 charset
= char_charset (c
, charset_list
, NULL
);
3758 ENCODE_ISO_CHARACTER (charset
, c
);
3762 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding
) & CODING_ISO_FLAG_RESET_AT_EOL
)
3765 ASSURE_DESTINATION (safe_room
);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3768 coding
->result
= CODING_RESULT_SUCCESS
;
3769 CODING_ISO_BOL (coding
) = bol_designation
;
3770 coding
->produced_char
+= produced_chars
;
3771 coding
->produced
= dst
- coding
->destination
;
3776 /*** 8. SJIS and BIG5 handlers ***/
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779 quite widely. So, for the moment, Emacs supports them in the bare
3780 C code. But, in the future, they may be supported only by CCL. */
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in 2-byte but two position-codes are divided and shifted
3787 so that they fit in the range below.
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and is encoded in two-byte.
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3817 detect_coding_sjis (coding
, detect_info
)
3818 struct coding_system
*coding
;
3819 struct coding_detection_info
*detect_info
;
3821 unsigned char *src
= coding
->source
, *src_base
= src
;
3822 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3823 int multibytep
= coding
->src_multibyte
;
3824 int consumed_chars
= 0;
3829 detect_info
->checked
|= CATEGORY_MASK_SJIS
;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src
+= coding
->head_ascii
;
3840 if ((c
>= 0x81 && c
<= 0x9F) || (c
>= 0xE0 && c
<= 0xEF))
3843 if (c
< 0x40 || c
== 0x7F || c
> 0xFC)
3845 found
= CATEGORY_MASK_SJIS
;
3847 else if (c
>= 0xA0 && c
< 0xE0)
3848 found
= CATEGORY_MASK_SJIS
;
3852 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3856 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3858 detect_info
->rejected
|= CATEGORY_MASK_SJIS
;
3861 detect_info
->found
|= found
;
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3870 detect_coding_big5 (coding
, detect_info
)
3871 struct coding_system
*coding
;
3872 struct coding_detection_info
*detect_info
;
3874 unsigned char *src
= coding
->source
, *src_base
= src
;
3875 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3876 int multibytep
= coding
->src_multibyte
;
3877 int consumed_chars
= 0;
3882 detect_info
->checked
|= CATEGORY_MASK_BIG5
;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src
+= coding
->head_ascii
;
3896 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
3898 found
= CATEGORY_MASK_BIG5
;
3903 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3907 if (incomplete
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
3909 detect_info
->rejected
|= CATEGORY_MASK_BIG5
;
3912 detect_info
->found
|= found
;
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3920 decode_coding_sjis (coding
)
3921 struct coding_system
*coding
;
3923 unsigned char *src
= coding
->source
+ coding
->consumed
;
3924 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
3925 unsigned char *src_base
;
3926 int *charbuf
= coding
->charbuf
;
3927 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
3928 int consumed_chars
= 0, consumed_chars_base
;
3929 int multibytep
= coding
->src_multibyte
;
3930 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
3931 Lisp_Object attrs
, eol_type
, charset_list
, val
;
3932 int char_offset
= coding
->produced_char
;
3933 int last_offset
= char_offset
;
3934 int last_id
= charset_ascii
;
3936 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
3939 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3940 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
3941 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
3948 consumed_chars_base
= consumed_chars
;
3950 if (charbuf
>= charbuf_end
)
3957 if (EQ (eol_type
, Qdos
))
3961 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
3962 goto no_more_source
;
3967 else if (EQ (eol_type
, Qmac
))
3972 struct charset
*charset
;
3975 charset
= charset_roman
;
3980 if (c
< 0xA0 || c
>= 0xE0)
3982 /* SJIS -> JISX0208 */
3984 if (c1
< 0x40 || c1
== 0x7F || c1
> 0xFC)
3988 charset
= charset_kanji
;
3992 /* SJIS -> JISX0201-Kana */
3994 charset
= charset_kana
;
3999 if (charset
->id
!= charset_ascii
4000 && last_id
!= charset
->id
)
4002 if (last_id
!= charset_ascii
)
4003 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4004 last_id
= charset
->id
;
4005 last_offset
= char_offset
;
4007 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4015 consumed_chars
= consumed_chars_base
;
4017 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4023 if (last_id
!= charset_ascii
)
4024 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4025 coding
->consumed_char
+= consumed_chars_base
;
4026 coding
->consumed
= src_base
- coding
->source
;
4027 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4031 decode_coding_big5 (coding
)
4032 struct coding_system
*coding
;
4034 unsigned char *src
= coding
->source
+ coding
->consumed
;
4035 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4036 unsigned char *src_base
;
4037 int *charbuf
= coding
->charbuf
;
4038 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4039 int consumed_chars
= 0, consumed_chars_base
;
4040 int multibytep
= coding
->src_multibyte
;
4041 struct charset
*charset_roman
, *charset_big5
;
4042 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4043 int char_offset
= coding
->produced_char
;
4044 int last_offset
= char_offset
;
4045 int last_id
= charset_ascii
;
4047 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4049 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4050 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4057 consumed_chars_base
= consumed_chars
;
4059 if (charbuf
>= charbuf_end
)
4066 if (EQ (eol_type
, Qdos
))
4070 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4071 goto no_more_source
;
4076 else if (EQ (eol_type
, Qmac
))
4081 struct charset
*charset
;
4083 charset
= charset_roman
;
4087 if (c
< 0xA1 || c
> 0xFE)
4090 if (c1
< 0x40 || (c1
> 0x7E && c1
< 0xA1) || c1
> 0xFE)
4093 charset
= charset_big5
;
4095 if (charset
->id
!= charset_ascii
4096 && last_id
!= charset
->id
)
4098 if (last_id
!= charset_ascii
)
4099 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4100 last_id
= charset
->id
;
4101 last_offset
= char_offset
;
4103 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
, charset
, c
, c
);
4112 consumed_chars
= consumed_chars_base
;
4114 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4120 if (last_id
!= charset_ascii
)
4121 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4122 coding
->consumed_char
+= consumed_chars_base
;
4123 coding
->consumed
= src_base
- coding
->source
;
4124 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4127 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4128 This function can encode charsets `ascii', `katakana-jisx0201',
4129 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4130 are sure that all these charsets are registered as official charset
4131 (i.e. do not have extended leading-codes). Characters of other
4132 charsets are produced without any encoding. If SJIS_P is 1, encode
4133 SJIS text, else encode BIG5 text. */
4136 encode_coding_sjis (coding
)
4137 struct coding_system
*coding
;
4139 int multibytep
= coding
->dst_multibyte
;
4140 int *charbuf
= coding
->charbuf
;
4141 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4142 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4143 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4145 int produced_chars
= 0;
4146 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4147 int ascii_compatible
;
4148 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
;
4151 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4153 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4154 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4155 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4157 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4159 while (charbuf
< charbuf_end
)
4161 ASSURE_DESTINATION (safe_room
);
4163 /* Now encode the character C. */
4164 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4165 EMIT_ONE_ASCII_BYTE (c
);
4166 else if (CHAR_BYTE8_P (c
))
4168 c
= CHAR_TO_BYTE8 (c
);
4174 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4178 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4180 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4181 charset
= CHARSET_FROM_ID (charset_ascii
);
4185 c
= coding
->default_char
;
4186 charset
= char_charset (c
, charset_list
, &code
);
4189 if (code
== CHARSET_INVALID_CODE (charset
))
4191 if (charset
== charset_kanji
)
4195 c1
= code
>> 8, c2
= code
& 0xFF;
4196 EMIT_TWO_BYTES (c1
, c2
);
4198 else if (charset
== charset_kana
)
4199 EMIT_ONE_BYTE (code
| 0x80);
4201 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4204 coding
->result
= CODING_RESULT_SUCCESS
;
4205 coding
->produced_char
+= produced_chars
;
4206 coding
->produced
= dst
- coding
->destination
;
4211 encode_coding_big5 (coding
)
4212 struct coding_system
*coding
;
4214 int multibytep
= coding
->dst_multibyte
;
4215 int *charbuf
= coding
->charbuf
;
4216 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4217 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4218 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4220 int produced_chars
= 0;
4221 Lisp_Object attrs
, eol_type
, charset_list
, val
;
4222 int ascii_compatible
;
4223 struct charset
*charset_roman
, *charset_big5
;
4226 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4228 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
4229 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
4230 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4232 while (charbuf
< charbuf_end
)
4234 ASSURE_DESTINATION (safe_room
);
4236 /* Now encode the character C. */
4237 if (ASCII_CHAR_P (c
) && ascii_compatible
)
4238 EMIT_ONE_ASCII_BYTE (c
);
4239 else if (CHAR_BYTE8_P (c
))
4241 c
= CHAR_TO_BYTE8 (c
);
4247 struct charset
*charset
= char_charset (c
, charset_list
, &code
);
4251 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4253 code
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4254 charset
= CHARSET_FROM_ID (charset_ascii
);
4258 c
= coding
->default_char
;
4259 charset
= char_charset (c
, charset_list
, &code
);
4262 if (code
== CHARSET_INVALID_CODE (charset
))
4264 if (charset
== charset_big5
)
4268 c1
= code
>> 8, c2
= code
& 0xFF;
4269 EMIT_TWO_BYTES (c1
, c2
);
4272 EMIT_ONE_ASCII_BYTE (code
& 0x7F);
4275 coding
->result
= CODING_RESULT_SUCCESS
;
4276 coding
->produced_char
+= produced_chars
;
4277 coding
->produced
= dst
- coding
->destination
;
4282 /*** 9. CCL handlers ***/
4284 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4285 Check if a text is encoded in a coding system of which
4286 encoder/decoder are written in CCL program. If it is, return
4287 CATEGORY_MASK_CCL, else return 0. */
4290 detect_coding_ccl (coding
, detect_info
)
4291 struct coding_system
*coding
;
4292 struct coding_detection_info
*detect_info
;
4294 unsigned char *src
= coding
->source
, *src_base
= src
;
4295 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4296 int multibytep
= coding
->src_multibyte
;
4297 int consumed_chars
= 0;
4299 unsigned char *valids
= CODING_CCL_VALIDS (coding
);
4300 int head_ascii
= coding
->head_ascii
;
4303 detect_info
->checked
|= CATEGORY_MASK_CCL
;
4305 coding
= &coding_categories
[coding_category_ccl
];
4306 attrs
= CODING_ID_ATTRS (coding
->id
);
4307 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4316 if ((valids
[c
] > 1))
4317 found
= CATEGORY_MASK_CCL
;
4319 detect_info
->rejected
|= CATEGORY_MASK_CCL
;
4323 detect_info
->found
|= found
;
4328 decode_coding_ccl (coding
)
4329 struct coding_system
*coding
;
4331 const unsigned char *src
= coding
->source
+ coding
->consumed
;
4332 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4333 int *charbuf
= coding
->charbuf
;
4334 int *charbuf_end
= charbuf
+ coding
->charbuf_size
;
4335 int consumed_chars
= 0;
4336 int multibytep
= coding
->src_multibyte
;
4337 struct ccl_program ccl
;
4338 int source_charbuf
[1024];
4339 int source_byteidx
[1024];
4340 Lisp_Object attrs
, eol_type
, charset_list
;
4342 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4343 setup_ccl_program (&ccl
, CODING_CCL_DECODER (coding
));
4345 while (src
< src_end
)
4347 const unsigned char *p
= src
;
4348 int *source
, *source_end
;
4352 while (i
< 1024 && p
< src_end
)
4354 source_byteidx
[i
] = p
- src
;
4355 source_charbuf
[i
++] = STRING_CHAR_ADVANCE (p
);
4358 while (i
< 1024 && p
< src_end
)
4359 source_charbuf
[i
++] = *p
++;
4361 if (p
== src_end
&& coding
->mode
& CODING_MODE_LAST_BLOCK
)
4364 source
= source_charbuf
;
4365 source_end
= source
+ i
;
4366 while (source
< source_end
)
4368 ccl_driver (&ccl
, source
, charbuf
,
4369 source_end
- source
, charbuf_end
- charbuf
,
4371 source
+= ccl
.consumed
;
4372 charbuf
+= ccl
.produced
;
4373 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_DST
)
4376 if (source
< source_end
)
4377 src
+= source_byteidx
[source
- source_charbuf
];
4380 consumed_chars
+= source
- source_charbuf
;
4382 if (ccl
.status
!= CCL_STAT_SUSPEND_BY_SRC
4383 && ccl
.status
!= CODING_RESULT_INSUFFICIENT_SRC
)
4389 case CCL_STAT_SUSPEND_BY_SRC
:
4390 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4392 case CCL_STAT_SUSPEND_BY_DST
:
4395 case CCL_STAT_INVALID_CMD
:
4396 coding
->result
= CODING_RESULT_INTERRUPT
;
4399 coding
->result
= CODING_RESULT_SUCCESS
;
4402 coding
->consumed_char
+= consumed_chars
;
4403 coding
->consumed
= src
- coding
->source
;
4404 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4408 encode_coding_ccl (coding
)
4409 struct coding_system
*coding
;
4411 struct ccl_program ccl
;
4412 int multibytep
= coding
->dst_multibyte
;
4413 int *charbuf
= coding
->charbuf
;
4414 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4415 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4416 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4417 unsigned char *adjusted_dst_end
= dst_end
- 1;
4418 int destination_charbuf
[1024];
4419 int i
, produced_chars
= 0;
4420 Lisp_Object attrs
, eol_type
, charset_list
;
4422 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4423 setup_ccl_program (&ccl
, CODING_CCL_ENCODER (coding
));
4425 ccl
.last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
4426 ccl
.dst_multibyte
= coding
->dst_multibyte
;
4428 while (charbuf
< charbuf_end
&& dst
< adjusted_dst_end
)
4430 int dst_bytes
= dst_end
- dst
;
4431 if (dst_bytes
> 1024)
4434 ccl_driver (&ccl
, charbuf
, destination_charbuf
,
4435 charbuf_end
- charbuf
, dst_bytes
, charset_list
);
4436 charbuf
+= ccl
.consumed
;
4438 for (i
= 0; i
< ccl
.produced
; i
++)
4439 EMIT_ONE_BYTE (destination_charbuf
[i
] & 0xFF);
4442 for (i
= 0; i
< ccl
.produced
; i
++)
4443 *dst
++ = destination_charbuf
[i
] & 0xFF;
4444 produced_chars
+= ccl
.produced
;
4450 case CCL_STAT_SUSPEND_BY_SRC
:
4451 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4453 case CCL_STAT_SUSPEND_BY_DST
:
4454 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
4457 case CCL_STAT_INVALID_CMD
:
4458 coding
->result
= CODING_RESULT_INTERRUPT
;
4461 coding
->result
= CODING_RESULT_SUCCESS
;
4465 coding
->produced_char
+= produced_chars
;
4466 coding
->produced
= dst
- coding
->destination
;
4472 /*** 10, 11. no-conversion handlers ***/
4474 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decoder entry for raw-text.  It converts nothing itself: it only
   records state in CODING.  It sets chars_at_source to 1 (presumably
   telling the caller to take the characters directly from the source
   -- TODO confirm against the consumer of this flag), resets both
   consumed counters to 0, and sets the result to
   CODING_RESULT_SUCCESS.  */
4477 decode_coding_raw_text (coding
)
4478 struct coding_system
*coding
;
4480 coding
->chars_at_source
= 1;
/* Nothing is reported as consumed by this function.  */
4481 coding
->consumed_char
= 0;
4482 coding
->consumed
= 0;
4483 coding
->result
= CODING_RESULT_SUCCESS
;
/* Encoder entry for raw-text.  Copies the characters in
   CODING->charbuf (CODING->charbuf_used entries) to the destination
   buffer, starting at CODING->destination + CODING->produced.  The
   copy strategy is chosen from the combination of
   CODING->dst_multibyte and CODING->src_multibyte.  On completion it
   sets CODING->result to CODING_RESULT_SUCCESS, adds the number of
   produced characters to CODING->produced_char, and stores the new
   byte offset in CODING->produced.  */
4487 encode_coding_raw_text (coding
)
4488 struct coding_system
*coding
;
4490 int multibytep
= coding
->dst_multibyte
;
4491 int *charbuf
= coding
->charbuf
;
4492 int *charbuf_end
= coding
->charbuf
+ coding
->charbuf_used
;
/* DST starts where the previous call stopped; DST_END is the end of
   the whole destination buffer.  */
4493 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4494 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4495 int produced_chars
= 0;
4500 int safe_room
= MAX_MULTIBYTE_LENGTH
* 2;
4502 if (coding
->src_multibyte
)
4503 while (charbuf
< charbuf_end
)
4505 ASSURE_DESTINATION (safe_room
);
4507 if (ASCII_CHAR_P (c
))
4508 EMIT_ONE_ASCII_BYTE (c
);
4509 else if (CHAR_BYTE8_P (c
))
4511 c
= CHAR_TO_BYTE8 (c
);
/* Any other character is first expanded into STR, then emitted
   byte by byte.  */
4516 unsigned char str
[MAX_MULTIBYTE_LENGTH
], *p0
= str
, *p1
= str
;
4518 CHAR_STRING_ADVANCE (c
, p1
);
4521 EMIT_ONE_BYTE (*p0
);
4527 while (charbuf
< charbuf_end
)
4529 ASSURE_DESTINATION (safe_room
);
4536 if (coding
->src_multibyte
)
4538 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4540 while (charbuf
< charbuf_end
)
4542 ASSURE_DESTINATION (safe_room
);
4544 if (ASCII_CHAR_P (c
))
4546 else if (CHAR_BYTE8_P (c
))
4547 *dst
++ = CHAR_TO_BYTE8 (c
);
4549 CHAR_STRING_ADVANCE (c
, dst
);
4555 ASSURE_DESTINATION (charbuf_end
- charbuf
);
4556 while (charbuf
< charbuf_end
&& dst
< dst_end
)
4557 *dst
++ = *charbuf
++;
/* Count what this call wrote: subtract the position DST started
   from (destination + produced).  Subtracting the end of the buffer
   (destination + dst_bytes) instead would always give a value <= 0,
   because the loop above never lets DST pass DST_END.  */
4558 produced_chars
= dst
- (coding
->destination
+ coding
->produced
);
4561 coding
->result
= CODING_RESULT_SUCCESS
;
4562 coding
->produced_char
+= produced_chars
;
4563 coding
->produced
= dst
- coding
->destination
;
4567 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4568 Check if a text is encoded in a charset-based coding system. If it
4569 is, return 1, else return 0. */
4572 detect_coding_charset (coding
, detect_info
)
4573 struct coding_system
*coding
;
4574 struct coding_detection_info
*detect_info
;
4576 unsigned char *src
= coding
->source
, *src_base
= src
;
4577 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4578 int multibytep
= coding
->src_multibyte
;
4579 int consumed_chars
= 0;
4580 Lisp_Object attrs
, valids
;
4583 detect_info
->checked
|= CATEGORY_MASK_CHARSET
;
4585 coding
= &coding_categories
[coding_category_charset
];
4586 attrs
= CODING_ID_ATTRS (coding
->id
);
4587 valids
= AREF (attrs
, coding_attr_charset_valids
);
4589 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
4590 src
+= coding
->head_ascii
;
4597 if (NILP (AREF (valids
, c
)))
4600 found
= CATEGORY_MASK_CHARSET
;
4602 detect_info
->rejected
|= CATEGORY_MASK_CHARSET
;
4606 detect_info
->found
|= found
;
4611 decode_coding_charset (coding
)
4612 struct coding_system
*coding
;
4614 unsigned char *src
= coding
->source
+ coding
->consumed
;
4615 unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
4616 unsigned char *src_base
;
4617 int *charbuf
= coding
->charbuf
;
4618 int *charbuf_end
= charbuf
+ coding
->charbuf_size
- MAX_ANNOTATION_LENGTH
;
4619 int consumed_chars
= 0, consumed_chars_base
;
4620 int multibytep
= coding
->src_multibyte
;
4621 Lisp_Object attrs
, eol_type
, charset_list
, valids
;
4622 int char_offset
= coding
->produced_char
;
4623 int last_offset
= char_offset
;
4624 int last_id
= charset_ascii
;
4626 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4627 valids
= AREF (attrs
, coding_attr_charset_valids
);
4634 consumed_chars_base
= consumed_chars
;
4636 if (charbuf
>= charbuf_end
)
4642 /* Here we assume that no charset maps '\r' to something else. */
4644 if (EQ (eol_type
, Qdos
))
4648 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
4649 goto no_more_source
;
4654 else if (EQ (eol_type
, Qmac
))
4660 struct charset
*charset
;
4665 val
= AREF (valids
, c
);
4670 charset
= CHARSET_FROM_ID (XFASTINT (val
));
4671 dim
= CHARSET_DIMENSION (charset
);
4675 code
= (code
<< 8) | c
;
4678 CODING_DECODE_CHAR (coding
, src
, src_base
, src_end
,
4683 /* VAL is a list of charset IDs. It is assured that the
4684 list is sorted by charset dimensions (smaller one comes first). */
4688 charset
= CHARSET_FROM_ID (XFASTINT (XCAR (val
)));
4689 dim
= CHARSET_DIMENSION (charset
);
4693 code
= (code
<< 8) | c
;
4696 CODING_DECODE_CHAR (coding
, src
, src_base
,
4697 src_end
, charset
, code
, c
);
4705 if (charset
->id
!= charset_ascii
4706 && last_id
!= charset
->id
)
4708 if (last_id
!= charset_ascii
)
4709 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4710 last_id
= charset
->id
;
4711 last_offset
= char_offset
;
4720 consumed_chars
= consumed_chars_base
;
4722 *charbuf
++ = ASCII_BYTE_P (c
) ? c
: BYTE8_TO_CHAR (c
);
4728 if (last_id
!= charset_ascii
)
4729 ADD_CHARSET_DATA (charbuf
, last_offset
, char_offset
, last_id
);
4730 coding
->consumed_char
+= consumed_chars_base
;
4731 coding
->consumed
= src_base
- coding
->source
;
4732 coding
->charbuf_used
= charbuf
- coding
->charbuf
;
4736 encode_coding_charset (coding
)
4737 struct coding_system
*coding
;
4739 int multibytep
= coding
->dst_multibyte
;
4740 int *charbuf
= coding
->charbuf
;
4741 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
4742 unsigned char *dst
= coding
->destination
+ coding
->produced
;
4743 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
4744 int safe_room
= MAX_MULTIBYTE_LENGTH
;
4745 int produced_chars
= 0;
4746 Lisp_Object attrs
, eol_type
, charset_list
;
4747 int ascii_compatible
;
4750 CODING_GET_INFO (coding
, attrs
, eol_type
, charset_list
);
4751 ascii_compatible
= ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
));
4753 while (charbuf
< charbuf_end
)
4755 struct charset
*charset
;
4758 ASSURE_DESTINATION (safe_room
);
4760 if (ascii_compatible
&& ASCII_CHAR_P (c
))
4761 EMIT_ONE_ASCII_BYTE (c
);
4762 else if (CHAR_BYTE8_P (c
))
4764 c
= CHAR_TO_BYTE8 (c
);
4769 charset
= char_charset (c
, charset_list
, &code
);
4772 if (CHARSET_DIMENSION (charset
) == 1)
4773 EMIT_ONE_BYTE (code
);
4774 else if (CHARSET_DIMENSION (charset
) == 2)
4775 EMIT_TWO_BYTES (code
>> 8, code
& 0xFF);
4776 else if (CHARSET_DIMENSION (charset
) == 3)
4777 EMIT_THREE_BYTES (code
>> 16, (code
>> 8) & 0xFF, code
& 0xFF);
4779 EMIT_FOUR_BYTES (code
>> 24, (code
>> 16) & 0xFF,
4780 (code
>> 8) & 0xFF, code
& 0xFF);
4784 if (coding
->mode
& CODING_MODE_SAFE_ENCODING
)
4785 c
= CODING_INHIBIT_CHARACTER_SUBSTITUTION
;
4787 c
= coding
->default_char
;
4793 coding
->result
= CODING_RESULT_SUCCESS
;
4794 coding
->produced_char
+= produced_chars
;
4795 coding
->produced
= dst
- coding
->destination
;
4800 /*** 10. C library functions ***/
4802 /* Setup coding context CODING from information about CODING_SYSTEM.
4803 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4804 CODING_SYSTEM is invalid, signal an error. */
4807 setup_coding_system (coding_system
, coding
)
4808 Lisp_Object coding_system
;
4809 struct coding_system
*coding
;
4812 Lisp_Object eol_type
;
4813 Lisp_Object coding_type
;
4816 if (NILP (coding_system
))
4817 coding_system
= Qno_conversion
;
4819 CHECK_CODING_SYSTEM_GET_ID (coding_system
, coding
->id
);
4821 attrs
= CODING_ID_ATTRS (coding
->id
);
4822 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
4825 coding
->head_ascii
= -1;
4826 coding
->common_flags
4827 = (VECTORP (eol_type
) ? CODING_REQUIRE_DETECTION_MASK
: 0);
4828 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
4829 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
4830 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
4831 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
4833 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4834 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4835 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4836 coding
->default_char
= XINT (CODING_ATTR_DEFAULT_CHAR (attrs
));
4838 coding_type
= CODING_ATTR_TYPE (attrs
);
4839 if (EQ (coding_type
, Qundecided
))
4841 coding
->detector
= NULL
;
4842 coding
->decoder
= decode_coding_raw_text
;
4843 coding
->encoder
= encode_coding_raw_text
;
4844 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4846 else if (EQ (coding_type
, Qiso_2022
))
4849 int flags
= XINT (AREF (attrs
, coding_attr_iso_flags
));
4851 /* Invoke graphic register 0 to plane 0. */
4852 CODING_ISO_INVOCATION (coding
, 0) = 0;
4853 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4854 CODING_ISO_INVOCATION (coding
, 1)
4855 = (flags
& CODING_ISO_FLAG_SEVEN_BITS
? -1 : 1);
4856 /* Setup the initial status of designation. */
4857 for (i
= 0; i
< 4; i
++)
4858 CODING_ISO_DESIGNATION (coding
, i
) = CODING_ISO_INITIAL (coding
, i
);
4859 /* Not single shifting initially. */
4860 CODING_ISO_SINGLE_SHIFTING (coding
) = 0;
4861 /* Beginning of buffer should also be regarded as bol. */
4862 CODING_ISO_BOL (coding
) = 1;
4863 coding
->detector
= detect_coding_iso_2022
;
4864 coding
->decoder
= decode_coding_iso_2022
;
4865 coding
->encoder
= encode_coding_iso_2022
;
4866 if (flags
& CODING_ISO_FLAG_SAFE
)
4867 coding
->mode
|= CODING_MODE_SAFE_ENCODING
;
4868 coding
->common_flags
4869 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4870 | CODING_REQUIRE_FLUSHING_MASK
);
4871 if (flags
& CODING_ISO_FLAG_COMPOSITION
)
4872 coding
->common_flags
|= CODING_ANNOTATE_COMPOSITION_MASK
;
4873 if (flags
& CODING_ISO_FLAG_DESIGNATION
)
4874 coding
->common_flags
|= CODING_ANNOTATE_CHARSET_MASK
;
4875 if (flags
& CODING_ISO_FLAG_FULL_SUPPORT
)
4877 setup_iso_safe_charsets (attrs
);
4878 val
= CODING_ATTR_SAFE_CHARSETS (attrs
);
4879 coding
->max_charset_id
= XSTRING (val
)->size
- 1;
4880 coding
->safe_charsets
= (char *) XSTRING (val
)->data
;
4882 CODING_ISO_FLAGS (coding
) = flags
;
4884 else if (EQ (coding_type
, Qcharset
))
4886 coding
->detector
= detect_coding_charset
;
4887 coding
->decoder
= decode_coding_charset
;
4888 coding
->encoder
= encode_coding_charset
;
4889 coding
->common_flags
4890 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4892 else if (EQ (coding_type
, Qutf_8
))
4894 coding
->detector
= detect_coding_utf_8
;
4895 coding
->decoder
= decode_coding_utf_8
;
4896 coding
->encoder
= encode_coding_utf_8
;
4897 coding
->common_flags
4898 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4900 else if (EQ (coding_type
, Qutf_16
))
4902 val
= AREF (attrs
, coding_attr_utf_16_bom
);
4903 CODING_UTF_16_BOM (coding
) = (CONSP (val
) ? utf_16_detect_bom
4904 : EQ (val
, Qt
) ? utf_16_with_bom
4905 : utf_16_without_bom
);
4906 val
= AREF (attrs
, coding_attr_utf_16_endian
);
4907 CODING_UTF_16_ENDIAN (coding
) = (EQ (val
, Qbig
) ? utf_16_big_endian
4908 : utf_16_little_endian
);
4909 CODING_UTF_16_SURROGATE (coding
) = 0;
4910 coding
->detector
= detect_coding_utf_16
;
4911 coding
->decoder
= decode_coding_utf_16
;
4912 coding
->encoder
= encode_coding_utf_16
;
4913 coding
->common_flags
4914 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4915 if (CODING_UTF_16_BOM (coding
) == utf_16_detect_bom
)
4916 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
4918 else if (EQ (coding_type
, Qccl
))
4920 coding
->detector
= detect_coding_ccl
;
4921 coding
->decoder
= decode_coding_ccl
;
4922 coding
->encoder
= encode_coding_ccl
;
4923 coding
->common_flags
4924 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
4925 | CODING_REQUIRE_FLUSHING_MASK
);
4927 else if (EQ (coding_type
, Qemacs_mule
))
4929 coding
->detector
= detect_coding_emacs_mule
;
4930 coding
->decoder
= decode_coding_emacs_mule
;
4931 coding
->encoder
= encode_coding_emacs_mule
;
4932 coding
->common_flags
4933 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4934 if (! NILP (AREF (attrs
, coding_attr_emacs_mule_full
))
4935 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs
), Vemacs_mule_charset_list
))
4937 Lisp_Object tail
, safe_charsets
;
4938 int max_charset_id
= 0;
4940 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4942 if (max_charset_id
< XFASTINT (XCAR (tail
)))
4943 max_charset_id
= XFASTINT (XCAR (tail
));
4944 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
4946 for (tail
= Vemacs_mule_charset_list
; CONSP (tail
);
4948 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
4949 coding
->max_charset_id
= max_charset_id
;
4950 coding
->safe_charsets
= (char *) XSTRING (safe_charsets
)->data
;
4953 else if (EQ (coding_type
, Qshift_jis
))
4955 coding
->detector
= detect_coding_sjis
;
4956 coding
->decoder
= decode_coding_sjis
;
4957 coding
->encoder
= encode_coding_sjis
;
4958 coding
->common_flags
4959 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4961 else if (EQ (coding_type
, Qbig5
))
4963 coding
->detector
= detect_coding_big5
;
4964 coding
->decoder
= decode_coding_big5
;
4965 coding
->encoder
= encode_coding_big5
;
4966 coding
->common_flags
4967 |= (CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
);
4969 else /* EQ (coding_type, Qraw_text) */
4971 coding
->detector
= NULL
;
4972 coding
->decoder
= decode_coding_raw_text
;
4973 coding
->encoder
= encode_coding_raw_text
;
4974 coding
->common_flags
|= CODING_FOR_UNIBYTE_MASK
;
4980 /* Return raw-text or one of its subsidiaries that has the same
4981 eol_type as CODING-SYSTEM. */
4984 raw_text_coding_system (coding_system
)
4985 Lisp_Object coding_system
;
4987 Lisp_Object spec
, attrs
;
4988 Lisp_Object eol_type
, raw_text_eol_type
;
4990 if (NILP (coding_system
))
4992 spec
= CODING_SYSTEM_SPEC (coding_system
);
4993 attrs
= AREF (spec
, 0);
4995 if (EQ (CODING_ATTR_TYPE (attrs
), Qraw_text
))
4996 return coding_system
;
4998 eol_type
= AREF (spec
, 2);
4999 if (VECTORP (eol_type
))
5001 spec
= CODING_SYSTEM_SPEC (Qraw_text
);
5002 raw_text_eol_type
= AREF (spec
, 2);
5003 return (EQ (eol_type
, Qunix
) ? AREF (raw_text_eol_type
, 0)
5004 : EQ (eol_type
, Qdos
) ? AREF (raw_text_eol_type
, 1)
5005 : AREF (raw_text_eol_type
, 2));
5009 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5010 does, return the subsidiary that has the same eol-spec as
5011 PARENT. Otherwise, return CODING_SYSTEM. */
5014 coding_inherit_eol_type (coding_system
, parent
)
5015 Lisp_Object coding_system
, parent
;
5017 Lisp_Object spec
, attrs
, eol_type
;
5019 if (NILP (coding_system
))
5020 coding_system
= Qraw_text
;
5021 spec
= CODING_SYSTEM_SPEC (coding_system
);
5022 attrs
= AREF (spec
, 0);
5023 eol_type
= AREF (spec
, 2);
5024 if (VECTORP (eol_type
)
5027 Lisp_Object parent_spec
;
5028 Lisp_Object parent_eol_type
;
5031 = CODING_SYSTEM_SPEC (buffer_defaults
.buffer_file_coding_system
);
5032 parent_eol_type
= AREF (parent_spec
, 2);
5033 if (EQ (parent_eol_type
, Qunix
))
5034 coding_system
= AREF (eol_type
, 0);
5035 else if (EQ (parent_eol_type
, Qdos
))
5036 coding_system
= AREF (eol_type
, 1);
5037 else if (EQ (parent_eol_type
, Qmac
))
5038 coding_system
= AREF (eol_type
, 2);
5040 return coding_system
;
5043 /* Emacs has a mechanism to automatically detect a coding system if it
5044 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5045 it's impossible to distinguish some coding systems accurately
5046 because they use the same range of codes. So, at first, coding
5047 systems are categorized as follows:
5049 o coding-category-emacs-mule
5051 The category for a coding system which has the same code range
5052 as Emacs' internal format. Assigned the coding-system (Lisp
5053 symbol) `emacs-mule' by default.
5055 o coding-category-sjis
5057 The category for a coding system which has the same code range
5058 as SJIS. Assigned the coding-system (Lisp
5059 symbol) `japanese-shift-jis' by default.
5061 o coding-category-iso-7
5063 The category for a coding system which has the same code range
5064 as ISO2022 of 7-bit environment. This doesn't use any locking
5065 shift and single shift functions. This can encode/decode all
5066 charsets. Assigned the coding-system (Lisp symbol)
5067 `iso-2022-7bit' by default.
5069 o coding-category-iso-7-tight
5071 Same as coding-category-iso-7 except that this can
5072 encode/decode only the specified charsets.
5074 o coding-category-iso-8-1
5076 The category for a coding system which has the same code range
5077 as ISO2022 of 8-bit environment and graphic plane 1 used only
5078 for DIMENSION1 charset. This doesn't use any locking shift
5079 and single shift functions. Assigned the coding-system (Lisp
5080 symbol) `iso-latin-1' by default.
5082 o coding-category-iso-8-2
5084 The category for a coding system which has the same code range
5085 as ISO2022 of 8-bit environment and graphic plane 1 used only
5086 for DIMENSION2 charset. This doesn't use any locking shift
5087 and single shift functions. Assigned the coding-system (Lisp
5088 symbol) `japanese-iso-8bit' by default.
5090 o coding-category-iso-7-else
5092 The category for a coding system which has the same code range
5093 as ISO2022 of 7-bit environment but uses locking shift or
5094 single shift functions. Assigned the coding-system (Lisp
5095 symbol) `iso-2022-7bit-lock' by default.
5097 o coding-category-iso-8-else
5099 The category for a coding system which has the same code range
5100 as ISO2022 of 8-bit environment but uses locking shift or
5101 single shift functions. Assigned the coding-system (Lisp
5102 symbol) `iso-2022-8bit-ss2' by default.
5104 o coding-category-big5
5106 The category for a coding system which has the same code range
5107 as BIG5. Assigned the coding-system (Lisp symbol)
5108 `cn-big5' by default.
5110 o coding-category-utf-8
5112 The category for a coding system which has the same code range
5113 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5114 symbol) `utf-8' by default.
5116 o coding-category-utf-16-be
5118 The category for a coding system in which a text has an
5119 Unicode signature (cf. Unicode Standard) in the order of BIG
5120 endian at the head. Assigned the coding-system (Lisp symbol)
5121 `utf-16-be' by default.
5123 o coding-category-utf-16-le
5125 The category for a coding system in which a text has an
5126 Unicode signature (cf. Unicode Standard) in the order of
5127 LITTLE endian at the head. Assigned the coding-system (Lisp
5128 symbol) `utf-16-le' by default.
5130 o coding-category-ccl
5132 The category for a coding system of which encoder/decoder is
5133 written in CCL programs. The default value is nil, i.e., no
5134 coding system is assigned.
5136 o coding-category-binary
5138 The category for a coding system not categorized in any of the
5139 above. Assigned the coding-system (Lisp symbol)
5140 `no-conversion' by default.
5142 Each of them is a Lisp symbol and the value is an actual
5143 `coding-system's (this is also a Lisp symbol) assigned by a user.
5144 What Emacs does actually is to detect a category of coding system.
5145 Then, it uses a `coding-system' assigned to it. If Emacs can't
5146 decide only one possible category, it selects a category of the
5147 highest priority. Priorities of categories are also specified by a
5148 user in a Lisp variable `coding-category-list'.
5152 #define EOL_SEEN_NONE 0
5153 #define EOL_SEEN_LF 1
5154 #define EOL_SEEN_CR 2
5155 #define EOL_SEEN_CRLF 4
5157 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5158 SOURCE is encoded. If CATEGORY is one of
5159 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5160 two-byte, else they are encoded by one-byte.
5162 Return one of EOL_SEEN_XXX. */
5164 #define MAX_EOL_CHECK_COUNT 3
5167 detect_eol (source
, src_bytes
, category
)
5168 unsigned char *source
;
5169 EMACS_INT src_bytes
;
5170 enum coding_category category
;
5172 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
5175 int eol_seen
= EOL_SEEN_NONE
;
5177 if ((1 << category
) & CATEGORY_MASK_UTF_16
)
5181 msb
= category
== (coding_category_utf_16_le
5182 | coding_category_utf_16_le_nosig
);
5185 while (src
+ 1 < src_end
)
5188 if (src
[msb
] == 0 && (c
== '\n' || c
== '\r'))
5193 this_eol
= EOL_SEEN_LF
;
5194 else if (src
+ 3 >= src_end
5195 || src
[msb
+ 2] != 0
5196 || src
[lsb
+ 2] != '\n')
5197 this_eol
= EOL_SEEN_CR
;
5199 this_eol
= EOL_SEEN_CRLF
;
5201 if (eol_seen
== EOL_SEEN_NONE
)
5202 /* This is the first end-of-line. */
5203 eol_seen
= this_eol
;
5204 else if (eol_seen
!= this_eol
)
5206 /* The found type is different from what found before. */
5207 eol_seen
= EOL_SEEN_LF
;
5210 if (++total
== MAX_EOL_CHECK_COUNT
)
5218 while (src
< src_end
)
5221 if (c
== '\n' || c
== '\r')
5226 this_eol
= EOL_SEEN_LF
;
5227 else if (src
>= src_end
|| *src
!= '\n')
5228 this_eol
= EOL_SEEN_CR
;
5230 this_eol
= EOL_SEEN_CRLF
, src
++;
5232 if (eol_seen
== EOL_SEEN_NONE
)
5233 /* This is the first end-of-line. */
5234 eol_seen
= this_eol
;
5235 else if (eol_seen
!= this_eol
)
5237 /* The found type is different from what found before. */
5238 eol_seen
= EOL_SEEN_LF
;
5241 if (++total
== MAX_EOL_CHECK_COUNT
)
5251 adjust_coding_eol_type (coding
, eol_seen
)
5252 struct coding_system
*coding
;
5255 Lisp_Object eol_type
;
5257 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5258 if (eol_seen
& EOL_SEEN_LF
)
5259 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 0));
5260 else if (eol_seen
& EOL_SEEN_CRLF
)
5261 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 1));
5262 else if (eol_seen
& EOL_SEEN_CR
)
5263 coding
->id
= CODING_SYSTEM_ID (AREF (eol_type
, 2));
5266 /* Detect how a text specified in CODING is encoded. If a coding
5267 system is detected, update fields of CODING by the detected coding
5271 detect_coding (coding
)
5272 struct coding_system
*coding
;
5274 unsigned char *src
, *src_end
;
5275 Lisp_Object attrs
, coding_type
;
5277 coding
->consumed
= coding
->consumed_char
= 0;
5278 coding
->produced
= coding
->produced_char
= 0;
5279 coding_set_source (coding
);
5281 src_end
= coding
->source
+ coding
->src_bytes
;
5283 /* If we have not yet decided the text encoding type, detect it
5285 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qundecided
))
5289 for (src
= coding
->source
; src
< src_end
; src
++)
5292 if (c
& 0x80 || (c
< 0x20 && (c
== ISO_CODE_ESC
5294 || c
== ISO_CODE_SO
)))
5297 coding
->head_ascii
= src
- (coding
->source
+ coding
->consumed
);
5299 if (coding
->head_ascii
< coding
->src_bytes
)
5301 struct coding_detection_info detect_info
;
5302 enum coding_category category
;
5303 struct coding_system
*this;
5305 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
5306 for (i
= 0; i
< coding_category_raw_text
; i
++)
5308 category
= coding_priorities
[i
];
5309 this = coding_categories
+ category
;
5312 /* No coding system of this category is defined. */
5313 detect_info
.rejected
|= (1 << category
);
5315 else if (category
>= coding_category_raw_text
)
5317 else if (detect_info
.checked
& (1 << category
))
5319 if (detect_info
.found
& (1 << category
))
5322 else if ((*(this->detector
)) (coding
, &detect_info
)
5323 && detect_info
.found
& (1 << category
))
5326 if (i
< coding_category_raw_text
)
5327 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5328 else if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
5329 setup_coding_system (Qraw_text
, coding
);
5330 else if (detect_info
.rejected
)
5331 for (i
= 0; i
< coding_category_raw_text
; i
++)
5332 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
5334 this = coding_categories
+ coding_priorities
[i
];
5335 setup_coding_system (CODING_ID_NAME (this->id
), coding
);
5340 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qutf_16
))
5342 Lisp_Object coding_systems
;
5343 struct coding_detection_info detect_info
;
5346 = AREF (CODING_ID_ATTRS (coding
->id
), coding_attr_utf_16_bom
);
5347 detect_info
.found
= detect_info
.rejected
= 0;
5348 if (CONSP (coding_systems
)
5349 && detect_coding_utf_16 (coding
, &detect_info
)
5350 && (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
5351 | CATEGORY_MASK_UTF_16_BE
)))
5353 if (detect_info
.found
& CATEGORY_MASK_UTF_16_LE
)
5354 setup_coding_system (XCAR (coding_systems
), coding
);
5356 setup_coding_system (XCDR (coding_systems
), coding
);
5360 attrs
= CODING_ID_ATTRS (coding
->id
);
5361 coding_type
= CODING_ATTR_TYPE (attrs
);
5363 /* If we have not yet decided the EOL type, detect it now. But, the
5364 detection is impossible for a CCL based coding system, in which
5365 case, we detect the EOL type after decoding. */
5366 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
))
5367 && ! EQ (coding_type
, Qccl
))
5369 int eol_seen
= detect_eol (coding
->source
, coding
->src_bytes
,
5370 XINT (CODING_ATTR_CATEGORY (attrs
)));
5372 if (eol_seen
!= EOL_SEEN_NONE
)
5373 adjust_coding_eol_type (coding
, eol_seen
);
5380 struct coding_system
*coding
;
5382 if (VECTORP (CODING_ID_EOL_TYPE (coding
->id
)))
5384 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5385 unsigned char *pend
= p
+ coding
->produced
;
5386 int eol_seen
= EOL_SEEN_NONE
;
5388 for (; p
< pend
; p
++)
5391 eol_seen
|= EOL_SEEN_LF
;
5392 else if (*p
== '\r')
5394 if (p
+ 1 < pend
&& *(p
+ 1) == '\n')
5396 eol_seen
|= EOL_SEEN_CRLF
;
5400 eol_seen
|= EOL_SEEN_CR
;
5403 if (eol_seen
!= EOL_SEEN_NONE
)
5404 adjust_coding_eol_type (coding
, eol_seen
);
5407 if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qmac
))
5409 unsigned char *p
= CHAR_POS_ADDR (coding
->dst_pos
);
5410 unsigned char *pend
= p
+ coding
->produced
;
5412 for (; p
< pend
; p
++)
5416 else if (EQ (CODING_ID_EOL_TYPE (coding
->id
), Qdos
))
5418 unsigned char *p
, *pbeg
, *pend
;
5419 Lisp_Object undo_list
;
5421 move_gap_both (coding
->dst_pos
+ coding
->produced_char
,
5422 coding
->dst_pos_byte
+ coding
->produced
);
5423 undo_list
= current_buffer
->undo_list
;
5424 current_buffer
->undo_list
= Qt
;
5425 del_range_2 (coding
->dst_pos
, coding
->dst_pos_byte
, GPT
, GPT_BYTE
, 0);
5426 current_buffer
->undo_list
= undo_list
;
5428 pend
= pbeg
+ coding
->produced
;
5430 for (p
= pend
- 1; p
>= pbeg
; p
--)
5433 safe_bcopy ((char *) (p
+ 1), (char *) p
, pend
- p
- 1);
5436 coding
->produced_char
-= coding
->produced
- (pend
- pbeg
);
5437 coding
->produced
= pend
- pbeg
;
5438 insert_from_gap (coding
->produced_char
, coding
->produced
);
5443 translate_chars (coding
, table
)
5444 struct coding_system
*coding
;
5447 int *charbuf
= coding
->charbuf
;
5448 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5451 if (coding
->chars_at_source
)
5454 while (charbuf
< charbuf_end
)
5460 *charbuf
++ = translate_char (table
, c
);
5465 produce_chars (coding
)
5466 struct coding_system
*coding
;
5468 unsigned char *dst
= coding
->destination
+ coding
->produced
;
5469 unsigned char *dst_end
= coding
->destination
+ coding
->dst_bytes
;
5471 int produced_chars
= 0;
5473 if (! coding
->chars_at_source
)
5475 /* Characters are in coding->charbuf. */
5476 int *buf
= coding
->charbuf
;
5477 int *buf_end
= buf
+ coding
->charbuf_used
;
5478 unsigned char *adjusted_dst_end
;
5480 if (BUFFERP (coding
->src_object
)
5481 && EQ (coding
->src_object
, coding
->dst_object
))
5482 dst_end
= coding
->source
+ coding
->consumed
;
5483 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5485 while (buf
< buf_end
)
5489 if (dst
>= adjusted_dst_end
)
5491 dst
= alloc_destination (coding
,
5492 buf_end
- buf
+ MAX_MULTIBYTE_LENGTH
,
5494 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5495 adjusted_dst_end
= dst_end
- MAX_MULTIBYTE_LENGTH
;
5499 if (coding
->dst_multibyte
5500 || ! CHAR_BYTE8_P (c
))
5501 CHAR_STRING_ADVANCE (c
, dst
);
5503 *dst
++ = CHAR_TO_BYTE8 (c
);
5507 /* This is an annotation datum. (-C) is the length of
5514 unsigned char *src
= coding
->source
;
5515 unsigned char *src_end
= src
+ coding
->src_bytes
;
5516 Lisp_Object eol_type
;
5518 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
5520 if (coding
->src_multibyte
!= coding
->dst_multibyte
)
5522 if (coding
->src_multibyte
)
5529 unsigned char *src_base
= src
;
5535 if (EQ (eol_type
, Qdos
))
5539 coding
->result
= CODING_RESULT_INSUFFICIENT_SRC
;
5540 goto no_more_source
;
5545 else if (EQ (eol_type
, Qmac
))
5550 coding
->consumed
= src
- coding
->source
;
5552 if (EQ (coding
->src_object
, coding
->dst_object
))
5556 dst
= alloc_destination (coding
, src_end
- src
+ 1,
5558 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5559 coding_set_source (coding
);
5560 src
= coding
->source
+ coding
->consumed
;
5561 src_end
= coding
->source
+ coding
->src_bytes
;
5571 while (src
< src_end
)
5578 if (EQ (eol_type
, Qdos
))
5584 else if (EQ (eol_type
, Qmac
))
5587 if (dst
>= dst_end
- 1)
5589 coding
->consumed
= src
- coding
->source
;
5591 if (EQ (coding
->src_object
, coding
->dst_object
))
5593 if (dst
>= dst_end
- 1)
5595 dst
= alloc_destination (coding
, src_end
- src
+ 2,
5597 dst_end
= coding
->destination
+ coding
->dst_bytes
;
5598 coding_set_source (coding
);
5599 src
= coding
->source
+ coding
->consumed
;
5600 src_end
= coding
->source
+ coding
->src_bytes
;
5608 if (!EQ (coding
->src_object
, coding
->dst_object
))
5610 int require
= coding
->src_bytes
- coding
->dst_bytes
;
5614 EMACS_INT offset
= src
- coding
->source
;
5616 dst
= alloc_destination (coding
, require
, dst
);
5617 coding_set_source (coding
);
5618 src
= coding
->source
+ offset
;
5619 src_end
= coding
->source
+ coding
->src_bytes
;
5622 produced_chars
= coding
->src_chars
;
5623 while (src
< src_end
)
5629 if (EQ (eol_type
, Qdos
))
5636 else if (EQ (eol_type
, Qmac
))
5642 coding
->consumed
= coding
->src_bytes
;
5643 coding
->consumed_char
= coding
->src_chars
;
5646 produced
= dst
- (coding
->destination
+ coding
->produced
);
5647 if (BUFFERP (coding
->dst_object
))
5648 insert_from_gap (produced_chars
, produced
);
5649 coding
->produced
+= produced
;
5650 coding
->produced_char
+= produced_chars
;
5651 return produced_chars
;
5654 /* Compose text in CODING->object according to the annotation data at
5655 CHARBUF. CHARBUF is an array:
5656 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5660 produce_composition (coding
, charbuf
)
5661 struct coding_system
*coding
;
5666 enum composition_method method
;
5667 Lisp_Object components
;
5670 from
= coding
->dst_pos
+ charbuf
[2];
5671 to
= coding
->dst_pos
+ charbuf
[3];
5672 method
= (enum composition_method
) (charbuf
[4]);
5674 if (method
== COMPOSITION_RELATIVE
)
5678 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
5683 for (i
= 0; i
< len
; i
++)
5684 args
[i
] = make_number (charbuf
[i
]);
5685 components
= (method
== COMPOSITION_WITH_ALTCHARS
5686 ? Fstring (len
, args
) : Fvector (len
, args
));
5688 compose_text (from
, to
, components
, Qnil
, coding
->dst_object
);
5692 /* Put `charset' property on text in CODING->object according to
5693 the annotation data at CHARBUF. CHARBUF is an array:
5694 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5698 produce_charset (coding
, charbuf
)
5699 struct coding_system
*coding
;
5702 EMACS_INT from
= coding
->dst_pos
+ charbuf
[2];
5703 EMACS_INT to
= coding
->dst_pos
+ charbuf
[3];
5704 struct charset
*charset
= CHARSET_FROM_ID (charbuf
[4]);
5706 Fput_text_property (make_number (from
), make_number (to
),
5707 Qcharset
, CHARSET_NAME (charset
),
5708 coding
->dst_object
);
5712 #define CHARBUF_SIZE 0x4000
5714 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5716 int size = CHARBUF_SIZE;; \
5718 coding->charbuf = NULL; \
5719 while (size > 1024) \
5721 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5722 if (coding->charbuf) \
5726 if (! coding->charbuf) \
5728 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5729 return coding->result; \
5731 coding->charbuf_size = size; \
5736 produce_annotation (coding
)
5737 struct coding_system
*coding
;
5739 int *charbuf
= coding
->charbuf
;
5740 int *charbuf_end
= charbuf
+ coding
->charbuf_used
;
5742 if (NILP (coding
->dst_object
))
5745 while (charbuf
< charbuf_end
)
5751 int len
= -*charbuf
;
5754 case CODING_ANNOTATE_COMPOSITION_MASK
:
5755 produce_composition (coding
, charbuf
);
5757 case CODING_ANNOTATE_CHARSET_MASK
:
5758 produce_charset (coding
, charbuf
);
5768 /* Decode the data at CODING->src_object into CODING->dst_object.
5769 CODING->src_object is a buffer, a string, or nil.
5770 CODING->dst_object is a buffer.
5772 If CODING->src_object is a buffer, it must be the current buffer.
5773 In this case, if CODING->src_pos is positive, it is a position of
5774 the source text in the buffer, otherwise, the source text is in the
5775 gap area of the buffer, and CODING->src_pos specifies the offset of
5776 the text from GPT (which must be the same as PT). If this is the
5777 same buffer as CODING->dst_object, CODING->src_pos must be
5780 If CODING->src_object is a string, CODING->src_pos is an index to
5783 If CODING->src_object is nil, CODING->source must already point to
5784 the non-relocatable memory area. In this case, CODING->src_pos is
5785 an offset from CODING->source.
5787 The decoded data is inserted at the current point of the buffer
5792 decode_coding (coding
)
5793 struct coding_system
*coding
;
5797 if (BUFFERP (coding
->src_object
)
5798 && coding
->src_pos
> 0
5799 && coding
->src_pos
< GPT
5800 && coding
->src_pos
+ coding
->src_chars
> GPT
)
5801 move_gap_both (coding
->src_pos
, coding
->src_pos_byte
);
5803 if (BUFFERP (coding
->dst_object
))
5805 if (current_buffer
!= XBUFFER (coding
->dst_object
))
5806 set_buffer_internal (XBUFFER (coding
->dst_object
));
5808 move_gap_both (PT
, PT_BYTE
);
5811 coding
->consumed
= coding
->consumed_char
= 0;
5812 coding
->produced
= coding
->produced_char
= 0;
5813 coding
->chars_at_source
= 0;
5814 coding
->result
= CODING_RESULT_SUCCESS
;
5817 ALLOC_CONVERSION_WORK_AREA (coding
);
5819 attrs
= CODING_ID_ATTRS (coding
->id
);
5823 coding_set_source (coding
);
5824 coding
->annotated
= 0;
5825 (*(coding
->decoder
)) (coding
);
5826 if (!NILP (CODING_ATTR_DECODE_TBL (attrs
)))
5827 translate_chars (coding
, CODING_ATTR_DECODE_TBL (attrs
));
5828 else if (!NILP (Vstandard_translation_table_for_decode
))
5829 translate_chars (coding
, Vstandard_translation_table_for_decode
);
5830 coding_set_destination (coding
);
5831 produce_chars (coding
);
5832 if (coding
->annotated
)
5833 produce_annotation (coding
);
5835 while (coding
->consumed
< coding
->src_bytes
5836 && ! coding
->result
);
5838 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding
->id
)), Qccl
)
5839 && SYMBOLP (CODING_ID_EOL_TYPE (coding
->id
))
5840 && ! EQ (CODING_ID_EOL_TYPE (coding
->id
), Qunix
))
5841 decode_eol (coding
);
5843 coding
->carryover_bytes
= 0;
5844 if (coding
->consumed
< coding
->src_bytes
)
5846 int nbytes
= coding
->src_bytes
- coding
->consumed
;
5849 coding_set_source (coding
);
5850 coding_set_destination (coding
);
5851 src
= coding
->source
+ coding
->consumed
;
5853 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
5855 /* Flush out unprocessed data as binary chars. We are sure
5856 that the number of data is less than the size of
5858 while (nbytes
-- > 0)
5862 coding
->charbuf
[coding
->charbuf_used
++] = (c
& 0x80 ? - c
: c
);
5864 produce_chars (coding
);
5868 /* Record unprocessed bytes in coding->carryover. We are
5869 sure that the number of data is less than the size of
5870 coding->carryover. */
5871 unsigned char *p
= coding
->carryover
;
5873 coding
->carryover_bytes
= nbytes
;
5874 while (nbytes
-- > 0)
5877 coding
->consumed
= coding
->src_bytes
;
5880 return coding
->result
;
5884 /* Extract an annotation datum from a composition starting at POS and
5885 ending before LIMIT of CODING->src_object (buffer or string), store
5886 the data in BUF, set *STOP to a starting position of the next
5887 composition (if any) or to LIMIT, and return the address of the
5888 next element of BUF.
5890 If such an annotation is not found, set *STOP to a starting
5891 position of a composition after POS (if any) or to LIMIT, and
5895 handle_composition_annotation (pos
, limit
, coding
, buf
, stop
)
5896 EMACS_INT pos
, limit
;
5897 struct coding_system
*coding
;
5901 EMACS_INT start
, end
;
5904 if (! find_composition (pos
, limit
, &start
, &end
, &prop
, coding
->src_object
)
5907 else if (start
> pos
)
5913 /* We found a composition. Store the corresponding
5914 annotation data in BUF. */
5916 enum composition_method method
= COMPOSITION_METHOD (prop
);
5917 int nchars
= COMPOSITION_LENGTH (prop
);
5919 ADD_COMPOSITION_DATA (buf
, 0, nchars
, method
);
5920 if (method
!= COMPOSITION_RELATIVE
)
5922 Lisp_Object components
;
5925 components
= COMPOSITION_COMPONENTS (prop
);
5926 if (VECTORP (components
))
5928 len
= XVECTOR (components
)->size
;
5929 for (i
= 0; i
< len
; i
++)
5930 *buf
++ = XINT (AREF (components
, i
));
5932 else if (STRINGP (components
))
5934 len
= XSTRING (components
)->size
;
5938 FETCH_STRING_CHAR_ADVANCE (*buf
, components
, i
, i_byte
);
5942 else if (INTEGERP (components
))
5945 *buf
++ = XINT (components
);
5947 else if (CONSP (components
))
5949 for (len
= 0; CONSP (components
);
5950 len
++, components
= XCDR (components
))
5951 *buf
++ = XINT (XCAR (components
));
5959 if (find_composition (end
, limit
, &start
, &end
, &prop
,
5970 /* Extract an annotation datum from a text property `charset' at POS of
5971 CODING->src_object (buffer or string), store the data in BUF, set
5972 *STOP to the position where the value of `charset' property changes
5973 (limiting by LIMIT), and return the address of the next element of
5976 If the property value is nil, set *STOP to the position where the
5977 property value is non-nil (limiting by LIMIT), and return BUF. */
5980 handle_charset_annotation (pos
, limit
, coding
, buf
, stop
)
5981 EMACS_INT pos
, limit
;
5982 struct coding_system
*coding
;
5986 Lisp_Object val
, next
;
5989 val
= Fget_text_property (make_number (pos
), Qcharset
, coding
->src_object
);
5990 if (! NILP (val
) && CHARSETP (val
))
5991 id
= XINT (CHARSET_SYMBOL_ID (val
));
5994 ADD_CHARSET_DATA (buf
, 0, 0, id
);
5995 next
= Fnext_single_property_change (make_number (pos
), Qcharset
,
5997 make_number (limit
));
5998 *stop
= XINT (next
);
6004 consume_chars (coding
)
6005 struct coding_system
*coding
;
6007 int *buf
= coding
->charbuf
;
6008 int *buf_end
= coding
->charbuf
+ coding
->charbuf_size
;
6009 const unsigned char *src
= coding
->source
+ coding
->consumed
;
6010 const unsigned char *src_end
= coding
->source
+ coding
->src_bytes
;
6011 EMACS_INT pos
= coding
->src_pos
+ coding
->consumed_char
;
6012 EMACS_INT end_pos
= coding
->src_pos
+ coding
->src_chars
;
6013 int multibytep
= coding
->src_multibyte
;
6014 Lisp_Object eol_type
;
6016 EMACS_INT stop
, stop_composition
, stop_charset
;
6018 eol_type
= CODING_ID_EOL_TYPE (coding
->id
);
6019 if (VECTORP (eol_type
))
6022 /* Note: composition handling is not yet implemented. */
6023 coding
->common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
6025 if (coding
->common_flags
& CODING_ANNOTATE_COMPOSITION_MASK
)
6026 stop
= stop_composition
= pos
;
6028 stop
= stop_composition
= end_pos
;
6029 if (coding
->common_flags
& CODING_ANNOTATE_CHARSET_MASK
)
6030 stop
= stop_charset
= pos
;
6032 stop_charset
= end_pos
;
6034 /* Compensate for CRLF and annotation. */
6035 buf_end
-= 1 + MAX_ANNOTATION_LENGTH
;
6036 while (buf
< buf_end
)
6042 if (pos
== stop_composition
)
6043 buf
= handle_composition_annotation (pos
, end_pos
, coding
,
6044 buf
, &stop_composition
);
6045 if (pos
== stop_charset
)
6046 buf
= handle_charset_annotation (pos
, end_pos
, coding
,
6047 buf
, &stop_charset
);
6048 stop
= (stop_composition
< stop_charset
6049 ? stop_composition
: stop_charset
);
6056 if (! CODING_FOR_UNIBYTE (coding
)
6057 && (bytes
= MULTIBYTE_LENGTH (src
, src_end
)) > 0)
6058 c
= STRING_CHAR_ADVANCE (src
), pos
+= bytes
;
6063 c
= STRING_CHAR_ADVANCE (src
), pos
++;
6064 if ((c
== '\r') && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
6066 if (! EQ (eol_type
, Qunix
))
6070 if (EQ (eol_type
, Qdos
))
6079 coding
->consumed
= src
- coding
->source
;
6080 coding
->consumed_char
= pos
- coding
->src_pos
;
6081 coding
->charbuf_used
= buf
- coding
->charbuf
;
6082 coding
->chars_at_source
= 0;
6086 /* Encode the text at CODING->src_object into CODING->dst_object.
6087 CODING->src_object is a buffer or a string.
6088 CODING->dst_object is a buffer or nil.
6090 If CODING->src_object is a buffer, it must be the current buffer.
6091 In this case, if CODING->src_pos is positive, it is a position of
6092 the source text in the buffer, otherwise, the source text is in the
6093 gap area of the buffer, and coding->src_pos specifies the offset of
6094 the text from GPT (which must be the same as PT). If this is the
6095 same buffer as CODING->dst_object, CODING->src_pos must be
6096 negative and CODING should not have `pre-write-conversion'.
6098 If CODING->src_object is a string, CODING should not have
6099 `pre-write-conversion'.
6101 If CODING->dst_object is a buffer, the encoded data is inserted at
6102 the current point of that buffer.
6104 If CODING->dst_object is nil, the encoded data is placed at the
6105 memory area specified by CODING->destination. */
6108 encode_coding (coding
)
6109 struct coding_system
*coding
;
6113 attrs
= CODING_ID_ATTRS (coding
->id
);
6115 if (BUFFERP (coding
->dst_object
))
6117 set_buffer_internal (XBUFFER (coding
->dst_object
));
6118 coding
->dst_multibyte
6119 = ! NILP (current_buffer
->enable_multibyte_characters
);
6122 coding
->consumed
= coding
->consumed_char
= 0;
6123 coding
->produced
= coding
->produced_char
= 0;
6124 coding
->result
= CODING_RESULT_SUCCESS
;
6127 ALLOC_CONVERSION_WORK_AREA (coding
);
6130 coding_set_source (coding
);
6131 consume_chars (coding
);
6133 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs
)))
6134 translate_chars (coding
, CODING_ATTR_ENCODE_TBL (attrs
));
6135 else if (!NILP (Vstandard_translation_table_for_encode
))
6136 translate_chars (coding
, Vstandard_translation_table_for_encode
);
6138 coding_set_destination (coding
);
6139 (*(coding
->encoder
)) (coding
);
6140 } while (coding
->consumed_char
< coding
->src_chars
);
6142 if (BUFFERP (coding
->dst_object
))
6143 insert_from_gap (coding
->produced_char
, coding
->produced
);
6145 return (coding
->result
);
6149 /* Stack of working buffers used in code conversion. A nil element
6150 means that the code conversion of that level is not using a working
6152 Lisp_Object Vcode_conversion_work_buf_list
;
6154 /* A working buffer used by the top level conversion. */
6155 Lisp_Object Vcode_conversion_reused_work_buf
;
6158 /* Return a working buffer that can be freely used by the following
6159 code conversion. MULTIBYTEP specifies the multibyteness of the
6163 make_conversion_work_buffer (multibytep
, depth
)
6164 int multibytep
, depth
;
6166 struct buffer
*current
= current_buffer
;
6167 Lisp_Object buf
, name
;
6171 if (NILP (Vcode_conversion_reused_work_buf
))
6172 Vcode_conversion_reused_work_buf
6173 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6174 buf
= Vcode_conversion_reused_work_buf
;
6180 name
= build_string (" *code-converting-work*");
6181 name
= Fgenerate_new_buffer_name (name
, Qnil
);
6187 sprintf (str
, " *code-converting-work*<%d>", depth
);
6188 name
= build_string (str
);
6190 buf
= Fget_buffer_create (name
);
6192 set_buffer_internal (XBUFFER (buf
));
6193 current_buffer
->undo_list
= Qt
;
6195 Fset_buffer_multibyte (multibytep
? Qt
: Qnil
, Qnil
);
6196 set_buffer_internal (current
);
6201 code_conversion_restore (buffer
)
6204 Lisp_Object workbuf
;
6206 workbuf
= XCAR (Vcode_conversion_work_buf_list
);
6207 if (! NILP (workbuf
)
6208 && ! EQ (workbuf
, Vcode_conversion_reused_work_buf
)
6209 && ! NILP (Fbuffer_live_p (workbuf
)))
6210 Fkill_buffer (workbuf
);
6211 Vcode_conversion_work_buf_list
= XCDR (Vcode_conversion_work_buf_list
);
6212 set_buffer_internal (XBUFFER (buffer
));
6217 code_conversion_save (buffer
, with_work_buf
, multibyte
)
6219 int with_work_buf
, multibyte
;
6221 Lisp_Object workbuf
;
6225 int depth
= XINT (Flength (Vcode_conversion_work_buf_list
));
6227 workbuf
= make_conversion_work_buffer (multibyte
, depth
);
6231 Vcode_conversion_work_buf_list
6232 = Fcons (workbuf
, Vcode_conversion_work_buf_list
);
6233 record_unwind_protect (code_conversion_restore
, buffer
);
/* Decode CHARS characters (BYTES bytes) of source text into the
   current buffer at point, using CODING.

   The source and the destination are both the current buffer.  The
   source position is given as the negative offsets -CHARS / -BYTES
   (NOTE(review): presumably this denotes text lying in the gap, as
   the function name suggests -- confirm against decode_coding).
   If the coding system has a post-read function, it is called with
   the number of produced characters, and the produced counts are
   adjusted by the change it caused in Z / Z_BYTE.

   Returns CODING->result.  */
6238 decode_coding_gap (coding
, chars
, bytes
)
6239 struct coding_system
*coding
;
6240 EMACS_INT chars
, bytes
;
/* Remember the specpdl depth so that the unwind handler registered by
   code_conversion_save runs at the unbind_to below.  */
6242 int count
= specpdl_ptr
- specpdl
;
6246 buffer
= Fcurrent_buffer ();
6247 code_conversion_save (buffer
, 0, 0);
/* Describe the source text.  */
6249 coding
->src_object
= buffer
;
6250 coding
->src_chars
= chars
;
6251 coding
->src_bytes
= bytes
;
6252 coding
->src_pos
= -chars
;
6253 coding
->src_pos_byte
= -bytes
;
/* More bytes than characters means the source is treated as multibyte.  */
6254 coding
->src_multibyte
= chars
< bytes
;
/* Describe the destination: the same buffer, at point.  */
6255 coding
->dst_object
= buffer
;
6256 coding
->dst_pos
= PT
;
6257 coding
->dst_pos_byte
= PT_BYTE
;
6258 coding
->dst_multibyte
= ! NILP (current_buffer
->enable_multibyte_characters
);
6259 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
/* Run detection first when the coding system asks for it.  */
6261 if (CODING_REQUIRE_DETECTION (coding
))
6262 detect_coding (coding
);
6264 decode_coding (coding
);
/* Fetch the attributes only now: detect_coding may have changed
   CODING->id (NOTE(review): assumption, confirm).  */
6266 attrs
= CODING_ID_ATTRS (coding
->id
);
6267 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6269 struct gcpro gcpro1
;
/* Snapshot the buffer end so the size change made by the post-read
   function can be measured afterwards.  */
6270 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6273 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6275 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6276 make_number (coding
->produced_char
));
6279 coding
->produced_char
+= Z
- prev_Z
;
6280 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
6283 unbind_to (count
, Qnil
);
6284 return coding
->result
;
6288 encode_coding_gap (coding
, chars
, bytes
)
6289 struct coding_system
*coding
;
6290 EMACS_INT chars
, bytes
;
6292 int count
= specpdl_ptr
- specpdl
;
6295 buffer
= Fcurrent_buffer ();
6296 code_conversion_save (buffer
, 0, 0);
6298 coding
->src_object
= buffer
;
6299 coding
->src_chars
= chars
;
6300 coding
->src_bytes
= bytes
;
6301 coding
->src_pos
= -chars
;
6302 coding
->src_pos_byte
= -bytes
;
6303 coding
->src_multibyte
= chars
< bytes
;
6304 coding
->dst_object
= coding
->src_object
;
6305 coding
->dst_pos
= PT
;
6306 coding
->dst_pos_byte
= PT_BYTE
;
6308 encode_coding (coding
);
6310 unbind_to (count
, Qnil
);
6311 return coding
->result
;
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted from the buffer.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.  */
/* Decode the text FROM..TO of SRC_OBJECT with CODING and deliver it
   as requested by DST_OBJECT.  The function fills in the source and
   destination fields of CODING, runs detect_coding when required,
   calls decode_coding, runs the post-read function if the coding
   system has one, and finally restores point when the decoded text
   replaced the source text in the same buffer.  */
6345 decode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6347 struct coding_system
*coding
;
6348 Lisp_Object src_object
;
6349 EMACS_INT from
, from_byte
, to
, to_byte
;
6350 Lisp_Object dst_object
;
6352 int count
= specpdl_ptr
- specpdl
;
6353 unsigned char *destination
;
6354 EMACS_INT dst_bytes
;
6355 EMACS_INT chars
= to
- from
;
6356 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays negative unless source and destination are the same
   buffer; see the point-restoring code at the end.
   NOTE(review): positions are EMACS_INT elsewhere in this function
   but saved_pt / saved_pt_byte are plain int -- possible truncation
   where EMACS_INT is wider than int.  */
6359 int saved_pt
= -1, saved_pt_byte
;
6361 buffer
= Fcurrent_buffer ();
/* With a nil DST_OBJECT the caller supplied a memory area; remember
   it, since it is filled (and possibly reallocated) at the end.  */
6363 if (NILP (dst_object
))
6365 destination
= coding
->destination
;
6366 dst_bytes
= coding
->dst_bytes
;
/* Describe the source.  */
6369 coding
->src_object
= src_object
;
6370 coding
->src_chars
= chars
;
6371 coding
->src_bytes
= bytes
;
6372 coding
->src_multibyte
= chars
< bytes
;
6374 if (STRINGP (src_object
))
6376 coding
->src_pos
= from
;
6377 coding
->src_pos_byte
= from_byte
;
6379 else if (BUFFERP (src_object
))
6381 set_buffer_internal (XBUFFER (src_object
));
6383 move_gap_both (from
, from_byte
);
/* Decoding in place: remember point, delete the source range and
   refer to the source by the negative offsets -CHARS / -BYTES.  */
6384 if (EQ (src_object
, dst_object
))
6386 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6387 TEMP_SET_PT_BOTH (from
, from_byte
);
6388 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6389 coding
->src_pos
= -chars
;
6390 coding
->src_pos_byte
= -bytes
;
6394 coding
->src_pos
= from
;
6395 coding
->src_pos_byte
= from_byte
;
6399 if (CODING_REQUIRE_DETECTION (coding
))
6400 detect_coding (coding
);
6401 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Describe the destination.  A multibyte work buffer receives the
   output when a string is wanted (Qt), or when a post-read function
   has to run although no destination buffer was given.  */
6403 if (EQ (dst_object
, Qt
)
6404 || (! NILP (CODING_ATTR_POST_READ (attrs
))
6405 && NILP (dst_object
)))
6407 coding
->dst_object
= code_conversion_save (buffer
, 1, 1);
6408 coding
->dst_pos
= BEG
;
6409 coding
->dst_pos_byte
= BEG_BYTE
;
6410 coding
->dst_multibyte
= 1;
/* Output goes to point of the given destination buffer.  */
6412 else if (BUFFERP (dst_object
))
6414 code_conversion_save (buffer
, 0, 0);
6415 coding
->dst_object
= dst_object
;
6416 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6417 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6418 coding
->dst_multibyte
6419 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
6423 code_conversion_save (buffer
, 0, 0);
6424 coding
->dst_object
= Qnil
;
6425 coding
->dst_multibyte
= 1;
6428 decode_coding (coding
);
6430 if (BUFFERP (coding
->dst_object
))
6431 set_buffer_internal (XBUFFER (coding
->dst_object
));
/* Run the post-read function on the decoded text and account for any
   size change it made to the buffer.  */
6433 if (! NILP (CODING_ATTR_POST_READ (attrs
)))
6435 struct gcpro gcpro1
, gcpro2
;
6436 EMACS_INT prev_Z
= Z
, prev_Z_BYTE
= Z_BYTE
;
6439 TEMP_SET_PT_BOTH (coding
->dst_pos
, coding
->dst_pos_byte
);
6440 GCPRO2 (coding
->src_object
, coding
->dst_object
);
6441 val
= call1 (CODING_ATTR_POST_READ (attrs
),
6442 make_number (coding
->produced_char
));
6445 coding
->produced_char
+= Z
- prev_Z
;
6446 coding
->produced
+= Z_BYTE
- prev_Z_BYTE
;
/* Deliver the result: as a string made from the work buffer ...  */
6449 if (EQ (dst_object
, Qt
))
6451 coding
->dst_object
= Fbuffer_string ();
/* ... or copied from the work buffer into the caller's memory area,
   which is enlarged with xrealloc when it is too small.  */
6453 else if (NILP (dst_object
) && BUFFERP (coding
->dst_object
))
6455 set_buffer_internal (XBUFFER (coding
->dst_object
));
6456 if (dst_bytes
< coding
->produced
)
6459 = (unsigned char *) xrealloc (destination
, coding
->produced
);
6462 coding
->result
= CODING_RESULT_INSUFFICIENT_DST
;
6463 unbind_to (count
, Qnil
);
/* Move the gap out of the produced text before copying it.  */
6466 if (BEGV
< GPT
&& GPT
< BEGV
+ coding
->produced_char
)
6467 move_gap_both (BEGV
, BEGV_BYTE
);
6468 bcopy (BEGV_ADDR
, destination
, coding
->produced
);
6469 coding
->destination
= destination
;
6475 /* This is the case of:
6476 (BUFFERP (src_object) && EQ (src_object, dst_object))
6477 As we have moved PT while replacing the original buffer
6478 contents, we must recover it now. */
6479 set_buffer_internal (XBUFFER (src_object
));
/* Point was before the replaced text: unchanged.  */
6480 if (saved_pt
< from
)
6481 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
/* Point was inside the replaced text: put it at its start.  */
6482 else if (saved_pt
< from
+ chars
)
6483 TEMP_SET_PT_BOTH (from
, from_byte
);
/* Point was after the replaced text: shift it by the size change.
   In a unibyte buffer the byte difference is used for both the
   character and the byte position.  */
6484 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6485 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6486 saved_pt_byte
+ (coding
->produced
- bytes
));
6488 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6489 saved_pt_byte
+ (coding
->produced
- bytes
));
6492 unbind_to (count
, Qnil
);
/* Encode the text FROM..TO of SRC_OBJECT with CODING and deliver the
   result as requested by DST_OBJECT.

   SRC_OBJECT is tested with STRINGP and BUFFERP; otherwise the text
   is taken from CODING->source.  DST_OBJECT is tested with BUFFERP
   and against Qt; with Qt a unibyte string is made and stored in
   CODING->dst_object.  If the coding system has a pre-write function,
   the source text is first copied into a work buffer and the function
   is called on it.  When source and destination are the same buffer,
   the source range is deleted and point is restored at the end.  */
6497 encode_coding_object (coding
, src_object
, from
, from_byte
, to
, to_byte
,
6499 struct coding_system
*coding
;
6500 Lisp_Object src_object
;
6501 EMACS_INT from
, from_byte
, to
, to_byte
;
6502 Lisp_Object dst_object
;
6504 int count
= specpdl_ptr
- specpdl
;
6505 EMACS_INT chars
= to
- from
;
6506 EMACS_INT bytes
= to_byte
- from_byte
;
/* saved_pt stays negative unless source and destination are the same
   buffer; see the point-restoring code at the end.
   NOTE(review): positions are EMACS_INT elsewhere in this function
   but saved_pt / saved_pt_byte are plain int -- possible truncation
   where EMACS_INT is wider than int.  */
6509 int saved_pt
= -1, saved_pt_byte
;
6511 buffer
= Fcurrent_buffer ();
6513 coding
->src_object
= src_object
;
6514 coding
->src_chars
= chars
;
6515 coding
->src_bytes
= bytes
;
6516 coding
->src_multibyte
= chars
< bytes
;
6518 attrs
= CODING_ID_ATTRS (coding
->id
);
/* Pre-write case: copy the source text into a work buffer, let the
   pre-write function process it, and encode from whatever buffer is
   current afterwards.  */
6520 if (! NILP (CODING_ATTR_PRE_WRITE (attrs
)))
6522 coding
->src_object
= code_conversion_save (buffer
, 1,
6523 coding
->src_multibyte
);
6524 set_buffer_internal (XBUFFER (coding
->src_object
));
6525 if (STRINGP (src_object
))
6526 insert_from_string (src_object
, from
, from_byte
, chars
, bytes
, 0);
6527 else if (BUFFERP (src_object
))
6528 insert_from_buffer (XBUFFER (src_object
), from
, chars
, 0);
6530 insert_1_both (coding
->source
+ from
, chars
, bytes
, 0, 0, 0);
/* Encoding in place: the source range is deleted from the original
   buffer now that it has been copied to the work buffer.  */
6532 if (EQ (src_object
, dst_object
))
6534 set_buffer_internal (XBUFFER (src_object
));
6535 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6536 del_range_both (from
, from_byte
, to
, to_byte
, 1);
6537 set_buffer_internal (XBUFFER (coding
->src_object
));
6540 call2 (CODING_ATTR_PRE_WRITE (attrs
),
6541 make_number (BEG
), make_number (Z
));
/* The source is now the whole of the current buffer.  */
6542 coding
->src_object
= Fcurrent_buffer ();
6544 move_gap_both (BEG
, BEG_BYTE
);
6545 coding
->src_chars
= Z
- BEG
;
6546 coding
->src_bytes
= Z_BYTE
- BEG_BYTE
;
6547 coding
->src_pos
= BEG
;
6548 coding
->src_pos_byte
= BEG_BYTE
;
6549 coding
->src_multibyte
= Z
< Z_BYTE
;
6551 else if (STRINGP (src_object
))
6553 code_conversion_save (buffer
, 0, 0);
6554 coding
->src_pos
= from
;
6555 coding
->src_pos_byte
= from_byte
;
6557 else if (BUFFERP (src_object
))
6559 code_conversion_save (buffer
, 0, 0);
6560 set_buffer_internal (XBUFFER (src_object
));
/* Encoding in place without a pre-write function: the value
   returned by del_range_1 for the deleted range becomes the source,
   read from position 0.  */
6561 if (EQ (src_object
, dst_object
))
6563 saved_pt
= PT
, saved_pt_byte
= PT_BYTE
;
6564 coding
->src_object
= del_range_1 (from
, to
, 1, 1);
6565 coding
->src_pos
= 0;
6566 coding
->src_pos_byte
= 0;
/* Move the gap when it lies inside the source range.  */
6570 if (from
< GPT
&& to
>= GPT
)
6571 move_gap_both (from
, from_byte
);
6572 coding
->src_pos
= from
;
6573 coding
->src_pos_byte
= from_byte
;
6577 code_conversion_save (buffer
, 0, 0);
/* Describe the destination.  */
6579 if (BUFFERP (dst_object
))
6581 coding
->dst_object
= dst_object
;
6582 if (EQ (src_object
, dst_object
))
6584 coding
->dst_pos
= from
;
6585 coding
->dst_pos_byte
= from_byte
;
6589 coding
->dst_pos
= BUF_PT (XBUFFER (dst_object
));
6590 coding
->dst_pos_byte
= BUF_PT_BYTE (XBUFFER (dst_object
));
6592 coding
->dst_multibyte
6593 = ! NILP (XBUFFER (dst_object
)->enable_multibyte_characters
);
/* A string is wanted: encode into a freshly allocated memory area,
   initially sized by the number of source characters (at least 1).  */
6595 else if (EQ (dst_object
, Qt
))
6597 coding
->dst_object
= Qnil
;
6598 coding
->dst_bytes
= coding
->src_chars
;
6599 if (coding
->dst_bytes
== 0)
6600 coding
->dst_bytes
= 1;
6601 coding
->destination
= (unsigned char *) xmalloc (coding
->dst_bytes
);
6602 coding
->dst_multibyte
= 0;
6606 coding
->dst_object
= Qnil
;
6607 coding
->dst_multibyte
= 0;
6610 encode_coding (coding
);
/* Build the result string and release the temporary memory area.  */
6612 if (EQ (dst_object
, Qt
))
6614 if (BUFFERP (coding
->dst_object
))
6615 coding
->dst_object
= Fbuffer_string ();
6619 = make_unibyte_string ((char *) coding
->destination
,
6621 xfree (coding
->destination
);
6627 /* This is the case of:
6628 (BUFFERP (src_object) && EQ (src_object, dst_object))
6629 As we have moved PT while replacing the original buffer
6630 contents, we must recover it now. */
6631 set_buffer_internal (XBUFFER (src_object
));
/* Point was before the replaced text: unchanged.  */
6632 if (saved_pt
< from
)
6633 TEMP_SET_PT_BOTH (saved_pt
, saved_pt_byte
);
/* Point was inside the replaced text: put it at its start.  */
6634 else if (saved_pt
< from
+ chars
)
6635 TEMP_SET_PT_BOTH (from
, from_byte
);
/* Point was after the replaced text: shift it by the size change.
   In a unibyte buffer the byte difference is used for both the
   character and the byte position.  */
6636 else if (! NILP (current_buffer
->enable_multibyte_characters
))
6637 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced_char
- chars
),
6638 saved_pt_byte
+ (coding
->produced
- bytes
));
6640 TEMP_SET_PT_BOTH (saved_pt
+ (coding
->produced
- bytes
),
6641 saved_pt_byte
+ (coding
->produced
- bytes
));
6644 unbind_to (count
, Qnil
);
6649 preferred_coding_system ()
6651 int id
= coding_categories
[coding_priorities
[0]].id
;
6653 return CODING_ID_NAME (id
);
/*** 11. Emacs Lisp library functions ***/
6660 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
6661 doc
: /* Return t if OBJECT is nil or a coding-system.
6662 See the documentation of `define-coding-system' for information
6663 about coding-system objects. */)
6667 return ((NILP (obj
) || CODING_SYSTEM_P (obj
)) ? Qt
: Qnil
);
6670 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
6671 Sread_non_nil_coding_system
, 1, 1, 0,
6672 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6679 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6680 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
6682 while (XSTRING (val
)->size
== 0);
6683 return (Fintern (val
, Qnil
));
6686 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
6687 doc
: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6688 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6689 (prompt
, default_coding_system
)
6690 Lisp_Object prompt
, default_coding_system
;
6693 if (SYMBOLP (default_coding_system
))
6694 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
6695 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
6696 Qt
, Qnil
, Qcoding_system_history
,
6697 default_coding_system
, Qnil
);
6698 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
6701 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
6703 doc
: /* Check validity of CODING-SYSTEM.
6704 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6706 Lisp_Object coding_system
;
6708 CHECK_SYMBOL (coding_system
);
6709 if (!NILP (Fcoding_system_p (coding_system
)))
6710 return coding_system
;
6712 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
6716 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6717 HIGHEST is nonzero, return the coding system of the highest
6718 priority among the detected coding systems. Otherwise return a
6719 list of detected coding systems sorted by their priorities. If
6720 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6721 multibyte form but contains only ASCII and eight-bit chars.
6722 Otherwise, the bytes are raw bytes.
6724 CODING-SYSTEM controls the detection as below:
6726 If it is nil, detect both text-format and eol-format. If the
6727 text-format part of CODING-SYSTEM is already specified
6728 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6729 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6730 detect only text-format. */
6733 detect_coding_system (src
, src_bytes
, highest
, multibytep
, coding_system
)
6735 int src_bytes
, highest
;
6737 Lisp_Object coding_system
;
6739 unsigned char *src_end
= src
+ src_bytes
;
6740 Lisp_Object attrs
, eol_type
;
6742 struct coding_system coding
;
6744 struct coding_detection_info detect_info
;
/* A nil CODING_SYSTEM means `undecided'.  */
6746 if (NILP (coding_system
))
6747 coding_system
= Qundecided
;
6748 setup_coding_system (coding_system
, &coding
);
6749 attrs
= CODING_ID_ATTRS (coding
.id
);
6750 eol_type
= CODING_ID_EOL_TYPE (coding
.id
);
6751 coding_system
= CODING_ATTR_BASE_NAME (attrs
);
/* Describe the whole byte sequence as one final block of source.  */
6753 coding
.source
= src
;
6754 coding
.src_bytes
= src_bytes
;
6755 coding
.src_multibyte
= multibytep
;
6756 coding
.consumed
= 0;
6757 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
/* Start with no category checked, found or rejected.  */
6759 detect_info
.checked
= detect_info
.found
= detect_info
.rejected
= 0;
6761 /* At first, detect text-format if necessary. */
6762 if (XINT (CODING_ATTR_CATEGORY (attrs
)) == coding_category_undecided
)
6764 enum coding_category category
;
6765 struct coding_system
*this;
/* Scan forward over the head of the text; the position where the
   scan stops is recorded in coding.head_ascii below.  */
6768 for (; src
< src_end
; src
++)
6772 || (c
< 0x20 && (c
== ISO_CODE_ESC
6774 || c
== ISO_CODE_SO
)))
6777 coding
.head_ascii
= src
- coding
.source
;
/* Try the detector of each category in priority order.  */
6780 for (i
= 0; i
< coding_category_raw_text
; i
++)
6782 category
= coding_priorities
[i
];
6783 this = coding_categories
+ category
;
6787 /* No coding system of this category is defined. */
6788 detect_info
.rejected
|= (1 << category
);
6790 else if (category
>= coding_category_raw_text
)
6792 else if (detect_info
.checked
& (1 << category
))
6795 && (detect_info
.found
& (1 << category
)))
6800 if ((*(this->detector
)) (&coding
, &detect_info
)
6802 && (detect_info
.found
& (1 << category
)))
/* Every category was rejected: fall back to raw-text.  */
6808 if (detect_info
.rejected
== CATEGORY_MASK_ANY
)
6810 detect_info
.found
= CATEGORY_MASK_RAW_TEXT
;
6811 id
= coding_categories
[coding_category_raw_text
].id
;
6812 val
= Fcons (make_number (id
), Qnil
);
/* Nothing was rejected and nothing was found: report `undecided'.  */
6814 else if (! detect_info
.rejected
&& ! detect_info
.found
)
6816 detect_info
.found
= CATEGORY_MASK_ANY
;
6817 id
= coding_categories
[coding_category_undecided
].id
;
6818 val
= Fcons (make_number (id
), Qnil
);
6822 if (detect_info
.found
)
6824 detect_info
.found
= 1 << category
;
6825 val
= Fcons (make_number (this->id
), Qnil
);
/* Otherwise pick a category that has not been rejected, searching in
   priority order.  */
6828 for (i
= 0; i
< coding_category_raw_text
; i
++)
6829 if (! (detect_info
.rejected
& (1 << coding_priorities
[i
])))
6831 detect_info
.found
= 1 << coding_priorities
[i
];
6832 id
= coding_categories
[coding_priorities
[i
]].id
;
6833 val
= Fcons (make_number (id
), Qnil
);
/* Build the full list.  Both loops run from the lowest priority
   upward and push onto the front of VAL, so the categories handled
   by the second loop (those actually found) end up first.  */
6839 int mask
= detect_info
.rejected
| detect_info
.found
;
/* First the categories that were neither rejected nor found ...  */
6843 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6845 category
= coding_priorities
[i
];
6846 if (! (mask
& (1 << category
)))
6848 found
|= 1 << category
;
6849 id
= coding_categories
[category
].id
;
6850 val
= Fcons (make_number (id
), val
);
/* ... then the categories that were found.  */
6853 for (i
= coding_category_raw_text
- 1; i
>= 0; i
--)
6855 category
= coding_priorities
[i
];
6856 if (detect_info
.found
& (1 << category
))
6858 id
= coding_categories
[category
].id
;
6859 val
= Fcons (make_number (id
), val
);
6862 detect_info
.found
|= found
;
/* The text-format was already specified: use it as is.  */
6867 detect_info
.found
= 1 << XINT (CODING_ATTR_CATEGORY (attrs
));
6868 val
= Fcons (make_number (coding
.id
), Qnil
);
6871 /* Then, detect eol-format if necessary. */
/* NOTE(review): utf_16_le_eol has no initializer here, unlike the
   two variables declared with it; it is only assigned inside the
   branches below.  */
6873 int normal_eol
= -1, utf_16_be_eol
= -1, utf_16_le_eol
;
/* The eol-format is undecided: run detect_eol once for each family
   of candidates present in detect_info.found.  */
6876 if (VECTORP (eol_type
))
6878 if (detect_info
.found
& ~CATEGORY_MASK_UTF_16
)
6879 normal_eol
= detect_eol (coding
.source
, src_bytes
,
6880 coding_category_raw_text
);
6881 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_BE
6882 | CATEGORY_MASK_UTF_16_BE_NOSIG
))
6883 utf_16_be_eol
= detect_eol (coding
.source
, src_bytes
,
6884 coding_category_utf_16_be
);
6885 if (detect_info
.found
& (CATEGORY_MASK_UTF_16_LE
6886 | CATEGORY_MASK_UTF_16_LE_NOSIG
))
6887 utf_16_le_eol
= detect_eol (coding
.source
, src_bytes
,
6888 coding_category_utf_16_le
);
/* The eol-format was specified: use it for every candidate.  */
6892 if (EQ (eol_type
, Qunix
))
6893 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_LF
;
6894 else if (EQ (eol_type
, Qdos
))
6895 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CRLF
;
6897 normal_eol
= utf_16_be_eol
= utf_16_le_eol
= EOL_SEEN_CR
;
/* Replace each coding system id in VAL by a coding system name,
   choosing the eol variant (elements 0, 1, 2 of the eol_type vector
   for LF, CRLF, CR) when one was determined.  */
6900 for (tail
= val
; CONSP (tail
); tail
= XCDR (tail
))
6902 enum coding_category category
;
6905 id
= XINT (XCAR (tail
));
6906 attrs
= CODING_ID_ATTRS (id
);
6907 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
6908 eol_type
= CODING_ID_EOL_TYPE (id
);
6909 if (VECTORP (eol_type
))
6911 if (category
== coding_category_utf_16_be
6912 || category
== coding_category_utf_16_be_nosig
)
6913 this_eol
= utf_16_be_eol
;
6914 else if (category
== coding_category_utf_16_le
6915 || category
== coding_category_utf_16_le_nosig
)
6916 this_eol
= utf_16_le_eol
;
6918 this_eol
= normal_eol
;
6920 if (this_eol
== EOL_SEEN_LF
)
6921 XSETCAR (tail
, AREF (eol_type
, 0));
6922 else if (this_eol
== EOL_SEEN_CRLF
)
6923 XSETCAR (tail
, AREF (eol_type
, 1));
6924 else if (this_eol
== EOL_SEEN_CR
)
6925 XSETCAR (tail
, AREF (eol_type
, 2));
6927 XSETCAR (tail
, CODING_ID_NAME (id
));
6930 XSETCAR (tail
, CODING_ID_NAME (id
));
/* With HIGHEST only the first element is returned.  */
6934 return (highest
? XCAR (val
) : val
);
6938 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
6940 doc
: /* Detect coding system of the text in the region between START and END.
6941 Return a list of possible coding systems ordered by priority.
6943 If only ASCII characters are found, it returns a list of single element
6944 `undecided' or its subsidiary coding system according to a detected
6947 If optional argument HIGHEST is non-nil, return the coding system of
6948 highest priority. */)
6949 (start
, end
, highest
)
6950 Lisp_Object start
, end
, highest
;
6953 int from_byte
, to_byte
;
6955 CHECK_NUMBER_COERCE_MARKER (start
);
6956 CHECK_NUMBER_COERCE_MARKER (end
);
6958 validate_region (&start
, &end
);
6959 from
= XINT (start
), to
= XINT (end
);
6960 from_byte
= CHAR_TO_BYTE (from
);
6961 to_byte
= CHAR_TO_BYTE (to
);
6963 if (from
< GPT
&& to
>= GPT
)
6964 move_gap_both (to
, to_byte
);
6966 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
6967 to_byte
- from_byte
,
6969 !NILP (current_buffer
6970 ->enable_multibyte_characters
),
6974 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
6976 doc
: /* Detect coding system of the text in STRING.
6977 Return a list of possible coding systems ordered by priority.
6979 If only ASCII characters are found, it returns a list of single element
6980 `undecided' or its subsidiary coding system according to a detected
6983 If optional argument HIGHEST is non-nil, return the coding system of
6984 highest priority. */)
6986 Lisp_Object string
, highest
;
6988 CHECK_STRING (string
);
6990 return detect_coding_system (XSTRING (string
)->data
,
6991 STRING_BYTES (XSTRING (string
)),
6993 STRING_MULTIBYTE (string
),
6999 char_encodable_p (c
, attrs
)
7004 struct charset
*charset
;
7006 for (tail
= CODING_ATTR_CHARSET_LIST (attrs
);
7007 CONSP (tail
); tail
= XCDR (tail
))
7009 charset
= CHARSET_FROM_ID (XINT (XCAR (tail
)));
7010 if (CHAR_CHARSET_P (c
, charset
))
7013 return (! NILP (tail
));
7017 /* Return a list of coding systems that safely encode the text between
7018 START and END. If EXCLUDE is non-nil, it is a list of coding
7019 systems not to check. The returned list doesn't contain any such
7020 coding systems. In any case, if the text contains only ASCII or is
7021 unibyte, return t. */
7023 DEFUN ("find-coding-systems-region-internal",
7024 Ffind_coding_systems_region_internal
,
7025 Sfind_coding_systems_region_internal
, 2, 3, 0,
7026 doc
: /* Internal use only. */)
7027 (start
, end
, exclude
)
7028 Lisp_Object start
, end
, exclude
;
7030 Lisp_Object coding_attrs_list
, safe_codings
;
7031 EMACS_INT start_byte
, end_byte
;
7032 const unsigned char *p
, *pbeg
, *pend
;
7034 Lisp_Object tail
, elt
;
7036 if (STRINGP (start
))
7038 if (!STRING_MULTIBYTE (start
)
7039 || XSTRING (start
)->size
== STRING_BYTES (XSTRING (start
)))
7042 end_byte
= STRING_BYTES (XSTRING (start
));
7046 CHECK_NUMBER_COERCE_MARKER (start
);
7047 CHECK_NUMBER_COERCE_MARKER (end
);
7048 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7049 args_out_of_range (start
, end
);
7050 if (NILP (current_buffer
->enable_multibyte_characters
))
7052 start_byte
= CHAR_TO_BYTE (XINT (start
));
7053 end_byte
= CHAR_TO_BYTE (XINT (end
));
7054 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7057 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7059 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7060 move_gap_both (XINT (start
), start_byte
);
7062 move_gap_both (XINT (end
), end_byte
);
7066 coding_attrs_list
= Qnil
;
7067 for (tail
= Vcoding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7069 || NILP (Fmemq (XCAR (tail
), exclude
)))
7073 attrs
= AREF (CODING_SYSTEM_SPEC (XCAR (tail
)), 0);
7074 if (EQ (XCAR (tail
), CODING_ATTR_BASE_NAME (attrs
))
7075 && ! EQ (CODING_ATTR_TYPE (attrs
), Qundecided
))
7076 coding_attrs_list
= Fcons (attrs
, coding_attrs_list
);
7079 if (STRINGP (start
))
7080 p
= pbeg
= XSTRING (start
)->data
;
7082 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7083 pend
= p
+ (end_byte
- start_byte
);
7085 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++;
7086 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7090 if (ASCII_BYTE_P (*p
))
7094 c
= STRING_CHAR_ADVANCE (p
);
7096 charset_map_loaded
= 0;
7097 for (tail
= coding_attrs_list
; CONSP (tail
);)
7102 else if (char_encodable_p (c
, elt
))
7104 else if (CONSP (XCDR (tail
)))
7106 XSETCAR (tail
, XCAR (XCDR (tail
)));
7107 XSETCDR (tail
, XCDR (XCDR (tail
)));
7111 XSETCAR (tail
, Qnil
);
7115 if (charset_map_loaded
)
7117 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7119 if (STRINGP (start
))
7120 pbeg
= XSTRING (start
)->data
;
7122 pbeg
= BYTE_POS_ADDR (start_byte
);
7123 p
= pbeg
+ p_offset
;
7124 pend
= pbeg
+ pend_offset
;
7129 safe_codings
= Qnil
;
7130 for (tail
= coding_attrs_list
; CONSP (tail
); tail
= XCDR (tail
))
7131 if (! NILP (XCAR (tail
)))
7132 safe_codings
= Fcons (CODING_ATTR_BASE_NAME (XCAR (tail
)), safe_codings
);
7134 return safe_codings
;
7138 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region
,
7139 Scheck_coding_systems_region
, 3, 3, 0,
7140 doc
: /* Check if the region is encodable by coding systems.
7142 START and END are buffer positions specifying the region.
7143 CODING-SYSTEM-LIST is a list of coding systems to check.
7145 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7146 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7147 whole region, POS0, POS1, ... are buffer positions where non-encodable
7148 characters are found.
7150 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7153 START may be a string. In that case, check if the string is
7154 encodable, and the value contains indices to the string instead of
7155 buffer positions. END is ignored. */)
7156 (start
, end
, coding_system_list
)
7157 Lisp_Object start
, end
, coding_system_list
;
7160 EMACS_INT start_byte
, end_byte
;
7162 const unsigned char *p
, *pbeg
, *pend
;
7164 Lisp_Object tail
, elt
;
7166 if (STRINGP (start
))
7168 if (!STRING_MULTIBYTE (start
)
7169 && XSTRING (start
)->size
!= STRING_BYTES (XSTRING (start
)))
7172 end_byte
= STRING_BYTES (XSTRING (start
));
7177 CHECK_NUMBER_COERCE_MARKER (start
);
7178 CHECK_NUMBER_COERCE_MARKER (end
);
7179 if (XINT (start
) < BEG
|| XINT (end
) > Z
|| XINT (start
) > XINT (end
))
7180 args_out_of_range (start
, end
);
7181 if (NILP (current_buffer
->enable_multibyte_characters
))
7183 start_byte
= CHAR_TO_BYTE (XINT (start
));
7184 end_byte
= CHAR_TO_BYTE (XINT (end
));
7185 if (XINT (end
) - XINT (start
) == end_byte
- start_byte
)
7188 if (XINT (start
) < GPT
&& XINT (end
) > GPT
)
7190 if ((GPT
- XINT (start
)) < (XINT (end
) - GPT
))
7191 move_gap_both (XINT (start
), start_byte
);
7193 move_gap_both (XINT (end
), end_byte
);
7199 for (tail
= coding_system_list
; CONSP (tail
); tail
= XCDR (tail
))
7202 list
= Fcons (Fcons (elt
, Fcons (AREF (CODING_SYSTEM_SPEC (elt
), 0),
7207 if (STRINGP (start
))
7208 p
= pbeg
= XSTRING (start
)->data
;
7210 p
= pbeg
= BYTE_POS_ADDR (start_byte
);
7211 pend
= p
+ (end_byte
- start_byte
);
7213 while (p
< pend
&& ASCII_BYTE_P (*p
)) p
++, pos
++;
7214 while (p
< pend
&& ASCII_BYTE_P (*(pend
- 1))) pend
--;
7218 if (ASCII_BYTE_P (*p
))
7222 c
= STRING_CHAR_ADVANCE (p
);
7224 charset_map_loaded
= 0;
7225 for (tail
= list
; CONSP (tail
); tail
= XCDR (tail
))
7227 elt
= XCDR (XCAR (tail
));
7228 if (! char_encodable_p (c
, XCAR (elt
)))
7229 XSETCDR (elt
, Fcons (make_number (pos
), XCDR (elt
)));
7231 if (charset_map_loaded
)
7233 EMACS_INT p_offset
= p
- pbeg
, pend_offset
= pend
- pbeg
;
7235 if (STRINGP (start
))
7236 pbeg
= XSTRING (start
)->data
;
7238 pbeg
= BYTE_POS_ADDR (start_byte
);
7239 p
= pbeg
+ p_offset
;
7240 pend
= pbeg
+ pend_offset
;
7248 for (; CONSP (tail
); tail
= XCDR (tail
))
7251 if (CONSP (XCDR (XCDR (elt
))))
7252 list
= Fcons (Fcons (XCAR (elt
), Fnreverse (XCDR (XCDR (elt
)))),
7262 code_convert_region (start
, end
, coding_system
, dst_object
, encodep
, norecord
)
7263 Lisp_Object start
, end
, coding_system
, dst_object
;
7264 int encodep
, norecord
;
7266 struct coding_system coding
;
7267 EMACS_INT from
, from_byte
, to
, to_byte
;
7268 Lisp_Object src_object
;
7270 CHECK_NUMBER_COERCE_MARKER (start
);
7271 CHECK_NUMBER_COERCE_MARKER (end
);
7272 if (NILP (coding_system
))
7273 coding_system
= Qno_conversion
;
7275 CHECK_CODING_SYSTEM (coding_system
);
7276 src_object
= Fcurrent_buffer ();
7277 if (NILP (dst_object
))
7278 dst_object
= src_object
;
7279 else if (! EQ (dst_object
, Qt
))
7280 CHECK_BUFFER (dst_object
);
7282 validate_region (&start
, &end
);
7283 from
= XFASTINT (start
);
7284 from_byte
= CHAR_TO_BYTE (from
);
7285 to
= XFASTINT (end
);
7286 to_byte
= CHAR_TO_BYTE (to
);
7288 setup_coding_system (coding_system
, &coding
);
7289 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7292 encode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7295 decode_coding_object (&coding
, src_object
, from
, from_byte
, to
, to_byte
,
7298 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7300 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7301 error ("Code conversion error: %d", coding
.result
);
7303 return (BUFFERP (dst_object
)
7304 ? make_number (coding
.produced_char
)
7305 : coding
.dst_object
);
7309 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
7310 3, 4, "r\nzCoding system: ",
7311 doc
: /* Decode the current region from the specified coding system.
7312 When called from a program, takes four arguments:
7313 START, END, CODING-SYSTEM, and DESTINATION.
7314 START and END are buffer positions.
7316 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7317 If nil, the region between START and END is replace by the decoded text.
7318 If buffer, the decoded text is inserted in the buffer.
7319 If t, the decoded text is returned.
7321 This function sets `last-coding-system-used' to the precise coding system
7322 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7323 not fully specified.)
7324 It returns the length of the decoded text. */)
7325 (start
, end
, coding_system
, destination
)
7326 Lisp_Object start
, end
, coding_system
, destination
;
7328 return code_convert_region (start
, end
, coding_system
, destination
, 0, 0);
7331 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
7332 3, 4, "r\nzCoding system: ",
7333 doc
: /* Encode the current region by specified coding system.
7334 When called from a program, takes three arguments:
7335 START, END, and CODING-SYSTEM. START and END are buffer positions.
7337 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7338 If nil, the region between START and END is replace by the encoded text.
7339 If buffer, the encoded text is inserted in the buffer.
7340 If t, the encoded text is returned.
7342 This function sets `last-coding-system-used' to the precise coding system
7343 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7344 not fully specified.)
7345 It returns the length of the encoded text. */)
7346 (start
, end
, coding_system
, destination
)
7347 Lisp_Object start
, end
, coding_system
, destination
;
7349 return code_convert_region (start
, end
, coding_system
, destination
, 1, 0);
7353 code_convert_string (string
, coding_system
, dst_object
,
7354 encodep
, nocopy
, norecord
)
7355 Lisp_Object string
, coding_system
, dst_object
;
7356 int encodep
, nocopy
, norecord
;
7358 struct coding_system coding
;
7359 EMACS_INT chars
, bytes
;
7361 CHECK_STRING (string
);
7362 if (NILP (coding_system
))
7365 Vlast_coding_system_used
= Qno_conversion
;
7366 if (NILP (dst_object
))
7367 return (nocopy
? Fcopy_sequence (string
) : string
);
7370 if (NILP (coding_system
))
7371 coding_system
= Qno_conversion
;
7373 CHECK_CODING_SYSTEM (coding_system
);
7374 if (NILP (dst_object
))
7376 else if (! EQ (dst_object
, Qt
))
7377 CHECK_BUFFER (dst_object
);
7379 setup_coding_system (coding_system
, &coding
);
7380 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
7381 chars
= XSTRING (string
)->size
;
7382 bytes
= STRING_BYTES (XSTRING (string
));
7384 encode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7386 decode_coding_object (&coding
, string
, 0, 0, chars
, bytes
, dst_object
);
7388 Vlast_coding_system_used
= CODING_ID_NAME (coding
.id
);
7390 if (coding
.result
!= CODING_RESULT_SUCCESS
)
7391 error ("Code conversion error: %d", coding
.result
);
7393 return (BUFFERP (dst_object
)
7394 ? make_number (coding
.produced_char
)
7395 : coding
.dst_object
);
7399 /* Encode or decode STRING according to CODING_SYSTEM.
7400 Do not set Vlast_coding_system_used.
7402 This function is called only from macros DECODE_FILE and
7403 ENCODE_FILE, thus we ignore character composition. */
7406 code_convert_string_norecord (string
, coding_system
, encodep
)
7407 Lisp_Object string
, coding_system
;
7410 return code_convert_string (string
, coding_system
, Qt
, encodep
, 0, 1);
7414 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
7416 doc
: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7418 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7419 if the decoding operation is trivial.
7421 Optional fourth arg BUFFER non-nil meant that the decoded text is
7422 inserted in BUFFER instead of returned as a string. In this case,
7423 the return value is BUFFER.
7425 This function sets `last-coding-system-used' to the precise coding system
7426 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7427 not fully specified. */)
7428 (string
, coding_system
, nocopy
, buffer
)
7429 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7431 return code_convert_string (string
, coding_system
, buffer
,
7432 0, ! NILP (nocopy
), 0);
7435 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
7437 doc
: /* Encode STRING to CODING-SYSTEM, and return the result.
7439 Optional third arg NOCOPY non-nil means it is OK to return STRING
7440 itself if the encoding operation is trivial.
7442 Optional fourth arg BUFFER non-nil meant that the encoded text is
7443 inserted in BUFFER instead of returned as a string. In this case,
7444 the return value is BUFFER.
7446 This function sets `last-coding-system-used' to the precise coding system
7447 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7448 not fully specified.) */)
7449 (string
, coding_system
, nocopy
, buffer
)
7450 Lisp_Object string
, coding_system
, nocopy
, buffer
;
7452 return code_convert_string (string
, coding_system
, buffer
,
7453 1, ! NILP (nocopy
), 1);
7457 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
7458 doc
: /* Decode a Japanese character which has CODE in shift_jis encoding.
7459 Return the corresponding character. */)
7463 Lisp_Object spec
, attrs
, val
;
7464 struct charset
*charset_roman
, *charset_kanji
, *charset_kana
, *charset
;
7467 CHECK_NATNUM (code
);
7468 c
= XFASTINT (code
);
7469 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7470 attrs
= AREF (spec
, 0);
7472 if (ASCII_BYTE_P (c
)
7473 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7476 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7477 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7478 charset_kana
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7479 charset_kanji
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7482 charset
= charset_roman
;
7483 else if (c
>= 0xA0 && c
< 0xDF)
7485 charset
= charset_kana
;
7490 int s1
= c
>> 8, s2
= c
& 0xFF;
7492 if (s1
< 0x81 || (s1
> 0x9F && s1
< 0xE0) || s1
> 0xEF
7493 || s2
< 0x40 || s2
== 0x7F || s2
> 0xFC)
7494 error ("Invalid code: %d", code
);
7496 charset
= charset_kanji
;
7498 c
= DECODE_CHAR (charset
, c
);
7500 error ("Invalid code: %d", code
);
7501 return make_number (c
);
7505 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
7506 doc
: /* Encode a Japanese character CHAR to shift_jis encoding.
7507 Return the corresponding code in SJIS. */)
7511 Lisp_Object spec
, attrs
, charset_list
;
7513 struct charset
*charset
;
7516 CHECK_CHARACTER (ch
);
7518 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system
, spec
);
7519 attrs
= AREF (spec
, 0);
7521 if (ASCII_CHAR_P (c
)
7522 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7525 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7526 charset
= char_charset (c
, charset_list
, &code
);
7527 if (code
== CHARSET_INVALID_CODE (charset
))
7528 error ("Can't encode by shift_jis encoding: %d", c
);
7531 return make_number (code
);
7534 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
7535 doc
: /* Decode a Big5 character which has CODE in BIG5 coding system.
7536 Return the corresponding character. */)
7540 Lisp_Object spec
, attrs
, val
;
7541 struct charset
*charset_roman
, *charset_big5
, *charset
;
7544 CHECK_NATNUM (code
);
7545 c
= XFASTINT (code
);
7546 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7547 attrs
= AREF (spec
, 0);
7549 if (ASCII_BYTE_P (c
)
7550 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7553 val
= CODING_ATTR_CHARSET_LIST (attrs
);
7554 charset_roman
= CHARSET_FROM_ID (XINT (XCAR (val
))), val
= XCDR (val
);
7555 charset_big5
= CHARSET_FROM_ID (XINT (XCAR (val
)));
7558 charset
= charset_roman
;
7561 int b1
= c
>> 8, b2
= c
& 0x7F;
7562 if (b1
< 0xA1 || b1
> 0xFE
7563 || b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE)
7564 error ("Invalid code: %d", code
);
7565 charset
= charset_big5
;
7567 c
= DECODE_CHAR (charset
, (unsigned )c
);
7569 error ("Invalid code: %d", code
);
7570 return make_number (c
);
7573 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
7574 doc
: /* Encode the Big5 character CHAR to BIG5 coding system.
7575 Return the corresponding character code in Big5. */)
7579 Lisp_Object spec
, attrs
, charset_list
;
7580 struct charset
*charset
;
7584 CHECK_CHARACTER (ch
);
7586 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system
, spec
);
7587 attrs
= AREF (spec
, 0);
7588 if (ASCII_CHAR_P (c
)
7589 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs
)))
7592 charset_list
= CODING_ATTR_CHARSET_LIST (attrs
);
7593 charset
= char_charset (c
, charset_list
, &code
);
7594 if (code
== CHARSET_INVALID_CODE (charset
))
7595 error ("Can't encode by Big5 encoding: %d", c
);
7597 return make_number (code
);
7601 DEFUN ("set-terminal-coding-system-internal",
7602 Fset_terminal_coding_system_internal
,
7603 Sset_terminal_coding_system_internal
, 1, 1, 0,
7604 doc
: /* Internal use only. */)
7606 Lisp_Object coding_system
;
7608 CHECK_SYMBOL (coding_system
);
7609 setup_coding_system (Fcheck_coding_system (coding_system
),
7612 /* We had better not send unsafe characters to terminal. */
7613 terminal_coding
.mode
|= CODING_MODE_SAFE_ENCODING
;
7614 /* Characer composition should be disabled. */
7615 terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7616 terminal_coding
.src_multibyte
= 1;
7617 terminal_coding
.dst_multibyte
= 0;
7621 DEFUN ("set-safe-terminal-coding-system-internal",
7622 Fset_safe_terminal_coding_system_internal
,
7623 Sset_safe_terminal_coding_system_internal
, 1, 1, 0,
7624 doc
: /* Internal use only. */)
7626 Lisp_Object coding_system
;
7628 CHECK_SYMBOL (coding_system
);
7629 setup_coding_system (Fcheck_coding_system (coding_system
),
7630 &safe_terminal_coding
);
7631 /* Characer composition should be disabled. */
7632 safe_terminal_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7633 safe_terminal_coding
.src_multibyte
= 1;
7634 safe_terminal_coding
.dst_multibyte
= 0;
7638 DEFUN ("terminal-coding-system",
7639 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
7640 doc
: /* Return coding system specified for terminal output. */)
7643 return CODING_ID_NAME (terminal_coding
.id
);
7646 DEFUN ("set-keyboard-coding-system-internal",
7647 Fset_keyboard_coding_system_internal
,
7648 Sset_keyboard_coding_system_internal
, 1, 1, 0,
7649 doc
: /* Internal use only. */)
7651 Lisp_Object coding_system
;
7653 CHECK_SYMBOL (coding_system
);
7654 setup_coding_system (Fcheck_coding_system (coding_system
),
7656 /* Characer composition should be disabled. */
7657 keyboard_coding
.common_flags
&= ~CODING_ANNOTATE_COMPOSITION_MASK
;
7661 DEFUN ("keyboard-coding-system",
7662 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
7663 doc
: /* Return coding system specified for decoding keyboard input. */)
7666 return CODING_ID_NAME (keyboard_coding
.id
);
7670 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
7671 Sfind_operation_coding_system
, 1, MANY
, 0,
7672 doc
: /* Choose a coding system for an operation based on the target name.
7673 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7674 DECODING-SYSTEM is the coding system to use for decoding
7675 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7676 for encoding (in case OPERATION does encoding).
7678 The first argument OPERATION specifies an I/O primitive:
7679 For file I/O, `insert-file-contents' or `write-region'.
7680 For process I/O, `call-process', `call-process-region', or `start-process'.
7681 For network I/O, `open-network-stream'.
7683 The remaining arguments should be the same arguments that were passed
7684 to the primitive. Depending on which primitive, one of those arguments
7685 is selected as the TARGET. For example, if OPERATION does file I/O,
7686 whichever argument specifies the file name is TARGET.
7688 TARGET has a meaning which depends on OPERATION:
7689 For file I/O, TARGET is a file name.
7690 For process I/O, TARGET is a process name.
7691 For network I/O, TARGET is a service name or a port number
7693 This function looks up what specified for TARGET in,
7694 `file-coding-system-alist', `process-coding-system-alist',
7695 or `network-coding-system-alist' depending on OPERATION.
7696 They may specify a coding system, a cons of coding systems,
7697 or a function symbol to call.
7698 In the last case, we call the function with one argument,
7699 which is a list of all the arguments given to this function.
7701 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7706 Lisp_Object operation
, target_idx
, target
, val
;
7707 register Lisp_Object chain
;
7710 error ("Too few arguments");
7711 operation
= args
[0];
7712 if (!SYMBOLP (operation
)
7713 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
7714 error ("Invalid first arguement");
7715 if (nargs
< 1 + XINT (target_idx
))
7716 error ("Too few arguments for operation: %s",
7717 XSYMBOL (operation
)->name
->data
);
7718 target
= args
[XINT (target_idx
) + 1];
7719 if (!(STRINGP (target
)
7720 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
7721 error ("Invalid %dth argument", XINT (target_idx
) + 1);
7723 chain
= ((EQ (operation
, Qinsert_file_contents
)
7724 || EQ (operation
, Qwrite_region
))
7725 ? Vfile_coding_system_alist
7726 : (EQ (operation
, Qopen_network_stream
)
7727 ? Vnetwork_coding_system_alist
7728 : Vprocess_coding_system_alist
));
7732 for (; CONSP (chain
); chain
= XCDR (chain
))
7738 && ((STRINGP (target
)
7739 && STRINGP (XCAR (elt
))
7740 && fast_string_match (XCAR (elt
), target
) >= 0)
7741 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
7744 /* Here, if VAL is both a valid coding system and a valid
7745 function symbol, we return VAL as a coding system. */
7748 if (! SYMBOLP (val
))
7750 if (! NILP (Fcoding_system_p (val
)))
7751 return Fcons (val
, val
);
7752 if (! NILP (Ffboundp (val
)))
7754 val
= call1 (val
, Flist (nargs
, args
));
7757 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
7758 return Fcons (val
, val
);
7766 DEFUN ("set-coding-system-priority", Fset_coding_system_priority
,
7767 Sset_coding_system_priority
, 0, MANY
, 0,
7768 doc
: /* Assign higher priority to the coding systems given as arguments.
7769 If multiple coding systems belongs to the same category,
7770 all but the first one are ignored. */)
7776 int changed
[coding_category_max
];
7777 enum coding_category priorities
[coding_category_max
];
7779 bzero (changed
, sizeof changed
);
7781 for (i
= j
= 0; i
< nargs
; i
++)
7783 enum coding_category category
;
7784 Lisp_Object spec
, attrs
;
7786 CHECK_CODING_SYSTEM_GET_SPEC (args
[i
], spec
);
7787 attrs
= AREF (spec
, 0);
7788 category
= XINT (CODING_ATTR_CATEGORY (attrs
));
7789 if (changed
[category
])
7790 /* Ignore this coding system because a coding system of the
7791 same category already had a higher priority. */
7793 changed
[category
] = 1;
7794 priorities
[j
++] = category
;
7795 if (coding_categories
[category
].id
>= 0
7796 && ! EQ (args
[i
], CODING_ID_NAME (coding_categories
[category
].id
)))
7797 setup_coding_system (args
[i
], &coding_categories
[category
]);
7798 Fset (AREF (Vcoding_category_table
, category
), args
[i
]);
7801 /* Now we have decided top J priorities. Reflect the order of the
7802 original priorities to the remaining priorities. */
7804 for (i
= j
, j
= 0; i
< coding_category_max
; i
++, j
++)
7806 while (j
< coding_category_max
7807 && changed
[coding_priorities
[j
]])
7809 if (j
== coding_category_max
)
7811 priorities
[i
] = coding_priorities
[j
];
7814 bcopy (priorities
, coding_priorities
, sizeof priorities
);
7816 /* Update `coding-category-list'. */
7817 Vcoding_category_list
= Qnil
;
7818 for (i
= coding_category_max
- 1; i
>= 0; i
--)
7819 Vcoding_category_list
7820 = Fcons (AREF (Vcoding_category_table
, priorities
[i
]),
7821 Vcoding_category_list
);
7826 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list
,
7827 Scoding_system_priority_list
, 0, 1, 0,
7828 doc
: /* Return a list of coding systems ordered by their priorities.
7829 HIGHESTP non-nil means just return the highest priority one. */)
7831 Lisp_Object highestp
;
7836 for (i
= 0, val
= Qnil
; i
< coding_category_max
; i
++)
7838 enum coding_category category
= coding_priorities
[i
];
7839 int id
= coding_categories
[category
].id
;
7844 attrs
= CODING_ID_ATTRS (id
);
7845 if (! NILP (highestp
))
7846 return CODING_ATTR_BASE_NAME (attrs
);
7847 val
= Fcons (CODING_ATTR_BASE_NAME (attrs
), val
);
7849 return Fnreverse (val
);
7852 static char *suffixes
[] = { "-unix", "-dos", "-mac" };
7855 make_subsidiaries (base
)
7858 Lisp_Object subsidiaries
;
7859 int base_name_len
= STRING_BYTES (XSYMBOL (base
)->name
);
7860 char *buf
= (char *) alloca (base_name_len
+ 6);
7863 bcopy (XSYMBOL (base
)->name
->data
, buf
, base_name_len
);
7864 subsidiaries
= Fmake_vector (make_number (3), Qnil
);
7865 for (i
= 0; i
< 3; i
++)
7867 bcopy (suffixes
[i
], buf
+ base_name_len
, strlen (suffixes
[i
]) + 1);
7868 ASET (subsidiaries
, i
, intern (buf
));
7870 return subsidiaries
;
7874 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal
,
7875 Sdefine_coding_system_internal
, coding_arg_max
, MANY
, 0,
7876 doc
: /* For internal use only.
7877 usage: (define-coding-system-internal ...) */)
7883 Lisp_Object spec_vec
; /* [ ATTRS ALIASE EOL_TYPE ] */
7884 Lisp_Object attrs
; /* Vector of attributes. */
7885 Lisp_Object eol_type
;
7886 Lisp_Object aliases
;
7887 Lisp_Object coding_type
, charset_list
, safe_charsets
;
7888 enum coding_category category
;
7889 Lisp_Object tail
, val
;
7890 int max_charset_id
= 0;
7893 if (nargs
< coding_arg_max
)
7896 attrs
= Fmake_vector (make_number (coding_attr_last_index
), Qnil
);
7898 name
= args
[coding_arg_name
];
7899 CHECK_SYMBOL (name
);
7900 CODING_ATTR_BASE_NAME (attrs
) = name
;
7902 val
= args
[coding_arg_mnemonic
];
7903 if (! STRINGP (val
))
7904 CHECK_CHARACTER (val
);
7905 CODING_ATTR_MNEMONIC (attrs
) = val
;
7907 coding_type
= args
[coding_arg_coding_type
];
7908 CHECK_SYMBOL (coding_type
);
7909 CODING_ATTR_TYPE (attrs
) = coding_type
;
7911 charset_list
= args
[coding_arg_charset_list
];
7912 if (SYMBOLP (charset_list
))
7914 if (EQ (charset_list
, Qiso_2022
))
7916 if (! EQ (coding_type
, Qiso_2022
))
7917 error ("Invalid charset-list");
7918 charset_list
= Viso_2022_charset_list
;
7920 else if (EQ (charset_list
, Qemacs_mule
))
7922 if (! EQ (coding_type
, Qemacs_mule
))
7923 error ("Invalid charset-list");
7924 charset_list
= Vemacs_mule_charset_list
;
7926 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7927 if (max_charset_id
< XFASTINT (XCAR (tail
)))
7928 max_charset_id
= XFASTINT (XCAR (tail
));
7932 charset_list
= Fcopy_sequence (charset_list
);
7933 for (tail
= charset_list
; !NILP (tail
); tail
= Fcdr (tail
))
7935 struct charset
*charset
;
7938 CHECK_CHARSET_GET_CHARSET (val
, charset
);
7939 if (EQ (coding_type
, Qiso_2022
)
7940 ? CHARSET_ISO_FINAL (charset
) < 0
7941 : EQ (coding_type
, Qemacs_mule
)
7942 ? CHARSET_EMACS_MULE_ID (charset
) < 0
7944 error ("Can't handle charset `%s'",
7945 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
7947 XCAR (tail
) = make_number (charset
->id
);
7948 if (max_charset_id
< charset
->id
)
7949 max_charset_id
= charset
->id
;
7952 CODING_ATTR_CHARSET_LIST (attrs
) = charset_list
;
7954 safe_charsets
= Fmake_string (make_number (max_charset_id
+ 1),
7956 for (tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
7957 XSTRING (safe_charsets
)->data
[XFASTINT (XCAR (tail
))] = 0;
7958 CODING_ATTR_SAFE_CHARSETS (attrs
) = safe_charsets
;
7960 CODING_ATTR_ASCII_COMPAT (attrs
) = args
[coding_arg_ascii_compatible_p
];
7962 val
= args
[coding_arg_decode_translation_table
];
7964 CHECK_CHAR_TABLE (val
);
7965 CODING_ATTR_DECODE_TBL (attrs
) = val
;
7967 val
= args
[coding_arg_encode_translation_table
];
7969 CHECK_CHAR_TABLE (val
);
7970 CODING_ATTR_ENCODE_TBL (attrs
) = val
;
7972 val
= args
[coding_arg_post_read_conversion
];
7974 CODING_ATTR_POST_READ (attrs
) = val
;
7976 val
= args
[coding_arg_pre_write_conversion
];
7978 CODING_ATTR_PRE_WRITE (attrs
) = val
;
7980 val
= args
[coding_arg_default_char
];
7982 CODING_ATTR_DEFAULT_CHAR (attrs
) = make_number (' ');
7985 CHECK_CHARACTER (val
);
7986 CODING_ATTR_DEFAULT_CHAR (attrs
) = val
;
7989 val
= args
[coding_arg_plist
];
7991 CODING_ATTR_PLIST (attrs
) = val
;
7993 if (EQ (coding_type
, Qcharset
))
7996 /* Generate a lisp vector of 256 elements. Each element is nil,
7997 integer, or a list of charset IDs.
7999 If Nth element is nil, the byte code N is invalid in this
8002 If Nth element is a number NUM, N is the first byte of a
8003 charset whose ID is NUM.
8005 If Nth element is a list of charset IDs, N is the first byte
8006 of one of them. The list is sorted by dimensions of the
8007 charsets. A charset of smaller dimension comes firtst.
8009 for (list
= Qnil
, tail
= charset_list
; CONSP (tail
); tail
= XCDR (tail
))
8011 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8013 if (charset
->method
== CHARSET_METHOD_SUPERSET
)
8015 val
= CHARSET_SUPERSET (charset
);
8016 for (; CONSP (val
); val
= XCDR (val
))
8017 list
= Fcons (XCAR (XCAR (val
)), list
);
8020 list
= Fcons (XCAR (tail
), list
);
8023 val
= Fmake_vector (make_number (256), Qnil
);
8025 for (tail
= Fnreverse (list
); CONSP (tail
); tail
= XCDR (tail
))
8027 struct charset
*charset
= CHARSET_FROM_ID (XFASTINT (XCAR (tail
)));
8028 int dim
= CHARSET_DIMENSION (charset
);
8029 int idx
= (dim
- 1) * 4;
8031 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8032 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8034 for (i
= charset
->code_space
[idx
];
8035 i
<= charset
->code_space
[idx
+ 1]; i
++)
8037 Lisp_Object tmp
, tmp2
;
8040 tmp
= AREF (val
, i
);
8043 else if (NUMBERP (tmp
))
8045 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp
)));
8047 tmp
= Fcons (XCAR (tail
), Fcons (tmp
, Qnil
));
8049 tmp
= Fcons (tmp
, Fcons (XCAR (tail
), Qnil
));
8053 for (tmp2
= tmp
; CONSP (tmp2
); tmp2
= XCDR (tmp2
))
8055 dim2
= CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2
))));
8060 tmp
= nconc2 (tmp
, Fcons (XCAR (tail
), Qnil
));
8063 XSETCDR (tmp2
, Fcons (XCAR (tmp2
), XCDR (tmp2
)));
8064 XSETCAR (tmp2
, XCAR (tail
));
8070 ASET (attrs
, coding_attr_charset_valids
, val
);
8071 category
= coding_category_charset
;
8073 else if (EQ (coding_type
, Qccl
))
8077 if (nargs
< coding_arg_ccl_max
)
8080 val
= args
[coding_arg_ccl_decoder
];
8081 CHECK_CCL_PROGRAM (val
);
8083 val
= Fcopy_sequence (val
);
8084 ASET (attrs
, coding_attr_ccl_decoder
, val
);
8086 val
= args
[coding_arg_ccl_encoder
];
8087 CHECK_CCL_PROGRAM (val
);
8089 val
= Fcopy_sequence (val
);
8090 ASET (attrs
, coding_attr_ccl_encoder
, val
);
8092 val
= args
[coding_arg_ccl_valids
];
8093 valids
= Fmake_string (make_number (256), make_number (0));
8094 for (tail
= val
; !NILP (tail
); tail
= Fcdr (tail
))
8101 from
= to
= XINT (val
);
8102 if (from
< 0 || from
> 255)
8103 args_out_of_range_3 (val
, make_number (0), make_number (255));
8108 CHECK_NUMBER (XCAR (val
));
8109 CHECK_NUMBER (XCDR (val
));
8110 from
= XINT (XCAR (val
));
8111 if (from
< 0 || from
> 255)
8112 args_out_of_range_3 (XCAR (val
),
8113 make_number (0), make_number (255));
8114 to
= XINT (XCDR (val
));
8115 if (to
< from
|| to
> 255)
8116 args_out_of_range_3 (XCDR (val
),
8117 XCAR (val
), make_number (255));
8119 for (i
= from
; i
<= to
; i
++)
8120 XSTRING (valids
)->data
[i
] = 1;
8122 ASET (attrs
, coding_attr_ccl_valids
, valids
);
8124 category
= coding_category_ccl
;
8126 else if (EQ (coding_type
, Qutf_16
))
8128 Lisp_Object bom
, endian
;
8130 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8132 if (nargs
< coding_arg_utf16_max
)
8135 bom
= args
[coding_arg_utf16_bom
];
8136 if (! NILP (bom
) && ! EQ (bom
, Qt
))
8139 CHECK_CODING_SYSTEM (XCAR (bom
));
8140 CHECK_CODING_SYSTEM (XCDR (bom
));
8142 ASET (attrs
, coding_attr_utf_16_bom
, bom
);
8144 endian
= args
[coding_arg_utf16_endian
];
8145 CHECK_SYMBOL (endian
);
8148 else if (! EQ (endian
, Qbig
) && ! EQ (endian
, Qlittle
))
8149 error ("Invalid endian: %s", XSYMBOL (endian
)->name
->data
);
8150 ASET (attrs
, coding_attr_utf_16_endian
, endian
);
8152 category
= (CONSP (bom
)
8153 ? coding_category_utf_16_auto
8155 ? (EQ (endian
, Qbig
)
8156 ? coding_category_utf_16_be_nosig
8157 : coding_category_utf_16_le_nosig
)
8158 : (EQ (endian
, Qbig
)
8159 ? coding_category_utf_16_be
8160 : coding_category_utf_16_le
));
8162 else if (EQ (coding_type
, Qiso_2022
))
8164 Lisp_Object initial
, reg_usage
, request
, flags
;
8167 if (nargs
< coding_arg_iso2022_max
)
8170 initial
= Fcopy_sequence (args
[coding_arg_iso2022_initial
]);
8171 CHECK_VECTOR (initial
);
8172 for (i
= 0; i
< 4; i
++)
8174 val
= Faref (initial
, make_number (i
));
8177 struct charset
*charset
;
8179 CHECK_CHARSET_GET_CHARSET (val
, charset
);
8180 ASET (initial
, i
, make_number (CHARSET_ID (charset
)));
8181 if (i
== 0 && CHARSET_ASCII_COMPATIBLE_P (charset
))
8182 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8185 ASET (initial
, i
, make_number (-1));
8188 reg_usage
= args
[coding_arg_iso2022_reg_usage
];
8189 CHECK_CONS (reg_usage
);
8190 CHECK_NATNUM (XCAR (reg_usage
));
8191 CHECK_NATNUM (XCDR (reg_usage
));
8193 request
= Fcopy_sequence (args
[coding_arg_iso2022_request
]);
8194 for (tail
= request
; ! NILP (tail
); tail
= Fcdr (tail
))
8200 CHECK_CHARSET_GET_ID (XCAR (val
), id
);
8201 CHECK_NATNUM (XCDR (val
));
8202 if (XINT (XCDR (val
)) >= 4)
8203 error ("Invalid graphic register number: %d", XINT (XCDR (val
)));
8204 XCAR (val
) = make_number (id
);
8207 flags
= args
[coding_arg_iso2022_flags
];
8208 CHECK_NATNUM (flags
);
8210 if (EQ (args
[coding_arg_charset_list
], Qiso_2022
))
8211 flags
= make_number (i
| CODING_ISO_FLAG_FULL_SUPPORT
);
8213 ASET (attrs
, coding_attr_iso_initial
, initial
);
8214 ASET (attrs
, coding_attr_iso_usage
, reg_usage
);
8215 ASET (attrs
, coding_attr_iso_request
, request
);
8216 ASET (attrs
, coding_attr_iso_flags
, flags
);
8217 setup_iso_safe_charsets (attrs
);
8219 if (i
& CODING_ISO_FLAG_SEVEN_BITS
)
8220 category
= ((i
& (CODING_ISO_FLAG_LOCKING_SHIFT
8221 | CODING_ISO_FLAG_SINGLE_SHIFT
))
8222 ? coding_category_iso_7_else
8223 : EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8224 ? coding_category_iso_7
8225 : coding_category_iso_7_tight
);
8228 int id
= XINT (AREF (initial
, 1));
8230 category
= (((i
& CODING_ISO_FLAG_LOCKING_SHIFT
)
8231 || EQ (args
[coding_arg_charset_list
], Qiso_2022
)
8233 ? coding_category_iso_8_else
8234 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id
)) == 1)
8235 ? coding_category_iso_8_1
8236 : coding_category_iso_8_2
);
8238 if (category
!= coding_category_iso_8_1
8239 && category
!= coding_category_iso_8_2
)
8240 CODING_ATTR_ASCII_COMPAT (attrs
) = Qnil
;
8242 else if (EQ (coding_type
, Qemacs_mule
))
8244 if (EQ (args
[coding_arg_charset_list
], Qemacs_mule
))
8245 ASET (attrs
, coding_attr_emacs_mule_full
, Qt
);
8246 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8247 category
= coding_category_emacs_mule
;
8249 else if (EQ (coding_type
, Qshift_jis
))
8252 struct charset
*charset
;
8254 if (XINT (Flength (charset_list
)) != 3)
8255 error ("There should be just three charsets");
8257 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8258 if (CHARSET_DIMENSION (charset
) != 1)
8259 error ("Dimension of charset %s is not one",
8260 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8261 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8262 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8264 charset_list
= XCDR (charset_list
);
8265 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8266 if (CHARSET_DIMENSION (charset
) != 1)
8267 error ("Dimension of charset %s is not one",
8268 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8270 charset_list
= XCDR (charset_list
);
8271 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8272 if (CHARSET_DIMENSION (charset
) != 2)
8273 error ("Dimension of charset %s is not two",
8274 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8276 category
= coding_category_sjis
;
8277 Vsjis_coding_system
= name
;
8279 else if (EQ (coding_type
, Qbig5
))
8281 struct charset
*charset
;
8283 if (XINT (Flength (charset_list
)) != 2)
8284 error ("There should be just two charsets");
8286 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8287 if (CHARSET_DIMENSION (charset
) != 1)
8288 error ("Dimension of charset %s is not one",
8289 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8290 if (CHARSET_ASCII_COMPATIBLE_P (charset
))
8291 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8293 charset_list
= XCDR (charset_list
);
8294 charset
= CHARSET_FROM_ID (XINT (XCAR (charset_list
)));
8295 if (CHARSET_DIMENSION (charset
) != 2)
8296 error ("Dimension of charset %s is not two",
8297 XSYMBOL (CHARSET_NAME (charset
))->name
->data
);
8299 category
= coding_category_big5
;
8300 Vbig5_coding_system
= name
;
8302 else if (EQ (coding_type
, Qraw_text
))
8304 category
= coding_category_raw_text
;
8305 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8307 else if (EQ (coding_type
, Qutf_8
))
8309 category
= coding_category_utf_8
;
8310 CODING_ATTR_ASCII_COMPAT (attrs
) = Qt
;
8312 else if (EQ (coding_type
, Qundecided
))
8313 category
= coding_category_undecided
;
8315 error ("Invalid coding system type: %s",
8316 XSYMBOL (coding_type
)->name
->data
);
8318 CODING_ATTR_CATEGORY (attrs
) = make_number (category
);
8319 CODING_ATTR_PLIST (attrs
)
8320 = Fcons (QCcategory
, Fcons (AREF (Vcoding_category_table
, category
),
8321 CODING_ATTR_PLIST (attrs
)));
8323 eol_type
= args
[coding_arg_eol_type
];
8324 if (! NILP (eol_type
)
8325 && ! EQ (eol_type
, Qunix
)
8326 && ! EQ (eol_type
, Qdos
)
8327 && ! EQ (eol_type
, Qmac
))
8328 error ("Invalid eol-type");
8330 aliases
= Fcons (name
, Qnil
);
8332 if (NILP (eol_type
))
8334 eol_type
= make_subsidiaries (name
);
8335 for (i
= 0; i
< 3; i
++)
8337 Lisp_Object this_spec
, this_name
, this_aliases
, this_eol_type
;
8339 this_name
= AREF (eol_type
, i
);
8340 this_aliases
= Fcons (this_name
, Qnil
);
8341 this_eol_type
= (i
== 0 ? Qunix
: i
== 1 ? Qdos
: Qmac
);
8342 this_spec
= Fmake_vector (make_number (3), attrs
);
8343 ASET (this_spec
, 1, this_aliases
);
8344 ASET (this_spec
, 2, this_eol_type
);
8345 Fputhash (this_name
, this_spec
, Vcoding_system_hash_table
);
8346 Vcoding_system_list
= Fcons (this_name
, Vcoding_system_list
);
8347 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (this_name
), Qnil
),
8348 Vcoding_system_alist
);
8352 spec_vec
= Fmake_vector (make_number (3), attrs
);
8353 ASET (spec_vec
, 1, aliases
);
8354 ASET (spec_vec
, 2, eol_type
);
8356 Fputhash (name
, spec_vec
, Vcoding_system_hash_table
);
8357 Vcoding_system_list
= Fcons (name
, Vcoding_system_list
);
8358 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (name
), Qnil
),
8359 Vcoding_system_alist
);
8362 int id
= coding_categories
[category
].id
;
8364 if (id
< 0 || EQ (name
, CODING_ID_NAME (id
)))
8365 setup_coding_system (name
, &coding_categories
[category
]);
8371 return Fsignal (Qwrong_number_of_arguments
,
8372 Fcons (intern ("define-coding-system-internal"),
8373 make_number (nargs
)));
8376 /* Fixme: should this record the alias relationships for
8377 diagnostics? Should it update coding-system-list? */
8378 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias
,
8379 Sdefine_coding_system_alias
, 2, 2, 0,
8380 doc
: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8381 (alias
, coding_system
)
8382 Lisp_Object alias
, coding_system
;
8384 Lisp_Object spec
, aliases
, eol_type
;
8386 CHECK_SYMBOL (alias
);
8387 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8388 aliases
= AREF (spec
, 1);
8389 while (!NILP (XCDR (aliases
)))
8390 aliases
= XCDR (aliases
);
8391 XCDR (aliases
) = Fcons (alias
, Qnil
);
8393 eol_type
= AREF (spec
, 2);
8394 if (VECTORP (eol_type
))
8396 Lisp_Object subsidiaries
;
8399 subsidiaries
= make_subsidiaries (alias
);
8400 for (i
= 0; i
< 3; i
++)
8401 Fdefine_coding_system_alias (AREF (subsidiaries
, i
),
8402 AREF (eol_type
, i
));
8404 ASET (spec
, 2, subsidiaries
);
8407 Fputhash (alias
, spec
, Vcoding_system_hash_table
);
8408 Vcoding_system_alist
= Fcons (Fcons (Fsymbol_name (alias
), Qnil
),
8409 Vcoding_system_alist
);
8414 DEFUN ("coding-system-base", Fcoding_system_base
, Scoding_system_base
,
8416 doc
: /* Return the base of CODING-SYSTEM.
8417 Any alias or subsidiary coding system is not a base coding system. */)
8419 Lisp_Object coding_system
;
8421 Lisp_Object spec
, attrs
;
8423 if (NILP (coding_system
))
8424 return (Qno_conversion
);
8425 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8426 attrs
= AREF (spec
, 0);
8427 return CODING_ATTR_BASE_NAME (attrs
);
8430 DEFUN ("coding-system-plist", Fcoding_system_plist
, Scoding_system_plist
,
8432 doc
: "Return the property list of CODING-SYSTEM.")
8434 Lisp_Object coding_system
;
8436 Lisp_Object spec
, attrs
;
8438 if (NILP (coding_system
))
8439 coding_system
= Qno_conversion
;
8440 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8441 attrs
= AREF (spec
, 0);
8442 return CODING_ATTR_PLIST (attrs
);
8446 DEFUN ("coding-system-aliases", Fcoding_system_aliases
, Scoding_system_aliases
,
8448 doc
: /* Return the list of aliases of CODING-SYSTEM. */)
8450 Lisp_Object coding_system
;
8454 if (NILP (coding_system
))
8455 coding_system
= Qno_conversion
;
8456 CHECK_CODING_SYSTEM_GET_SPEC (coding_system
, spec
);
8457 return AREF (spec
, 1);
8460 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type
,
8461 Scoding_system_eol_type
, 1, 1, 0,
8462 doc
: /* Return eol-type of CODING-SYSTEM.
8463 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8465 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8466 and CR respectively.
8468 A vector value indicates that a format of end-of-line should be
8469 detected automatically. Nth element of the vector is the subsidiary
8470 coding system whose eol-type is N. */)
8472 Lisp_Object coding_system
;
8474 Lisp_Object spec
, eol_type
;
8477 if (NILP (coding_system
))
8478 coding_system
= Qno_conversion
;
8479 if (! CODING_SYSTEM_P (coding_system
))
8481 spec
= CODING_SYSTEM_SPEC (coding_system
);
8482 eol_type
= AREF (spec
, 2);
8483 if (VECTORP (eol_type
))
8484 return Fcopy_sequence (eol_type
);
8485 n
= EQ (eol_type
, Qunix
) ? 0 : EQ (eol_type
, Qdos
) ? 1 : 2;
8486 return make_number (n
);
8492 /*** 9. Post-amble ***/
8499 for (i
= 0; i
< coding_category_max
; i
++)
8501 coding_categories
[i
].id
= -1;
8502 coding_priorities
[i
] = i
;
8505 /* ISO2022 specific initialize routine. */
8506 for (i
= 0; i
< 0x20; i
++)
8507 iso_code_class
[i
] = ISO_control_0
;
8508 for (i
= 0x21; i
< 0x7F; i
++)
8509 iso_code_class
[i
] = ISO_graphic_plane_0
;
8510 for (i
= 0x80; i
< 0xA0; i
++)
8511 iso_code_class
[i
] = ISO_control_1
;
8512 for (i
= 0xA1; i
< 0xFF; i
++)
8513 iso_code_class
[i
] = ISO_graphic_plane_1
;
8514 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
8515 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
8516 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
8517 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
8518 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
8519 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
8520 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
8521 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
8522 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
8523 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
8525 inhibit_pre_post_conversion
= 0;
8527 for (i
= 0; i
< 256; i
++)
8529 emacs_mule_bytes
[i
] = 1;
8531 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_11
] = 3;
8532 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_12
] = 3;
8533 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_21
] = 4;
8534 emacs_mule_bytes
[EMACS_MULE_LEADING_CODE_PRIVATE_22
] = 4;
8542 staticpro (&Vcoding_system_hash_table
);
8543 Vcoding_system_hash_table
= Fmakehash (Qeq
);
8545 staticpro (&Vsjis_coding_system
);
8546 Vsjis_coding_system
= Qnil
;
8548 staticpro (&Vbig5_coding_system
);
8549 Vbig5_coding_system
= Qnil
;
8551 staticpro (&Vcode_conversion_work_buf_list
);
8552 Vcode_conversion_work_buf_list
= Qnil
;
8554 staticpro (&Vcode_conversion_reused_work_buf
);
8555 Vcode_conversion_reused_work_buf
= Qnil
;
8557 DEFSYM (Qcharset
, "charset");
8558 DEFSYM (Qtarget_idx
, "target-idx");
8559 DEFSYM (Qcoding_system_history
, "coding-system-history");
8560 Fset (Qcoding_system_history
, Qnil
);
8562 /* Target FILENAME is the first argument. */
8563 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
8564 /* Target FILENAME is the third argument. */
8565 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
8567 DEFSYM (Qcall_process
, "call-process");
8568 /* Target PROGRAM is the first argument. */
8569 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
8571 DEFSYM (Qcall_process_region
, "call-process-region");
8572 /* Target PROGRAM is the third argument. */
8573 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
8575 DEFSYM (Qstart_process
, "start-process");
8576 /* Target PROGRAM is the third argument. */
8577 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
8579 DEFSYM (Qopen_network_stream
, "open-network-stream");
8580 /* Target SERVICE is the fourth argument. */
8581 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
8583 DEFSYM (Qcoding_system
, "coding-system");
8584 DEFSYM (Qcoding_aliases
, "coding-aliases");
8586 DEFSYM (Qeol_type
, "eol-type");
8587 DEFSYM (Qunix
, "unix");
8588 DEFSYM (Qdos
, "dos");
8590 DEFSYM (Qbuffer_file_coding_system
, "buffer-file-coding-system");
8591 DEFSYM (Qpost_read_conversion
, "post-read-conversion");
8592 DEFSYM (Qpre_write_conversion
, "pre-write-conversion");
8593 DEFSYM (Qdefault_char
, "default-char");
8594 DEFSYM (Qundecided
, "undecided");
8595 DEFSYM (Qno_conversion
, "no-conversion");
8596 DEFSYM (Qraw_text
, "raw-text");
8598 DEFSYM (Qiso_2022
, "iso-2022");
8600 DEFSYM (Qutf_8
, "utf-8");
8602 DEFSYM (Qutf_16
, "utf-16");
8603 DEFSYM (Qbig
, "big");
8604 DEFSYM (Qlittle
, "little");
8606 DEFSYM (Qshift_jis
, "shift-jis");
8607 DEFSYM (Qbig5
, "big5");
8609 DEFSYM (Qcoding_system_p
, "coding-system-p");
8611 DEFSYM (Qcoding_system_error
, "coding-system-error");
8612 Fput (Qcoding_system_error
, Qerror_conditions
,
8613 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
8614 Fput (Qcoding_system_error
, Qerror_message
,
8615 build_string ("Invalid coding system"));
8617 /* Intern this now in case it isn't already done.
8618 Setting this variable twice is harmless.
8619 But don't staticpro it here--that is done in alloc.c. */
8620 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
8622 DEFSYM (Qtranslation_table
, "translation-table");
8623 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
8624 DEFSYM (Qtranslation_table_id
, "translation-table-id");
8625 DEFSYM (Qtranslation_table_for_decode
, "translation-table-for-decode");
8626 DEFSYM (Qtranslation_table_for_encode
, "translation-table-for-encode");
8628 DEFSYM (Qvalid_codes
, "valid-codes");
8630 DEFSYM (Qemacs_mule
, "emacs-mule");
8632 DEFSYM (QCcategory
, ":category");
8634 Vcoding_category_table
8635 = Fmake_vector (make_number (coding_category_max
), Qnil
);
8636 staticpro (&Vcoding_category_table
);
8637 /* Followings are target of code detection. */
8638 ASET (Vcoding_category_table
, coding_category_iso_7
,
8639 intern ("coding-category-iso-7"));
8640 ASET (Vcoding_category_table
, coding_category_iso_7_tight
,
8641 intern ("coding-category-iso-7-tight"));
8642 ASET (Vcoding_category_table
, coding_category_iso_8_1
,
8643 intern ("coding-category-iso-8-1"));
8644 ASET (Vcoding_category_table
, coding_category_iso_8_2
,
8645 intern ("coding-category-iso-8-2"));
8646 ASET (Vcoding_category_table
, coding_category_iso_7_else
,
8647 intern ("coding-category-iso-7-else"));
8648 ASET (Vcoding_category_table
, coding_category_iso_8_else
,
8649 intern ("coding-category-iso-8-else"));
8650 ASET (Vcoding_category_table
, coding_category_utf_8
,
8651 intern ("coding-category-utf-8"));
8652 ASET (Vcoding_category_table
, coding_category_utf_16_be
,
8653 intern ("coding-category-utf-16-be"));
8654 ASET (Vcoding_category_table
, coding_category_utf_16_auto
,
8655 intern ("coding-category-utf-16-auto"));
8656 ASET (Vcoding_category_table
, coding_category_utf_16_le
,
8657 intern ("coding-category-utf-16-le"));
8658 ASET (Vcoding_category_table
, coding_category_utf_16_be_nosig
,
8659 intern ("coding-category-utf-16-be-nosig"));
8660 ASET (Vcoding_category_table
, coding_category_utf_16_le_nosig
,
8661 intern ("coding-category-utf-16-le-nosig"));
8662 ASET (Vcoding_category_table
, coding_category_charset
,
8663 intern ("coding-category-charset"));
8664 ASET (Vcoding_category_table
, coding_category_sjis
,
8665 intern ("coding-category-sjis"));
8666 ASET (Vcoding_category_table
, coding_category_big5
,
8667 intern ("coding-category-big5"));
8668 ASET (Vcoding_category_table
, coding_category_ccl
,
8669 intern ("coding-category-ccl"));
8670 ASET (Vcoding_category_table
, coding_category_emacs_mule
,
8671 intern ("coding-category-emacs-mule"));
8672 /* Followings are NOT target of code detection. */
8673 ASET (Vcoding_category_table
, coding_category_raw_text
,
8674 intern ("coding-category-raw-text"));
8675 ASET (Vcoding_category_table
, coding_category_undecided
,
8676 intern ("coding-category-undecided"));
8678 defsubr (&Scoding_system_p
);
8679 defsubr (&Sread_coding_system
);
8680 defsubr (&Sread_non_nil_coding_system
);
8681 defsubr (&Scheck_coding_system
);
8682 defsubr (&Sdetect_coding_region
);
8683 defsubr (&Sdetect_coding_string
);
8684 defsubr (&Sfind_coding_systems_region_internal
);
8685 defsubr (&Scheck_coding_systems_region
);
8686 defsubr (&Sdecode_coding_region
);
8687 defsubr (&Sencode_coding_region
);
8688 defsubr (&Sdecode_coding_string
);
8689 defsubr (&Sencode_coding_string
);
8690 defsubr (&Sdecode_sjis_char
);
8691 defsubr (&Sencode_sjis_char
);
8692 defsubr (&Sdecode_big5_char
);
8693 defsubr (&Sencode_big5_char
);
8694 defsubr (&Sset_terminal_coding_system_internal
);
8695 defsubr (&Sset_safe_terminal_coding_system_internal
);
8696 defsubr (&Sterminal_coding_system
);
8697 defsubr (&Sset_keyboard_coding_system_internal
);
8698 defsubr (&Skeyboard_coding_system
);
8699 defsubr (&Sfind_operation_coding_system
);
8700 defsubr (&Sset_coding_system_priority
);
8701 defsubr (&Sdefine_coding_system_internal
);
8702 defsubr (&Sdefine_coding_system_alias
);
8703 defsubr (&Scoding_system_base
);
8704 defsubr (&Scoding_system_plist
);
8705 defsubr (&Scoding_system_aliases
);
8706 defsubr (&Scoding_system_eol_type
);
8707 defsubr (&Scoding_system_priority_list
);
8709 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
8710 doc
: /* List of coding systems.
8712 Do not alter the value of this variable manually. This variable should be
8713 updated by the functions `define-coding-system' and
8714 `define-coding-system-alias'. */);
8715 Vcoding_system_list
= Qnil
;
8717 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
8718 doc
: /* Alist of coding system names.
8719 Each element is one element list of coding system name.
8720 This variable is given to `completing-read' as TABLE argument.
8722 Do not alter the value of this variable manually. This variable should be
8723 updated by the functions `make-coding-system' and
8724 `define-coding-system-alias'. */);
8725 Vcoding_system_alist
= Qnil
;
8727 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
8728 doc
: /* List of coding-categories (symbols) ordered by priority.
8730 On detecting a coding system, Emacs tries code detection algorithms
8731 associated with each coding-category one by one in this order. When
8732 one algorithm agrees with a byte sequence of source text, the coding
8733 system bound to the corresponding coding-category is selected. */);
8737 Vcoding_category_list
= Qnil
;
8738 for (i
= coding_category_max
- 1; i
>= 0; i
--)
8739 Vcoding_category_list
8740 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
8741 Vcoding_category_list
);
8744 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
8745 doc
: /* Specify the coding system for read operations.
8746 It is useful to bind this variable with `let', but do not set it globally.
8747 If the value is a coding system, it is used for decoding on read operation.
8748 If not, an appropriate element is used from one of the coding system alists:
8749 There are three such tables, `file-coding-system-alist',
8750 `process-coding-system-alist', and `network-coding-system-alist'. */);
8751 Vcoding_system_for_read
= Qnil
;
8753 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
8754 doc
: /* Specify the coding system for write operations.
8755 Programs bind this variable with `let', but you should not set it globally.
8756 If the value is a coding system, it is used for encoding of output,
8757 when writing it to a file and when sending it to a file or subprocess.
8759 If this does not specify a coding system, an appropriate element
8760 is used from one of the coding system alists:
8761 There are three such tables, `file-coding-system-alist',
8762 `process-coding-system-alist', and `network-coding-system-alist'.
8763 For output to files, if the above procedure does not specify a coding system,
8764 the value of `buffer-file-coding-system' is used. */);
8765 Vcoding_system_for_write
= Qnil
;
8767 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
8769 Coding system used in the latest file or process I/O. */);
8770 Vlast_coding_system_used
= Qnil
;
8772 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
8774 *Non-nil means always inhibit code conversion of end-of-line format.
8775 See info node `Coding Systems' and info node `Text and Binary' concerning
8776 such conversion. */);
8777 inhibit_eol_conversion
= 0;
8779 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
8781 Non-nil means process buffer inherits coding system of process output.
8782 Bind it to t if the process output is to be treated as if it were a file
8783 read from some filesystem. */);
8784 inherit_process_coding_system
= 0;
8786 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
8788 Alist to decide a coding system to use for a file I/O operation.
8789 The format is ((PATTERN . VAL) ...),
8790 where PATTERN is a regular expression matching a file name,
8791 VAL is a coding system, a cons of coding systems, or a function symbol.
8792 If VAL is a coding system, it is used for both decoding and encoding
8794 If VAL is a cons of coding systems, the car part is used for decoding,
8795 and the cdr part is used for encoding.
8796 If VAL is a function symbol, the function must return a coding system
8797 or a cons of coding systems which are used as above. The function gets
8798 the arguments with which `find-operation-coding-systems' was called.
8800 See also the function `find-operation-coding-system'
8801 and the variable `auto-coding-alist'. */);
8802 Vfile_coding_system_alist
= Qnil
;
8804 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
8806 Alist to decide a coding system to use for a process I/O operation.
8807 The format is ((PATTERN . VAL) ...),
8808 where PATTERN is a regular expression matching a program name,
8809 VAL is a coding system, a cons of coding systems, or a function symbol.
8810 If VAL is a coding system, it is used for both decoding what received
8811 from the program and encoding what sent to the program.
8812 If VAL is a cons of coding systems, the car part is used for decoding,
8813 and the cdr part is used for encoding.
8814 If VAL is a function symbol, the function must return a coding system
8815 or a cons of coding systems which are used as above.
8817 See also the function `find-operation-coding-system'. */);
8818 Vprocess_coding_system_alist
= Qnil
;
8820 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
8822 Alist to decide a coding system to use for a network I/O operation.
8823 The format is ((PATTERN . VAL) ...),
8824 where PATTERN is a regular expression matching a network service name
8825 or is a port number to connect to,
8826 VAL is a coding system, a cons of coding systems, or a function symbol.
8827 If VAL is a coding system, it is used for both decoding what received
8828 from the network stream and encoding what sent to the network stream.
8829 If VAL is a cons of coding systems, the car part is used for decoding,
8830 and the cdr part is used for encoding.
8831 If VAL is a function symbol, the function must return a coding system
8832 or a cons of coding systems which are used as above.
8834 See also the function `find-operation-coding-system'. */);
8835 Vnetwork_coding_system_alist
= Qnil
;
8837 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
8838 doc
: /* Coding system to use with system messages.
8839 Also used for decoding keyboard input on X Window system. */);
8840 Vlocale_coding_system
= Qnil
;
8842 /* The eol mnemonics are reset in startup.el system-dependently. */
8843 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
8845 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8846 eol_mnemonic_unix
= build_string (":");
8848 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
8850 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8851 eol_mnemonic_dos
= build_string ("\\");
8853 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
8855 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8856 eol_mnemonic_mac
= build_string ("/");
8858 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
8860 *String displayed in mode line when end-of-line format is not yet determined. */);
8861 eol_mnemonic_undecided
= build_string (":");
8863 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
8865 *Non-nil enables character translation while encoding and decoding. */);
8866 Venable_character_translation
= Qt
;
8868 DEFVAR_LISP ("standard-translation-table-for-decode",
8869 &Vstandard_translation_table_for_decode
,
8870 doc
: /* Table for translating characters while decoding. */);
8871 Vstandard_translation_table_for_decode
= Qnil
;
8873 DEFVAR_LISP ("standard-translation-table-for-encode",
8874 &Vstandard_translation_table_for_encode
,
8875 doc
: /* Table for translating characters while encoding. */);
8876 Vstandard_translation_table_for_encode
= Qnil
;
8878 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table
,
8879 doc
: /* Alist of charsets vs revision numbers.
8880 While encoding, if a charset (car part of an element) is found,
8881 designate it with the escape sequence identifying revision (cdr part
8882 of the element). */);
8883 Vcharset_revision_table
= Qnil
;
8885 DEFVAR_LISP ("default-process-coding-system",
8886 &Vdefault_process_coding_system
,
8887 doc
: /* Cons of coding systems used for process I/O by default.
8888 The car part is used for decoding a process output,
8889 the cdr part is used for encoding a text to be sent to a process. */);
8890 Vdefault_process_coding_system
= Qnil
;
8892 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
8894 Table of extra Latin codes in the range 128..159 (inclusive).
8895 This is a vector of length 256.
8896 If Nth element is non-nil, the existence of code N in a file
8897 \(or output of subprocess) doesn't prevent it to be detected as
8898 a coding system of ISO 2022 variant which has a flag
8899 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8900 or reading output of a subprocess.
8901 Only 128th through 159th elements has a meaning. */);
8902 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
8904 DEFVAR_LISP ("select-safe-coding-system-function",
8905 &Vselect_safe_coding_system_function
,
8907 Function to call to select safe coding system for encoding a text.
8909 If set, this function is called to force a user to select a proper
8910 coding system which can encode the text in the case that a default
8911 coding system used in each operation can't encode the text.
8913 The default value is `select-safe-coding-system' (which see). */);
8914 Vselect_safe_coding_system_function
= Qnil
;
8916 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8917 &inhibit_iso_escape_detection
,
8919 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8921 By default, on reading a file, Emacs tries to detect how the text is
8922 encoded. This code detection is sensitive to escape sequences. If
8923 the sequence is valid as ISO2022, the code is determined as one of
8924 the ISO2022 encodings, and the file is decoded by the corresponding
8925 coding system (e.g. `iso-2022-7bit').
8927 However, there may be a case that you want to read escape sequences in
8928 a file as is. In such a case, you can set this variable to non-nil.
8929 Then, as the code detection ignores any escape sequences, no file is
8930 detected as encoded in some ISO2022 encoding. The result is that all
8931 escape sequences become visible in a buffer.
8933 The default value is nil, and it is strongly recommended not to change
8934 it. That is because many Emacs Lisp source files that contain
8935 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8936 in Emacs's distribution, and they won't be decoded correctly on
8937 reading if you suppress escape sequence detection.
8939 The other way to read escape sequences in a file without decoding is
8940 to explicitly specify some coding system that doesn't use ISO2022's
8941 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8942 inhibit_iso_escape_detection
= 0;
8945 Lisp_Object args
[coding_arg_max
];
8946 Lisp_Object plist
[14];
8949 for (i
= 0; i
< coding_arg_max
; i
++)
8952 plist
[0] = intern (":name");
8953 plist
[1] = args
[coding_arg_name
] = Qno_conversion
;
8954 plist
[2] = intern (":mnemonic");
8955 plist
[3] = args
[coding_arg_mnemonic
] = make_number ('=');
8956 plist
[4] = intern (":coding-type");
8957 plist
[5] = args
[coding_arg_coding_type
] = Qraw_text
;
8958 plist
[6] = intern (":ascii-compatible-p");
8959 plist
[7] = args
[coding_arg_ascii_compatible_p
] = Qt
;
8960 plist
[8] = intern (":default-char");
8961 plist
[9] = args
[coding_arg_default_char
] = make_number (0);
8962 plist
[10] = intern (":docstring");
8963 plist
[11] = build_string ("Do no conversion.\n\
8965 When you visit a file with this coding, the file is read into a\n\
8966 unibyte buffer as is, thus each byte of a file is treated as a\n\
8968 plist
[12] = intern (":eol-type");
8969 plist
[13] = args
[coding_arg_eol_type
] = Qunix
;
8970 args
[coding_arg_plist
] = Flist (14, plist
);
8971 Fdefine_coding_system_internal (coding_arg_max
, args
);
8974 setup_coding_system (Qno_conversion
, &keyboard_coding
);
8975 setup_coding_system (Qno_conversion
, &terminal_coding
);
8976 setup_coding_system (Qno_conversion
, &safe_terminal_coding
);
8981 for (i
= 0; i
< coding_category_max
; i
++)
8982 Fset (AREF (Vcoding_category_table
, i
), Qno_conversion
);
8987 emacs_strerror (error_number
)
8992 synchronize_system_messages_locale ();
8993 str
= strerror (error_number
);
8995 if (! NILP (Vlocale_coding_system
))
8997 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
8998 Vlocale_coding_system
,
9000 str
= (char *) XSTRING (dec
)->data
;