code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010, 2011
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on the operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we set up a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (struct coding_system *coding,
 158                    struct coding_detection_info *detect_info)
 159 {
 160   const unsigned char *src = coding->source;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172       /* Reject on a non-conforming byte; remember positive evidence.  */
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (struct coding_system *coding)
 206 {
 207   const unsigned char *src = coding->source + coding->consumed;
 208   const unsigned char *src_end = coding->source + coding->src_bytes;
 209   /* SRC_BASE remembers the start position in source in each loop.
 210      The loop will be exited when there's not enough source code, or
 211      when there's no room in CHARBUF for a decoded character.  */
 212   const unsigned char *src_base;
 213   /* A buffer to produce decoded characters.  */
 214   int *charbuf = coding->charbuf + coding->charbuf_used;
 215   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 216   int multibytep = coding->src_multibyte;
 217
 218   while (1)
 219     {
 220       src_base = src;
 221       if (charbuf >= charbuf_end)
 222         /* No more room to produce a decoded character.  */
 223         break;
 224       ONE_MORE_BYTE (c);
 225       /* Decode it. */
 226     }
 227
 228  no_more_source:
 229   if (src_base < src_end
 230       && coding->mode & CODING_MODE_LAST_BLOCK)
 231     /* If the source ends by partial bytes to construct a character,
 232        treat them as eight-bit raw data.  */
 233     while (src_base < src_end && charbuf < charbuf_end)
 234       *charbuf++ = *src_base++;
 235   /* Remember how many bytes and characters we consumed.  If the
 236      source is multibyte, the bytes and chars are not identical.  */
 237   coding->consumed = coding->consumed_char = src_base - coding->source;
 238   /* Remember how many characters we produced.  */
 239   coding->charbuf_used = charbuf - coding->charbuf;
 240 }
 241 #endif
 242
 243 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 244
 245   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 246   internal multibyte format by CODING.  The resulting byte sequence
 247   goes to a place pointed to by DESTINATION, the length of which
 248   should not exceed DST_BYTES.
 249
 250   These functions set the information of original and encoded texts in
 251   the members produced, produced_char, consumed, and consumed_char of
 252   the structure *CODING.  They also set the member result to one of
 253   CODING_RESULT_XXX indicating how the encoding finished.
 254
 255   DST_BYTES zero means that source area and destination area are
 256   overlapped, which means that we can produce an encoded text until it
 257   reaches the head of the not-yet-encoded source text.
 258
 259   Below is a template of these functions.  */
 260 #if 0
 261 static void
 262 encode_coding_XXX (struct coding_system *coding)
 263 {
 264   int multibytep = coding->dst_multibyte;
 265   int *charbuf = coding->charbuf;
 266   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 267   unsigned char *dst = coding->destination + coding->produced;
 268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 269   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 270   int produced_chars = 0;
 271
 272   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 273     {
 274       int c = *charbuf;
 275       /* Encode C into DST, and increment DST.  */
 276     }
 277  label_no_more_destination:
 278   /* How many chars and bytes we produced.  */
 279   coding->produced_char += produced_chars;
 280   coding->produced = dst - coding->destination;
 281 }
 282 #endif
 283
 284 \f
 285 /*** 1. Preamble ***/
 286
 287 #include <config.h>
 288 #include <stdio.h>
 289 #include <setjmp.h>
 290
 291 #include "lisp.h"
 292 #include "buffer.h"
 293 #include "character.h"
 294 #include "charset.h"
 295 #include "ccl.h"
 296 #include "composite.h"
 297 #include "coding.h"
 298 #include "window.h"
 299 #include "frame.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317 Lisp_Object QCascii_compatible_p;
 318
 319 Lisp_Object Qcall_process, Qcall_process_region;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 /* If a symbol has this property, evaluate the value to define the
 327    symbol as a coding system.  */
 328 static Lisp_Object Qcoding_system_define_form;
 329
 330 int coding_system_require_warning;
 331
 332 Lisp_Object Vselect_safe_coding_system_function;
 333
 334 /* Mnemonic string for each format of end-of-line.  */
 335 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 336 /* Mnemonic string to indicate format of end-of-line is not yet
 337    decided.  */
 338 Lisp_Object eol_mnemonic_undecided;
 339
 340 /* Format of end-of-line decided by system.  This is Qunix on
 341    Unix and Mac, Qdos on DOS/Windows.
 342    This has an effect only for external encoding (i.e. for output to
 343    file and process), not for in-buffer or Lisp string encoding.  */
 344 static Lisp_Object system_eol_type;
 345
 346 #ifdef emacs
 347
 348 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 349
 350 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 351
 352 /* Coding system emacs-mule and raw-text are for converting only
 353    end-of-line format.  */
 354 Lisp_Object Qemacs_mule, Qraw_text;
 355 Lisp_Object Qutf_8_emacs;
 356
 357 /* Coding-systems are handed between Emacs Lisp programs and C internal
 358    routines by the following three variables.  */
 359 /* Coding-system for reading files and receiving data from process.  */
 360 Lisp_Object Vcoding_system_for_read;
 361 /* Coding-system for writing files and sending data to process.  */
 362 Lisp_Object Vcoding_system_for_write;
 363 /* Coding-system actually used in the latest I/O.  */
 364 Lisp_Object Vlast_coding_system_used;
 365 /* Set to non-nil when an error is detected while code conversion.  */
 366 Lisp_Object Vlast_code_conversion_error;
 367 /* A vector of length 256 which contains information about special
 368    Latin codes (especially for dealing with Microsoft codes).  */
 369 Lisp_Object Vlatin_extra_code_table;
 370
 371 /* Flag to inhibit code conversion of end-of-line format.  */
 372 int inhibit_eol_conversion;
 373
 374 /* Flag to inhibit ISO2022 escape sequence detection.  */
 375 int inhibit_iso_escape_detection;
 376
 377 /* Flag to inhibit detection of binary files through null bytes.  */
 378 int inhibit_null_byte_detection;
 379
 380 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 381 int inherit_process_coding_system;
 382
 383 /* Coding system to be used to encode text for terminal display when
 384    terminal coding system is nil.  */
 385 struct coding_system safe_terminal_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems.  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422 /* Element REG of the coding_attr_iso_initial attribute, as an integer.  */
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427 /* Entry of CODING->safe_charsets for CHARSET_ID, or -1 if CHARSET_ID
 428    exceeds CODING->max_charset_id or the entry is 255.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)          \
 430   (((charset_id) <= (coding)->max_charset_id            \
 431     ? ((coding)->safe_charsets[charset_id] != 255       \
 432        ? (coding)->safe_charsets[charset_id]            \
 433        : -1)                                            \
 434     : -1))
 435
 436 /* Accessors for the ISO2022 state kept in CODING->spec.iso_2022.  */
 437 #define CODING_ISO_FLAGS(coding)        \
 438   ((coding)->spec.iso_2022.flags)
 439 #define CODING_ISO_DESIGNATION(coding, reg)     \
 440   ((coding)->spec.iso_2022.current_designation[reg])
 441 #define CODING_ISO_INVOCATION(coding, plane)    \
 442   ((coding)->spec.iso_2022.current_invocation[plane])
 443 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 444   ((coding)->spec.iso_2022.single_shifting)
 445 #define CODING_ISO_BOL(coding)  \
 446   ((coding)->spec.iso_2022.bol)
 447 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 448   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 449 #define CODING_ISO_CMP_STATUS(coding)   \
 450   (&(coding)->spec.iso_2022.cmp_status)
 451 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 452   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 453 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 454   ((coding)->spec.iso_2022.embedded_utf_8)
 455
 456 /* Control characters of ISO2022.  */
 457                         /* code */      /* function */
 458 #define ISO_CODE_LF     0x0A            /* line-feed */
 459 #define ISO_CODE_CR     0x0D            /* carriage-return */
 460 #define ISO_CODE_SO     0x0E            /* shift-out */
 461 #define ISO_CODE_SI     0x0F            /* shift-in */
 462 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 463 #define ISO_CODE_ESC    0x1B            /* escape */
 464 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 465 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 466 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 467
 468 /* Every 1-byte code of ISO2022 is classified into one of the
 469    following classes.  */
 470 enum iso_code_class_type
 471   {
 472     ISO_control_0,              /* Control codes in the range
 473                                    0x00..0x1F and 0x7F, except for the
 474                                    following 4 codes.  */
 475     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 476     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 477     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 478     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 479     ISO_control_1,              /* Control codes in the range
 480                                    0x80..0x9F, except for the
 481                                    following 3 codes.  */
 482     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 483     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 484     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 485     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 486     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 487     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 488     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 489   };
 490
 491 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 492     `iso-flags' attribute of an iso2022 coding system.  */
 493
 494 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 495    instead of the correct short-form sequence (e.g. ESC $ A).  */
 496 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 497
 498 /* If set, reset graphic planes and registers at end-of-line to the
 499    initial state.  */
 500 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 501
 502 /* If set, reset graphic planes and registers before any control
 503    characters to the initial state.  */
 504 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 505
 506 /* If set, encode by 7-bit environment.  */
 507 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 508
 509 /* If set, use locking-shift function.  */
 510 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 511
 512 /* If set, use single-shift function.  Overwrite
 513    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 514 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 515
 516 /* If set, use designation escape sequence.  */
 517 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 518
 519 /* If set, produce revision number sequence.  */
 520 #define CODING_ISO_FLAG_REVISION        0x0080
 521
 522 /* If set, produce ISO6429's direction specifying sequence.  */
 523 #define CODING_ISO_FLAG_DIRECTION       0x0100
 524
 525 /* If set, assume designation states are reset at beginning of line on
 526    output.  */
 527 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 528
 529 /* If set, designation sequence should be placed at beginning of line
 530    on output.  */
 531 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 532
 533 /* If set, do not encode unsafe characters on output.  */
 534 #define CODING_ISO_FLAG_SAFE            0x0800
 535
 536 /* If set, extra latin codes (128..159) are accepted as a valid code
 537    on input.  */
 538 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 539 /* NOTE(review): the next five flags have no description here.  */
 540 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 541
 542 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 543
 544 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 545
 546 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 547 /* Note: bits 0x20000..0x80000 are not assigned by the macros here.  */
 548 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 549
 550 /* A character to be produced on output if encoding of the original
 551    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 552 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 553
 554 /* UTF-8 section: accessor into CODING->spec.  */
 555 #define CODING_UTF_8_BOM(coding)        \
 556   ((coding)->spec.utf_8_bom)
 557
 558 /* UTF-16 section: accessors into CODING->spec.utf_16.  */
 559 #define CODING_UTF_16_BOM(coding)       \
 560   ((coding)->spec.utf_16.bom)
 561
 562 #define CODING_UTF_16_ENDIAN(coding)    \
 563   ((coding)->spec.utf_16.endian)
 564
 565 #define CODING_UTF_16_SURROGATE(coding) \
 566   ((coding)->spec.utf_16.surrogate)
 567
 568
 569 /* CCL section: these read attributes found via CODING_ID_ATTRS.  */
 570 #define CODING_CCL_DECODER(coding)      \
 571   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 572 #define CODING_CCL_ENCODER(coding)      \
 573   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 574 #define CODING_CCL_VALIDS(coding)                                          \
 575   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 576
 577 /* Index for each coding category in `coding_categories'.  Each value
 578    is also the bit position of the matching CATEGORY_MASK_XXX flag.  */
 579 enum coding_category
 580   {
 581     coding_category_iso_7,
 582     coding_category_iso_7_tight,
 583     coding_category_iso_8_1,
 584     coding_category_iso_8_2,
 585     coding_category_iso_7_else,
 586     coding_category_iso_8_else,
 587     coding_category_utf_8_auto,
 588     coding_category_utf_8_nosig,
 589     coding_category_utf_8_sig,
 590     coding_category_utf_16_auto,
 591     coding_category_utf_16_be,
 592     coding_category_utf_16_le,
 593     coding_category_utf_16_be_nosig,
 594     coding_category_utf_16_le_nosig,
 595     coding_category_charset,
 596     coding_category_sjis,
 597     coding_category_big5,
 598     coding_category_ccl,
 599     coding_category_emacs_mule,
 600     /* All above are targets of code detection.  */
 601     coding_category_raw_text,
 602     coding_category_undecided,
 603     coding_category_max
 604   };
 605
 606 /* Definitions of flag bits used in detect_coding_XXXX.  */
 607 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 608 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 609 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 610 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 611 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 612 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 613 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 614 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 615 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 616 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 617 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 618 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 619 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 620 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 621 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 622 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 623 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 624 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 625 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 626 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 627 /* All detectable categories (CATEGORY_MASK_RAW_TEXT is excluded).
 628    This value is returned if detect_coding_mask () finds nothing other
 629    than ASCII characters.  */
 630 #define CATEGORY_MASK_ANY               \
 631   (CATEGORY_MASK_ISO_7                  \
 632    | CATEGORY_MASK_ISO_7_TIGHT          \
 633    | CATEGORY_MASK_ISO_8_1              \
 634    | CATEGORY_MASK_ISO_8_2              \
 635    | CATEGORY_MASK_ISO_7_ELSE           \
 636    | CATEGORY_MASK_ISO_8_ELSE           \
 637    | CATEGORY_MASK_UTF_8_AUTO           \
 638    | CATEGORY_MASK_UTF_8_NOSIG          \
 639    | CATEGORY_MASK_UTF_8_SIG            \
 640    | CATEGORY_MASK_UTF_16_AUTO          \
 641    | CATEGORY_MASK_UTF_16_BE            \
 642    | CATEGORY_MASK_UTF_16_LE            \
 643    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 644    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 645    | CATEGORY_MASK_CHARSET              \
 646    | CATEGORY_MASK_SJIS                 \
 647    | CATEGORY_MASK_BIG5                 \
 648    | CATEGORY_MASK_CCL                  \
 649    | CATEGORY_MASK_EMACS_MULE)
 650
 651
 652 #define CATEGORY_MASK_ISO_7BIT \
 653   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 654
 655 #define CATEGORY_MASK_ISO_8BIT \
 656   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 657
 658 #define CATEGORY_MASK_ISO_ELSE \
 659   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 660 /* The ISO categories other than ISO_8_1 and ISO_8_2.  */
 661 #define CATEGORY_MASK_ISO_ESCAPE        \
 662   (CATEGORY_MASK_ISO_7                  \
 663    | CATEGORY_MASK_ISO_7_TIGHT          \
 664    | CATEGORY_MASK_ISO_7_ELSE           \
 665    | CATEGORY_MASK_ISO_8_ELSE)
 666
 667 #define CATEGORY_MASK_ISO       \
 668   (  CATEGORY_MASK_ISO_7BIT     \
 669      | CATEGORY_MASK_ISO_8BIT   \
 670      | CATEGORY_MASK_ISO_ELSE)
 671
 672 #define CATEGORY_MASK_UTF_16            \
 673   (CATEGORY_MASK_UTF_16_AUTO            \
 674    | CATEGORY_MASK_UTF_16_BE            \
 675    | CATEGORY_MASK_UTF_16_LE            \
 676    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 677    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 678
 679 #define CATEGORY_MASK_UTF_8     \
 680   (CATEGORY_MASK_UTF_8_AUTO     \
 681    | CATEGORY_MASK_UTF_8_NOSIG  \
 682    | CATEGORY_MASK_UTF_8_SIG)
 683
 684 /* List of symbols `coding-category-xxx' ordered by priority.  This
 685    variable is exposed to Emacs Lisp.  */
 686 static Lisp_Object Vcoding_category_list;
 687
 688 /* Table of coding categories (Lisp symbols).  This variable is for
 689    internal use only.  */
 690 static Lisp_Object Vcoding_category_table;
 691
 692 /* Table of coding-categories ordered by priority.  */
 693 static enum coding_category coding_priorities[coding_category_max];
 694
 695 /* Nth element is a coding context for the coding system bound to the
 696    Nth coding category.  */
 697 static struct coding_system coding_categories[coding_category_max];
 698
 699 /*** Commonly used macros and functions ***/
 700 /* Note: min and max evaluate their arguments more than once.  */
 701 #ifndef min
 702 #define min(a, b) ((a) < (b) ? (a) : (b))
 703 #endif
 704 #ifndef max
 705 #define max(a, b) ((a) > (b) ? (a) : (b))
 706 #endif
 707 /* Set ATTRS and CHARSET_LIST from the coding system CODING->id.  */
 708 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 709   do {                                                  \
 710     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 711     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 712   } while (0)
 713
 714
 715 /* Safely get one byte from the source text pointed by SRC which ends
 716    at SRC_END, and set C to that byte.  If there are not enough bytes
 717    in the source, it jumps to `no_more_source'.  If multibytep is
 718    nonzero, and a multibyte character is found at SRC, set C to the
 719    negative value of the character code.  The caller should declare
 720    and set these variables appropriately in advance:
 721         src, src_end, src_base, multibytep, consumed_chars, coding
 722    CONSUMED_CHARS is incremented each time C is set.  */
 723 #define ONE_MORE_BYTE(c)                                \
 724   do {                                                  \
 725     if (src == src_end)                                 \
 726       {                                                 \
 727         if (src_base < src)                             \
 728           record_conversion_result                      \
 729             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 730         goto no_more_source;                            \
 731       }                                                 \
 732     c = *src++;                                         \
 733     if (multibytep && (c & 0x80))                       \
 734       {                                                 \
 735         if ((c & 0xFE) == 0xC0)                         \
 736           c = ((c & 1) << 6) | *src++;                  \
 737         else                                            \
 738           {                                             \
 739             src--;                                      \
 740             c = - string_char (src, &src, NULL);        \
 741             record_conversion_result                    \
 742               (coding, CODING_RESULT_INVALID_SRC);      \
 743           }                                             \
 744       }                                                 \
 745     consumed_chars++;                                   \
 746   } while (0)
 747
 748 /* Safely get two bytes from the source text pointed by SRC which ends
 749    at SRC_END, and set C1 and C2 to those bytes while skipping the
 750    heading multibyte characters.  If there are not enough bytes in the
 751    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 752    a multibyte character is found for C2, set C2 to -1 (the character
 753    code is not computed).  The caller should declare and set these
 754    variables appropriately in advance:
 755         src, src_end, multibytep
 756    It is intended that this macro is used in detect_coding_utf_16.  */
 757
 758 #define TWO_MORE_BYTES(c1, c2)                          \
 759   do {                                                  \
 760     do {                                                \
 761       if (src == src_end)                               \
 762         goto no_more_source;                            \
 763       c1 = *src++;                                      \
 764       if (multibytep && (c1 & 0x80))                    \
 765         {                                               \
 766           if ((c1 & 0xFE) == 0xC0)                      \
 767             c1 = ((c1 & 1) << 6) | *src++;              \
 768           else                                          \
 769             {                                           \
 770               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 771               c1 = -1;                                  \
 772             }                                           \
 773         }                                               \
 774     } while (c1 < 0);                                   \
 775     if (src == src_end)                                 \
 776       goto no_more_source;                              \
 777     c2 = *src++;                                        \
 778     if (multibytep && (c2 & 0x80))                      \
 779       {                                                 \
 780         if ((c2 & 0xFE) == 0xC0)                        \
 781           c2 = ((c2 & 1) << 6) | *src++;                \
 782         else                                            \
 783           c2 = -1;                                      \
 784       }                                                 \
 785   } while (0)
 786
 787
/* Fetch the next byte at SRC into C, advance SRC, and increment
   CONSUMED_CHARS, without first comparing SRC against SRC_END.  If
   multibytep is nonzero and the byte has its high bit set, then:
     - a leading byte 0xC0 or 0xC1 is folded together with the
       following byte into one 8-bit value;
     - otherwise C is set to the negated value returned by
       string_char (which also advances SRC) and
       CODING_RESULT_INVALID_SRC is recorded.
   The caller should declare and set these variables appropriately in
   advance:
        src, multibytep, consumed_chars, coding  */
#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            /* Back up to the leading byte so that      \
               string_char sees the whole character.  */ \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
 805
 806
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  The caller should
   ensure that C is 0..127, and declare and set the variables `dst'
   and `produced_chars' appropriately in advance.  The byte is stored
   as is, with no check on its value.  */


#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars++;           \
    *dst++ = (c);               \
  } while (0)
 819
 820
 821 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 822
 823 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 824   do {                                  \
 825     produced_chars += 2;                \
 826     *dst++ = (c1), *dst++ = (c2);       \
 827   } while (0)
 828
 829
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
   nonzero, store it in multibyte form instead: a value of 0x80 or
   more is first converted with BYTE8_TO_CHAR, and the result is
   written with CHAR_STRING_ADVANCE.  The caller should declare and
   set the variables `dst', `multibytep' and `produced_chars'
   appropriately in advance.  */

#define EMIT_ONE_BYTE(c)                \
  do {                                  \
    produced_chars++;                   \
    if (multibytep)                     \
      {                                 \
        int ch = (c);                   \
        if (ch >= 0x80)                 \
          ch = BYTE8_TO_CHAR (ch);      \
        CHAR_STRING_ADVANCE (ch, dst);  \
      }                                 \
    else                                \
      *dst++ = (c);                     \
  } while (0)
 849
 850
 851 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 852
 853 #define EMIT_TWO_BYTES(c1, c2)          \
 854   do {                                  \
 855     produced_chars += 2;                \
 856     if (multibytep)                     \
 857       {                                 \
 858         int ch;                         \
 859                                         \
 860         ch = (c1);                      \
 861         if (ch >= 0x80)                 \
 862           ch = BYTE8_TO_CHAR (ch);      \
 863         CHAR_STRING_ADVANCE (ch, dst);  \
 864         ch = (c2);                      \
 865         if (ch >= 0x80)                 \
 866           ch = BYTE8_TO_CHAR (ch);      \
 867         CHAR_STRING_ADVANCE (ch, dst);  \
 868       }                                 \
 869     else                                \
 870       {                                 \
 871         *dst++ = (c1);                  \
 872         *dst++ = (c2);                  \
 873       }                                 \
 874   } while (0)
 875
 876
/* Emit three bytes in order: C1 through EMIT_ONE_BYTE, then C2 and C3
   through EMIT_TWO_BYTES.  The variables required by those macros
   must be in scope.  */
#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 882
 883
/* Emit four bytes in order as two EMIT_TWO_BYTES pairs: (C1, C2) and
   then (C3, C4).  The variables required by EMIT_TWO_BYTES must be in
   scope.  */
#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 889
 890
/* Prototypes for static functions.  */

/* Recording of the result code of a conversion.  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);

/* UTF-8 handlers.  */
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

/* UTF-16 handlers.  */
static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

/* ISO2022 handlers.  */
static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

/* emacs-mule handlers.  */
static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

/* Shift-JIS handlers.  */
static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

/* BIG5 handlers.  */
static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

/* CCL handlers.  */
static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

/* raw-text handlers (there is no detector prototype for these).  */
static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Maintenance of the source and destination pointers, and growing of
   the destination.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        EMACS_INT, EMACS_INT);
static unsigned char *alloc_destination (struct coding_system *,
                                         EMACS_INT, unsigned char *);

/* Remaining file-local helpers.  */
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, int *,
                                                 unsigned char *);
static int detect_eol (const unsigned char *,
                       EMACS_INT, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static INLINE void produce_charset (struct coding_system *, int *,
                                    EMACS_INT);
static void produce_annotation (struct coding_system *, EMACS_INT);
static int decode_coding (struct coding_system *);
static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *);
static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
                                              struct coding_system *,
                                              int *, EMACS_INT *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static INLINE int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 966
 967 static void
 968 record_conversion_result (struct coding_system *coding,
 969                           enum coding_result_code result)
 970 {
 971   coding->result = result;
 972   switch (result)
 973     {
 974     case CODING_RESULT_INSUFFICIENT_SRC:
 975       Vlast_code_conversion_error = Qinsufficient_source;
 976       break;
 977     case CODING_RESULT_INCONSISTENT_EOL:
 978       Vlast_code_conversion_error = Qinconsistent_eol;
 979       break;
 980     case CODING_RESULT_INVALID_SRC:
 981       Vlast_code_conversion_error = Qinvalid_source;
 982       break;
 983     case CODING_RESULT_INTERRUPT:
 984       Vlast_code_conversion_error = Qinterrupted;
 985       break;
 986     case CODING_RESULT_INSUFFICIENT_MEM:
 987       Vlast_code_conversion_error = Qinsufficient_memory;
 988       break;
 989     case CODING_RESULT_INSUFFICIENT_DST:
 990       /* Don't record this error in Vlast_code_conversion_error
 991          because it happens just temporarily and is resolved when the
 992          whole conversion is finished.  */
 993       break;
 994     case CODING_RESULT_SUCCESS:
 995       break;
 996     default:
 997       Vlast_code_conversion_error = intern ("Unknown error");
 998     }
 999 }
1000
/* This wrapper macro is used to preserve validity of pointers into
   buffer text across calls to decode_char, which could cause
   relocation of buffers if it loads a charset map, because loading a
   charset map allocates large structures.

   It sets C to DECODE_CHAR (CHARSET, CODE).  If charset_map_loaded is
   nonzero afterwards, CODING->source is recomputed with
   coding_set_source and SRC, SRC_BASE and SRC_END are all shifted by
   the distance CODING->source moved.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        /* Remember the old address before it is recomputed.  */             \
        const unsigned char *orig = coding->source;                          \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = coding->source - orig;                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
1021
1022
/* Make sure there is room for BYTES more bytes at dst.  If there is
   not (the test is `dst + BYTES >= dst_end'), enlarge
   coding->destination with alloc_destination and update dst and
   dst_end.  The amount requested is BYTES plus the number of elements
   still pending between charbuf and charbuf_end.  We don't have to
   take care of coding->source which will be relocated.  It is handled
   by calling coding_set_source in encode_coding.

   The caller should declare and set these variables appropriately in
   advance:
        coding, dst, dst_end, charbuf, charbuf_end

   MORE_BYTES is an EMACS_INT, the type of the corresponding
   alloc_destination parameter, so that the pointer difference is not
   narrowed to `int' on the way.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
1038
1039
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The length of the form is chosen by comparing C against
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; anything larger is written
   with BYTE8_STRING after subtracting 0x3FFF80.

   C is evaluated several times, so it must not have side effects.
   Every use of C is parenthesized so that an argument which is an
   expression (e.g. `a | b') is shifted and masked as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1069
1070
/* Yield the character code of the multibyte form starting at P and
   leave P pointing just past that form.  This is the counterpart of
   STRING_CHAR_ADVANCE that never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is derived from the high bits of the
   leading byte.  A two-byte form whose leading byte is below 0xC2
   yields a code in the range 0x3FFF80..0x3FFFFF.  P is evaluated
   several times and must be a modifiable lvalue.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  (((p)[0] & 0x80) == 0                                                 \
   ? *(p)++                                                             \
   : ((p)[0] & 0x20) == 0                                               \
   ? ((p) += 2,                                                         \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)                \
       | (((p)[-2] & 0x1F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x10) == 0                                               \
   ? ((p) += 3,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x0F) << 12)))                                     \
   : ((p)[0] & 0x08) == 0                                               \
   ? ((p) += 4,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-4] & 0xF) << 18)))                                      \
   : ((p) += 5,                                                         \
      (((p)[-1] & 0x3F)                                                 \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-4] & 0x3F) << 18))))
1099
1100
1101 static void
1102 coding_set_source (struct coding_system *coding)
1103 {
1104   if (BUFFERP (coding->src_object))
1105     {
1106       struct buffer *buf = XBUFFER (coding->src_object);
1107
1108       if (coding->src_pos < 0)
1109         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1110       else
1111         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1112     }
1113   else if (STRINGP (coding->src_object))
1114     {
1115       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1116     }
1117   else
1118     /* Otherwise, the source is C string and is never relocated
1119        automatically.  Thus we don't have to update anything.  */
1120     ;
1121 }
1122
1123 static void
1124 coding_set_destination (struct coding_system *coding)
1125 {
1126   if (BUFFERP (coding->dst_object))
1127     {
1128       if (coding->src_pos < 0)
1129         {
1130           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1131           coding->dst_bytes = (GAP_END_ADDR
1132                                - (coding->src_bytes - coding->consumed)
1133                                - coding->destination);
1134         }
1135       else
1136         {
1137           /* We are sure that coding->dst_pos_byte is before the gap
1138              of the buffer. */
1139           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1140                                  + coding->dst_pos_byte - BEG_BYTE);
1141           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1142                                - coding->destination);
1143         }
1144     }
1145   else
1146     /* Otherwise, the destination is C string and is never relocated
1147        automatically.  Thus we don't have to update anything.  */
1148     ;
1149 }
1150
1151
1152 static void
1153 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1154 {
1155   coding->destination = (unsigned char *) xrealloc (coding->destination,
1156                                                     coding->dst_bytes + bytes);
1157   coding->dst_bytes += bytes;
1158 }
1159
1160 static void
1161 coding_alloc_by_making_gap (struct coding_system *coding,
1162                             EMACS_INT gap_head_used, EMACS_INT bytes)
1163 {
1164   if (EQ (coding->src_object, coding->dst_object))
1165     {
1166       /* The gap may contain the produced data at the head and not-yet
1167          consumed data at the tail.  To preserve those data, we at
1168          first make the gap size to zero, then increase the gap
1169          size.  */
1170       EMACS_INT add = GAP_SIZE;
1171
1172       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1173       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1174       make_gap (bytes);
1175       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1176       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1177     }
1178   else
1179     {
1180       Lisp_Object this_buffer;
1181
1182       this_buffer = Fcurrent_buffer ();
1183       set_buffer_internal (XBUFFER (coding->dst_object));
1184       make_gap (bytes);
1185       set_buffer_internal (XBUFFER (this_buffer));
1186     }
1187 }
1188
1189
1190 static unsigned char *
1191 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1192                    unsigned char *dst)
1193 {
1194   EMACS_INT offset = dst - coding->destination;
1195
1196   if (BUFFERP (coding->dst_object))
1197     {
1198       struct buffer *buf = XBUFFER (coding->dst_object);
1199
1200       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1201     }
1202   else
1203     coding_alloc_by_realloc (coding, nbytes);
1204   coding_set_destination (coding);
1205   dst = coding->destination + offset;
1206   return dst;
1207 }
1208
1209 /** Macros for annotations.  */
1210
1211 /* An annotation data is stored in the array coding->charbuf in this
1212    format:
1213      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1214    LENGTH is the number of elements in the annotation.
1215    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1216    NCHARS is the number of characters in the text annotated.
1217
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1222      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1223
1224    NBYTES is the number of bytes specified in the header part of
1225    old-style emacs-mule encoding, or 0 for the other kind of
1226    composition.
1227
1228    METHOD is one of enum composition_method.
1229
1230    Optional COMPOSITION-COMPONENTS are characters and composition
1231    rules.
1232
1233    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1234    follows.
1235
1236    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1237    recover from an invalid annotation, and should be skipped by
1238    produce_annotation.  */
1239
/* Maximum length of the header of annotation data, counted in charbuf
   elements.  The longest header written by the macros below is the
   one from ADD_COMPOSITION_DATA, which stores five elements.  */
#define MAX_ANNOTATION_LENGTH 5
1242
/* Store the three leading elements of an annotation at BUF, advancing
   BUF past them: -LEN, MASK and NCHARS, in that order.  Also set
   coding->annotated to 1; the caller must have `coding' in scope.

   The definition deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and may be
   used as the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1250
/* Store a composition annotation header at BUF, advancing BUF: the
   common part written by ADD_ANNOTATION_DATA (with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK), followed by NBYTES and METHOD.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1257
1258
/* Store a charset annotation at BUF, advancing BUF: the common part
   written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1264
1265 \f
1266 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1267
1268
1269
1270 \f
1271 /*** 3. UTF-8 ***/
1272
1273 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1274    Check if a text is encoded in UTF-8.  If it is, return 1, else
1275    return 0.  */
1276
/* Classification of a UTF-8 octet C by its high bits.  Note that
   UTF_8_1_OCTET_P is a plain `< 0x80' comparison and is therefore
   also true for a negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)              /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)    /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)    /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)    /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)    /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)    /* 111110xx */

/* The byte order mark character, and the three octets of its UTF-8
   encoding.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1288
/* Check whether the source text of CODING looks like UTF-8 and record
   the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   scan starts after the CODING->head_ascii leading bytes and only
   checks the shape of each sequence (a leading octet followed by the
   right number of 10xxxxxx octets, up to five octets in total); it
   does not compute character codes.

   Return 0 and add CATEGORY_MASK_UTF_8 to DETECT_INFO->rejected if a
   malformed sequence is seen, or if the text ends inside a sequence
   while CODING_MODE_LAST_BLOCK is set.  Otherwise return 1, with
   DETECT_INFO->found and ->rejected updated according to whether a
   signature (BOM) was seen at the very start of the source and
   whether any multi-octet sequence was seen at all.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  /* Updated by ONE_MORE_BYTE; not otherwise used here.  */
  int consumed_chars = 0;
  int bom_found = 0;
  /* Set once at least one well-formed multi-octet sequence is seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* ONE_MORE_BYTE jumps to no_more_source when the text is
         exhausted; src_base then tells whether that happened in the
         middle of a sequence.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very beginning of the source can
             be a signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* Reached only through `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The text ended inside a sequence and no more text will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM,
         so the category without signature stays possible too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1371
1372
1373 static void
1374 decode_coding_utf_8 (struct coding_system *coding)
1375 {
1376   const unsigned char *src = coding->source + coding->consumed;
1377   const unsigned char *src_end = coding->source + coding->src_bytes;
1378   const unsigned char *src_base;
1379   int *charbuf = coding->charbuf + coding->charbuf_used;
1380   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1381   int consumed_chars = 0, consumed_chars_base = 0;
1382   int multibytep = coding->src_multibyte;
1383   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1384   Lisp_Object attr, charset_list;
1385   int eol_crlf =
1386     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1387   int byte_after_cr = -1;
1388
1389   CODING_GET_INFO (coding, attr, charset_list);
1390
1391   if (bom != utf_without_bom)
1392     {
1393       int c1, c2, c3;
1394
1395       src_base = src;
1396       ONE_MORE_BYTE (c1);
1397       if (! UTF_8_3_OCTET_LEADING_P (c1))
1398         src = src_base;
1399       else
1400         {
1401           ONE_MORE_BYTE (c2);
1402           if (! UTF_8_EXTRA_OCTET_P (c2))
1403             src = src_base;
1404           else
1405             {
1406               ONE_MORE_BYTE (c3);
1407               if (! UTF_8_EXTRA_OCTET_P (c3))
1408                 src = src_base;
1409               else
1410                 {
1411                   if ((c1 != UTF_8_BOM_1)
1412                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1413                     src = src_base;
1414                   else
1415                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1416                 }
1417             }
1418         }
1419     }
1420   CODING_UTF_8_BOM (coding) = utf_without_bom;
1421
1422   while (1)
1423     {
1424       int c, c1, c2, c3, c4, c5;
1425
1426       src_base = src;
1427       consumed_chars_base = consumed_chars;
1428
1429       if (charbuf >= charbuf_end)
1430         {
1431           if (byte_after_cr >= 0)
1432             src_base--;
1433           break;
1434         }
1435
1436       if (byte_after_cr >= 0)
1437         c1 = byte_after_cr, byte_after_cr = -1;
1438       else
1439         ONE_MORE_BYTE (c1);
1440       if (c1 < 0)
1441         {
1442           c = - c1;
1443         }
1444       else if (UTF_8_1_OCTET_P (c1))
1445         {
1446           if (eol_crlf && c1 == '\r')
1447             ONE_MORE_BYTE (byte_after_cr);
1448           c = c1;
1449         }
1450       else
1451         {
1452           ONE_MORE_BYTE (c2);
1453           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1454             goto invalid_code;
1455           if (UTF_8_2_OCTET_LEADING_P (c1))
1456             {
1457               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1458               /* Reject overlong sequences here and below.  Encoders
1459                  producing them are incorrect, they can be misleading,
1460                  and they mess up read/write invariance.  */
1461               if (c < 128)
1462                 goto invalid_code;
1463             }
1464           else
1465             {
1466               ONE_MORE_BYTE (c3);
1467               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1468                 goto invalid_code;
1469               if (UTF_8_3_OCTET_LEADING_P (c1))
1470                 {
1471                   c = (((c1 & 0xF) << 12)
1472                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1473                   if (c < 0x800
1474                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1475                     goto invalid_code;
1476                 }
1477               else
1478                 {
1479                   ONE_MORE_BYTE (c4);
1480                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1481                     goto invalid_code;
1482                   if (UTF_8_4_OCTET_LEADING_P (c1))
1483                     {
1484                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1485                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1486                     if (c < 0x10000)
1487                       goto invalid_code;
1488                     }
1489                   else
1490                     {
1491                       ONE_MORE_BYTE (c5);
1492                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1493                         goto invalid_code;
1494                       if (UTF_8_5_OCTET_LEADING_P (c1))
1495                         {
1496                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1497                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1498                                | (c5 & 0x3F));
1499                           if ((c > MAX_CHAR) || (c < 0x200000))
1500                             goto invalid_code;
1501                         }
1502                       else
1503                         goto invalid_code;
1504                     }
1505                 }
1506             }
1507         }
1508
1509       *charbuf++ = c;
1510       continue;
1511
1512     invalid_code:
1513       src = src_base;
1514       consumed_chars = consumed_chars_base;
1515       ONE_MORE_BYTE (c);
1516       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1517       coding->errors++;
1518     }
1519
1520  no_more_source:
1521   coding->consumed_char += consumed_chars_base;
1522   coding->consumed = src_base - coding->source;
1523   coding->charbuf_used = charbuf - coding->charbuf;
1524 }
1525
1526
1527 static int
1528 encode_coding_utf_8 (struct coding_system *coding)
1529 {
1530   int multibytep = coding->dst_multibyte;
1531   int *charbuf = coding->charbuf;
1532   int *charbuf_end = charbuf + coding->charbuf_used;
1533   unsigned char *dst = coding->destination + coding->produced;
1534   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1535   int produced_chars = 0;
1536   int c;
1537
1538   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1539     {
1540       ASSURE_DESTINATION (3);
1541       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1542       CODING_UTF_8_BOM (coding) = utf_without_bom;
1543     }
1544
1545   if (multibytep)
1546     {
1547       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1548
1549       while (charbuf < charbuf_end)
1550         {
1551           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1552
1553           ASSURE_DESTINATION (safe_room);
1554           c = *charbuf++;
1555           if (CHAR_BYTE8_P (c))
1556             {
1557               c = CHAR_TO_BYTE8 (c);
1558               EMIT_ONE_BYTE (c);
1559             }
1560           else
1561             {
1562               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1563               for (p = str; p < pend; p++)
1564                 EMIT_ONE_BYTE (*p);
1565             }
1566         }
1567     }
1568   else
1569     {
1570       int safe_room = MAX_MULTIBYTE_LENGTH;
1571
1572       while (charbuf < charbuf_end)
1573         {
1574           ASSURE_DESTINATION (safe_room);
1575           c = *charbuf++;
1576           if (CHAR_BYTE8_P (c))
1577             *dst++ = CHAR_TO_BYTE8 (c);
1578           else
1579             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1580           produced_chars++;
1581         }
1582     }
1583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1584   coding->produced_char += produced_chars;
1585   coding->produced = dst - coding->destination;
1586   return 0;
1587 }
1588
1589
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591    Check if a text is encoded in one of UTF-16 based coding systems.
1592    If it is, return 1, else return 0.  */
1593
/* Nonzero if the bits of VAL selected by 0xFC00 equal 0xD800, i.e. a
   16-bit code unit VAL lies in 0xD800..0xDBFF (high surrogate).  */
#define UTF_16_HIGH_SURROGATE_P(val)            \
  (0xD800 == ((val) & 0xFC00))

/* Nonzero if the bits of VAL selected by 0xFC00 equal 0xDC00, i.e. a
   16-bit code unit VAL lies in 0xDC00..0xDFFF (low surrogate).  */
#define UTF_16_LOW_SURROGATE_P(val)             \
  (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)                   \
  ((val) == 0xFFFE                              \
   || (val) == 0xFFFF                           \
   || UTF_16_LOW_SURROGATE_P (val))
1604
1605
1606 static int
1607 detect_coding_utf_16 (struct coding_system *coding,
1608                       struct coding_detection_info *detect_info)
1609 {
1610   const unsigned char *src = coding->source;
1611   const unsigned char *src_end = coding->source + coding->src_bytes;
1612   int multibytep = coding->src_multibyte;
1613   int c1, c2;
1614
1615   detect_info->checked |= CATEGORY_MASK_UTF_16;
1616   if (coding->mode & CODING_MODE_LAST_BLOCK
1617       && (coding->src_chars & 1))
1618     {
1619       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1620       return 0;
1621     }
1622
1623   TWO_MORE_BYTES (c1, c2);
1624   if ((c1 == 0xFF) && (c2 == 0xFE))
1625     {
1626       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1627                              | CATEGORY_MASK_UTF_16_AUTO);
1628       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1629                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1630                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1631     }
1632   else if ((c1 == 0xFE) && (c2 == 0xFF))
1633     {
1634       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1635                              | CATEGORY_MASK_UTF_16_AUTO);
1636       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1637                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1638                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1639     }
1640   else if (c2 < 0)
1641     {
1642       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1643       return 0;
1644     }
1645   else
1646     {
1647       /* We check the dispersion of Eth and Oth bytes where E is even and
1648          O is odd.  If both are high, we assume binary data.*/
1649       unsigned char e[256], o[256];
1650       unsigned e_num = 1, o_num = 1;
1651
1652       memset (e, 0, 256);
1653       memset (o, 0, 256);
1654       e[c1] = 1;
1655       o[c2] = 1;
1656
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1658                                 |CATEGORY_MASK_UTF_16_BE
1659                                 | CATEGORY_MASK_UTF_16_LE);
1660
1661       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1662              != CATEGORY_MASK_UTF_16)
1663         {
1664           TWO_MORE_BYTES (c1, c2);
1665           if (c2 < 0)
1666             break;
1667           if (! e[c1])
1668             {
1669               e[c1] = 1;
1670               e_num++;
1671               if (e_num >= 128)
1672                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1673             }
1674           if (! o[c2])
1675             {
1676               o[c2] = 1;
1677               o_num++;
1678               if (o_num >= 128)
1679                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1680             }
1681         }
1682       return 0;
1683     }
1684
1685  no_more_source:
1686   return 1;
1687 }
1688
1689 static void
1690 decode_coding_utf_16 (struct coding_system *coding)
1691 {
1692   const unsigned char *src = coding->source + coding->consumed;
1693   const unsigned char *src_end = coding->source + coding->src_bytes;
1694   const unsigned char *src_base;
1695   int *charbuf = coding->charbuf + coding->charbuf_used;
1696   /* We may produces at most 3 chars in one loop.  */
1697   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1698   int consumed_chars = 0, consumed_chars_base = 0;
1699   int multibytep = coding->src_multibyte;
1700   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1701   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1702   int surrogate = CODING_UTF_16_SURROGATE (coding);
1703   Lisp_Object attr, charset_list;
1704   int eol_crlf =
1705     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1706   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1707
1708   CODING_GET_INFO (coding, attr, charset_list);
1709
1710   if (bom == utf_with_bom)
1711     {
1712       int c, c1, c2;
1713
1714       src_base = src;
1715       ONE_MORE_BYTE (c1);
1716       ONE_MORE_BYTE (c2);
1717       c = (c1 << 8) | c2;
1718
1719       if (endian == utf_16_big_endian
1720           ? c != 0xFEFF : c != 0xFFFE)
1721         {
1722           /* The first two bytes are not BOM.  Treat them as bytes
1723              for a normal character.  */
1724           src = src_base;
1725           coding->errors++;
1726         }
1727       CODING_UTF_16_BOM (coding) = utf_without_bom;
1728     }
1729   else if (bom == utf_detect_bom)
1730     {
1731       /* We have already tried to detect BOM and failed in
1732          detect_coding.  */
1733       CODING_UTF_16_BOM (coding) = utf_without_bom;
1734     }
1735
1736   while (1)
1737     {
1738       int c, c1, c2;
1739
1740       src_base = src;
1741       consumed_chars_base = consumed_chars;
1742
1743       if (charbuf >= charbuf_end)
1744         {
1745           if (byte_after_cr1 >= 0)
1746             src_base -= 2;
1747           break;
1748         }
1749
1750       if (byte_after_cr1 >= 0)
1751         c1 = byte_after_cr1, byte_after_cr1 = -1;
1752       else
1753         ONE_MORE_BYTE (c1);
1754       if (c1 < 0)
1755         {
1756           *charbuf++ = -c1;
1757           continue;
1758         }
1759       if (byte_after_cr2 >= 0)
1760         c2 = byte_after_cr2, byte_after_cr2 = -1;
1761       else
1762         ONE_MORE_BYTE (c2);
1763       if (c2 < 0)
1764         {
1765           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1766           *charbuf++ = -c2;
1767           continue;
1768         }
1769       c = (endian == utf_16_big_endian
1770            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1771
1772       if (surrogate)
1773         {
1774           if (! UTF_16_LOW_SURROGATE_P (c))
1775             {
1776               if (endian == utf_16_big_endian)
1777                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1778               else
1779                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1780               *charbuf++ = c1;
1781               *charbuf++ = c2;
1782               coding->errors++;
1783               if (UTF_16_HIGH_SURROGATE_P (c))
1784                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1785               else
1786                 *charbuf++ = c;
1787             }
1788           else
1789             {
1790               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1791               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1792               *charbuf++ = 0x10000 + c;
1793             }
1794         }
1795       else
1796         {
1797           if (UTF_16_HIGH_SURROGATE_P (c))
1798             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1799           else
1800             {
1801               if (eol_crlf && c == '\r')
1802                 {
1803                   ONE_MORE_BYTE (byte_after_cr1);
1804                   ONE_MORE_BYTE (byte_after_cr2);
1805                 }
1806               *charbuf++ = c;
1807             }
1808         }
1809     }
1810
1811  no_more_source:
1812   coding->consumed_char += consumed_chars_base;
1813   coding->consumed = src_base - coding->source;
1814   coding->charbuf_used = charbuf - coding->charbuf;
1815 }
1816
1817 static int
1818 encode_coding_utf_16 (struct coding_system *coding)
1819 {
1820   int multibytep = coding->dst_multibyte;
1821   int *charbuf = coding->charbuf;
1822   int *charbuf_end = charbuf + coding->charbuf_used;
1823   unsigned char *dst = coding->destination + coding->produced;
1824   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1825   int safe_room = 8;
1826   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1827   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1828   int produced_chars = 0;
1829   Lisp_Object attrs, charset_list;
1830   int c;
1831
1832   CODING_GET_INFO (coding, attrs, charset_list);
1833
1834   if (bom != utf_without_bom)
1835     {
1836       ASSURE_DESTINATION (safe_room);
1837       if (big_endian)
1838         EMIT_TWO_BYTES (0xFE, 0xFF);
1839       else
1840         EMIT_TWO_BYTES (0xFF, 0xFE);
1841       CODING_UTF_16_BOM (coding) = utf_without_bom;
1842     }
1843
1844   while (charbuf < charbuf_end)
1845     {
1846       ASSURE_DESTINATION (safe_room);
1847       c = *charbuf++;
1848       if (c > MAX_UNICODE_CHAR)
1849         c = coding->default_char;
1850
1851       if (c < 0x10000)
1852         {
1853           if (big_endian)
1854             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1855           else
1856             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1857         }
1858       else
1859         {
1860           int c1, c2;
1861
1862           c -= 0x10000;
1863           c1 = (c >> 10) + 0xD800;
1864           c2 = (c & 0x3FF) + 0xDC00;
1865           if (big_endian)
1866             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1867           else
1868             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1869         }
1870     }
1871   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1872   coding->produced = dst - coding->destination;
1873   coding->produced_char += produced_chars;
1874   return 0;
1875 }
1876
1877 \f
1878 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1879
1880 /* Emacs' internal format for representation of multiple character
1881    sets is a kind of multi-byte encoding, i.e. characters are
1882    represented by variable-length sequences of one-byte codes.
1883
1884    ASCII characters and control characters (e.g. `tab', `newline') are
1885    represented by one-byte sequences which are their ASCII codes, in
1886    the range 0x00 through 0x7F.
1887
1888    8-bit characters of the range 0x80..0x9F are represented by
1889    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1890    code + 0x20).
1891
1892    8-bit characters of the range 0xA0..0xFF are represented by
1893    one-byte sequences which are their 8-bit code.
1894
1895    The other characters are represented by a sequence of `base
1896    leading-code', optional `extended leading-code', and one or two
1897    `position-code's.  The length of the sequence is determined by the
1898    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1899    whereas extended leading-code and position-code take the range 0xA0
1900    through 0xFF.  See `charset.h' for more details about leading-code
1901    and position-code.
1902
1903    --- CODE RANGE of Emacs' internal format ---
1904    character set        range
1905    -------------        -----
1906    ascii                0x00..0x7F
1907    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1908    eight-bit-graphic    0xA0..0xBF
1909    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1910    ---------------------------------------------
1911
1912    As this is the internal character representation, the format is
1913    usually not used externally (i.e. in a file or in a data sent to a
1914    process).  But, it is possible to have a text externally in this
1915    format (i.e. by encoding by the coding system `emacs-mule').
1916
1917    In that case, a sequence of one-byte codes has a slightly different
1918    form.
1919
1920    At first, all characters in eight-bit-control are represented by
1921    one-byte sequences which are their 8-bit code.
1922
1923    Next, character composition data are represented by the byte
1924    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1925    where,
1926         METHOD is 0xF2 plus one of composition method (enum
1927         composition_method),
1928
1929         BYTES is 0xA0 plus a byte length of this composition data,
1930
1931         CHARS is 0xA0 plus a number of characters composed by this
1932         data,
1933
1934         COMPONENTs are characters of multibyte form or composition
1935         rules encoded by two-byte of ASCII codes.
1936
1937    In addition, for backward compatibility, the following formats are
1938    also recognized as composition data on decoding.
1939
1940    0x80 MSEQ ...
1941    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1942
1943    Here,
        MSEQ is a multibyte form, but in one of these special formats:
1945           ASCII: 0xA0 ASCII_CODE+0x80,
1946           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1947         RULE is a one byte code of the range 0xA0..0xF0 that
1948         represents a composition rule.
1949   */
1950
/* Table indexed by a byte value.  detect_coding_emacs_mule uses
   emacs_mule_bytes[C] - 1 as the number of bytes that must follow the
   leading byte C, and emacs_mule_char dispatches on the entry,
   accepting only the values 1 through 4.  The entries are filled in
   elsewhere.  */
char emacs_mule_bytes[256];
1952
1953
1954 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1955    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1956    else return 0.  */
1957
1958 static int
1959 detect_coding_emacs_mule (struct coding_system *coding,
1960                           struct coding_detection_info *detect_info)
1961 {
1962   const unsigned char *src = coding->source, *src_base;
1963   const unsigned char *src_end = coding->source + coding->src_bytes;
1964   int multibytep = coding->src_multibyte;
1965   int consumed_chars = 0;
1966   int c;
1967   int found = 0;
1968
1969   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1970   /* A coding system of this category is always ASCII compatible.  */
1971   src += coding->head_ascii;
1972
1973   while (1)
1974     {
1975       src_base = src;
1976       ONE_MORE_BYTE (c);
1977       if (c < 0)
1978         continue;
1979       if (c == 0x80)
1980         {
1981           /* Perhaps the start of composite character.  We simply skip
1982              it because analyzing it is too heavy for detecting.  But,
1983              at least, we check that the composite character
1984              constitutes of more than 4 bytes.  */
1985           const unsigned char *src_base;
1986
1987         repeat:
1988           src_base = src;
1989           do
1990             {
1991               ONE_MORE_BYTE (c);
1992             }
1993           while (c >= 0xA0);
1994
1995           if (src - src_base <= 4)
1996             break;
1997           found = CATEGORY_MASK_EMACS_MULE;
1998           if (c == 0x80)
1999             goto repeat;
2000         }
2001
2002       if (c < 0x80)
2003         {
2004           if (c < 0x20
2005               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2006             break;
2007         }
2008       else
2009         {
2010           int more_bytes = emacs_mule_bytes[c] - 1;
2011
2012           while (more_bytes > 0)
2013             {
2014               ONE_MORE_BYTE (c);
2015               if (c < 0xA0)
2016                 {
2017                   src--;        /* Unread the last byte.  */
2018                   break;
2019                 }
2020               more_bytes--;
2021             }
2022           if (more_bytes != 0)
2023             break;
2024           found = CATEGORY_MASK_EMACS_MULE;
2025         }
2026     }
2027   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2028   return 0;
2029
2030  no_more_source:
2031   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2032     {
2033       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2034       return 0;
2035     }
2036   detect_info->found |= found;
2037   return 1;
2038 }
2039
2040
2041 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2042    character.  If CMP_STATUS indicates that we must expect MSEQ or
2043    RULE described above, decode it and return the negative value of
2044    the decoded character or rule.  If an invalid byte is found, return
2045    -1.  If SRC is too short, return -2.  */
2046
2047 int
2048 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2049                  int *nbytes, int *nchars, int *id,
2050                  struct composition_status *cmp_status)
2051 {
2052   const unsigned char *src_end = coding->source + coding->src_bytes;
2053   const unsigned char *src_base = src;
2054   int multibytep = coding->src_multibyte;
2055   int charset_id;
2056   unsigned code;
2057   int c;
2058   int consumed_chars = 0;
2059   int mseq_found = 0;
2060
2061   ONE_MORE_BYTE (c);
2062   if (c < 0)
2063     {
2064       c = -c;
2065       charset_id = emacs_mule_charset[0];
2066     }
2067   else
2068     {
2069       if (c >= 0xA0)
2070         {
2071           if (cmp_status->state != COMPOSING_NO
2072               && cmp_status->old_form)
2073             {
2074               if (cmp_status->state == COMPOSING_CHAR)
2075                 {
2076                   if (c == 0xA0)
2077                     {
2078                       ONE_MORE_BYTE (c);
2079                       c -= 0x80;
2080                       if (c < 0)
2081                         goto invalid_code;
2082                     }
2083                   else
2084                     c -= 0x20;
2085                   mseq_found = 1;
2086                 }
2087               else
2088                 {
2089                   *nbytes = src - src_base;
2090                   *nchars = consumed_chars;
2091                   return -c;
2092                 }
2093             }
2094           else
2095             goto invalid_code;
2096         }
2097
2098       switch (emacs_mule_bytes[c])
2099         {
2100         case 2:
2101           if ((charset_id = emacs_mule_charset[c]) < 0)
2102             goto invalid_code;
2103           ONE_MORE_BYTE (c);
2104           if (c < 0xA0)
2105             goto invalid_code;
2106           code = c & 0x7F;
2107           break;
2108
2109         case 3:
2110           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2111               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2112             {
2113               ONE_MORE_BYTE (c);
2114               if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
2115                 goto invalid_code;
2116               ONE_MORE_BYTE (c);
2117               if (c < 0xA0)
2118                 goto invalid_code;
2119               code = c & 0x7F;
2120             }
2121           else
2122             {
2123               if ((charset_id = emacs_mule_charset[c]) < 0)
2124                 goto invalid_code;
2125               ONE_MORE_BYTE (c);
2126               if (c < 0xA0)
2127                 goto invalid_code;
2128               code = (c & 0x7F) << 8;
2129               ONE_MORE_BYTE (c);
2130               if (c < 0xA0)
2131                 goto invalid_code;
2132               code |= c & 0x7F;
2133             }
2134           break;
2135
2136         case 4:
2137           ONE_MORE_BYTE (c);
2138           if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
2139             goto invalid_code;
2140           ONE_MORE_BYTE (c);
2141           if (c < 0xA0)
2142             goto invalid_code;
2143           code = (c & 0x7F) << 8;
2144           ONE_MORE_BYTE (c);
2145           if (c < 0xA0)
2146             goto invalid_code;
2147           code |= c & 0x7F;
2148           break;
2149
2150         case 1:
2151           code = c;
2152           charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2153           break;
2154
2155         default:
2156           abort ();
2157         }
2158       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2159                           CHARSET_FROM_ID (charset_id), code, c);
2160       if (c < 0)
2161         goto invalid_code;
2162     }
2163   *nbytes = src - src_base;
2164   *nchars = consumed_chars;
2165   if (id)
2166     *id = charset_id;
2167   return (mseq_found ? -c : c);
2168
2169  no_more_source:
2170   return -2;
2171
2172  invalid_code:
2173   return -1;
2174 }
2175
2176
2177 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2178
/* Handle these composition sequences ('|': the end of header elements,
2180    BYTES and CHARS >= 0xA0):
2181
2182    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2183    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2184    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2185
   and these old forms:
2187
2188    (4) relative composition: 0x80 | MSEQ ... MSEQ
2189    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2190
2191    When the starter 0x80 and the following header elements are found,
2192    this annotation header is produced.
2193
2194         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2195
2196    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2197    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2198
2199    Then, upon reading the following elements, these codes are produced
2200    until the composition end is found:
2201
2202    (1) CHAR ... CHAR
2203    (2) ALT ... ALT CHAR ... CHAR
2204    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2205    (4) CHAR ... CHAR
2206    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2207
2208    When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2210
2211    (1) LENGTH: unchanged, NCHARS: unchanged
2212    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2213    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2214    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2215    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2216
2217    If an error is found while composing, the annotation header is
2218    changed to the original composition header (plus filler -1s) as
2219    below:
2220
2221    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2223
2224    and the sequence [ -2 DECODED-RULE ] is changed to the original
2225    byte sequence as below:
2226         o the original byte sequence is B: [ B -1 ]
2227         o the original byte sequence is B1 B2: [ B1 B2 ]
2228
2229    Most of the routines are implemented by macros because many
2230    variables and labels in the caller decode_coding_emacs_mule must be
   accessible, and they are usually called just once (and thus don't
   increase the size of the compiled object).  */
2233
/* Decode the one-byte composition rule C of an Emacs 20 style
   composition sequence and store the decoded rule in RULE.

   C is reduced by 0xA0 in place; a result outside 0..80 jumps to the
   caller's `invalid_code' label.  The reduced value is split into the
   pair (C / 9, C % 9), a member equal to 4 is replaced by 10, and the
   pair is combined with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    if (gref == 4)                                      \
      gref = 10;                                        \
    if (nref == 4)                                      \
      nref = 10;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2250
2251
/* Decode the two-byte composition rule of an Emacs 21 style
   composition sequence and store the decoded rule in RULE.

   The first byte is already in C; the second is read into C with
   ONE_MORE_BYTE.  Each byte, reduced by 0x20, must lie in 0..80,
   otherwise control jumps to the caller's `invalid_code' label.  The
   two reduced values are combined with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (! (gref >= 0 && gref < 81))                     \
      goto invalid_code;                                \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (nref >= 0 && nref < 81))                     \
      goto invalid_code;                                \
                                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2269
2270
/* Start of an Emacs 21 style composition.  On entry C holds the
   method byte (0xF2 + METHOD).  The next two bytes are read with
   ONE_MORE_BYTE: BYTES (0xA0 + the byte length of this composition
   information) and CHARS (0xA0 + the number of characters composed).

   Control jumps to the caller's `invalid_code' label if BYTES is
   negative, if the byte length is below 3, if the method is
   COMPOSITION_RELATIVE and the byte length is not 4, or if the
   character count is not in 1..MAX_COMPOSITION_COMPONENTS - 1.

   Otherwise CMP_STATUS is initialized for the new composition and an
   annotation header is added through ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
                                                                        \
    /* The header is valid; only now touch CMP_STATUS.  */              \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2302
2303
/* Start of an Emacs 20 style (old form) relative composition.
   Initialize CMP_STATUS for it, with zero characters and components
   so far, and add an annotation header whose count fields are 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2315
2316
/* Start of an Emacs 20 style (old form) rule-base composition.
   Initialize CMP_STATUS for it, with zero characters and components
   so far, and add an annotation header whose count fields are 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2328
2329
/* Read the byte that follows the composition starter and begin the
   matching kind of composition:

     0xF2 + COMPOSITION_RELATIVE .. 0xF2 + COMPOSITION_WITH_RULE_ALTCHARS
                        Emacs 21 style header
     0xA0 .. 0xBF       Emacs 20 style relative composition; SRC is
                        restored so that the byte is read again as a
                        composition component
     0xFF               Emacs 20 style rule-base composition

   A negative value or any other byte jumps to the caller's
   `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
                                                        \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2353
/* Finish the composition currently being decoded.  The data for it
   begins CMP_STATUS->length elements before the caller's CHARBUF.
   For an old-form composition, store the number of characters seen
   in element 2 of that data.  Otherwise, for any method beyond
   COMPOSITION_RELATIVE, replace element 0 with element 2 minus the
   accumulated length.  In every case the state becomes
   COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *cmp_head = charbuf - cmp_status->length;                       \
                                                                        \
    if (cmp_status->old_form)                                           \
      cmp_head[2] = cmp_status->nchars;                                 \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      cmp_head[0] = cmp_head[2] - cmp_status->length;                   \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2364
2365
2366 static int
2367 emacs_mule_finish_composition (int *charbuf,
2368                                struct composition_status *cmp_status)
2369 {
2370   int idx = - cmp_status->length;
2371   int new_chars;
2372
2373   if (cmp_status->old_form && cmp_status->nchars > 0)
2374     {
2375       charbuf[idx + 2] = cmp_status->nchars;
2376       new_chars = 0;
2377       if (cmp_status->method == COMPOSITION_WITH_RULE
2378           && cmp_status->state == COMPOSING_CHAR)
2379         {
2380           /* The last rule was invalid.  */
2381           int rule = charbuf[-1] + 0xA0;
2382
2383           charbuf[-2] = BYTE8_TO_CHAR (rule);
2384           charbuf[-1] = -1;
2385           new_chars = 1;
2386         }
2387     }
2388   else
2389     {
2390       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2391
2392       if (cmp_status->method == COMPOSITION_WITH_RULE)
2393         {
2394           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2395           charbuf[idx++] = -3;
2396           charbuf[idx++] = 0;
2397           new_chars = 1;
2398         }
2399       else
2400         {
2401           int nchars = charbuf[idx + 1] + 0xA0;
2402           int nbytes = charbuf[idx + 2] + 0xA0;
2403
2404           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2405           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2406           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2407           charbuf[idx++] = -1;
2408           new_chars = 4;
2409         }
2410     }
2411   cmp_status->state = COMPOSING_NO;
2412   return new_chars;
2413 }
2414
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and advance the caller's CHAR_OFFSET
   by the value that function returns.  Does nothing when the state
   is already COMPOSING_NO.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset                                                         \
      += emacs_mule_finish_composition (charbuf, cmp_status);           \
  } while (0)
2420
2421
2422 static void
2423 decode_coding_emacs_mule (struct coding_system *coding)
2424 {
2425   const unsigned char *src = coding->source + coding->consumed;
2426   const unsigned char *src_end = coding->source + coding->src_bytes;
2427   const unsigned char *src_base;
2428   int *charbuf = coding->charbuf + coding->charbuf_used;
2429   /* We may produce two annotations (charset and composition) in one
2430      loop and one more charset annotation at the end.  */
2431   int *charbuf_end
2432     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2433   int consumed_chars = 0, consumed_chars_base;
2434   int multibytep = coding->src_multibyte;
2435   Lisp_Object attrs, charset_list;
2436   int char_offset = coding->produced_char;
2437   int last_offset = char_offset;
2438   int last_id = charset_ascii;
2439   int eol_crlf =
2440     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2441   int byte_after_cr = -1;
2442   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2443
2444   CODING_GET_INFO (coding, attrs, charset_list);
2445
2446   if (cmp_status->state != COMPOSING_NO)
2447     {
2448       int i;
2449
2450       for (i = 0; i < cmp_status->length; i++)
2451         *charbuf++ = cmp_status->carryover[i];
2452       coding->annotated = 1;
2453     }
2454
2455   while (1)
2456     {
2457       int c, id;
2458
2459       src_base = src;
2460       consumed_chars_base = consumed_chars;
2461
2462       if (charbuf >= charbuf_end)
2463         {
2464           if (byte_after_cr >= 0)
2465             src_base--;
2466           break;
2467         }
2468
2469       if (byte_after_cr >= 0)
2470         c = byte_after_cr, byte_after_cr = -1;
2471       else
2472         ONE_MORE_BYTE (c);
2473
2474       if (c < 0 || c == 0x80)
2475         {
2476           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2477           if (c < 0)
2478             {
2479               *charbuf++ = -c;
2480               char_offset++;
2481             }
2482           else
2483             DECODE_EMACS_MULE_COMPOSITION_START ();
2484           continue;
2485         }
2486
2487       if (c < 0x80)
2488         {
2489           if (eol_crlf && c == '\r')
2490             ONE_MORE_BYTE (byte_after_cr);
2491           id = charset_ascii;
2492           if (cmp_status->state != COMPOSING_NO)
2493             {
2494               if (cmp_status->old_form)
2495                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2496               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2497                 cmp_status->ncomps--;
2498             }
2499         }
2500       else
2501         {
2502           int nchars, nbytes;
2503           /* emacs_mule_char can load a charset map from a file, which
2504              allocates a large structure and might cause buffer text
2505              to be relocated as result.  Thus, we need to remember the
2506              original pointer to buffer text, and fix up all related
2507              pointers after the call.  */
2508           const unsigned char *orig = coding->source;
2509           EMACS_INT offset;
2510
2511           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2512                                cmp_status);
2513           offset = coding->source - orig;
2514           if (offset)
2515             {
2516               src += offset;
2517               src_base += offset;
2518               src_end += offset;
2519             }
2520           if (c < 0)
2521             {
2522               if (c == -1)
2523                 goto invalid_code;
2524               if (c == -2)
2525                 break;
2526             }
2527           src = src_base + nbytes;
2528           consumed_chars = consumed_chars_base + nchars;
2529           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2530             cmp_status->ncomps -= nchars;
2531         }
2532
2533       /* Now if C >= 0, we found a normally encoded character, if C <
2534          0, we found an old-style composition component character or
2535          rule.  */
2536
2537       if (cmp_status->state == COMPOSING_NO)
2538         {
2539           if (last_id != id)
2540             {
2541               if (last_id != charset_ascii)
2542                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2543                                   last_id);
2544               last_id = id;
2545               last_offset = char_offset;
2546             }
2547           *charbuf++ = c;
2548           char_offset++;
2549         }
2550       else if (cmp_status->state == COMPOSING_CHAR)
2551         {
2552           if (cmp_status->old_form)
2553             {
2554               if (c >= 0)
2555                 {
2556                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2557                   *charbuf++ = c;
2558                   char_offset++;
2559                 }
2560               else
2561                 {
2562                   *charbuf++ = -c;
2563                   cmp_status->nchars++;
2564                   cmp_status->length++;
2565                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2566                     EMACS_MULE_COMPOSITION_END ();
2567                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2568                     cmp_status->state = COMPOSING_RULE;
2569                 }
2570             }
2571           else
2572             {
2573               *charbuf++ = c;
2574               cmp_status->length++;
2575               cmp_status->nchars--;
2576               if (cmp_status->nchars == 0)
2577                 EMACS_MULE_COMPOSITION_END ();
2578             }
2579         }
2580       else if (cmp_status->state == COMPOSING_RULE)
2581         {
2582           int rule;
2583
2584           if (c >= 0)
2585             {
2586               EMACS_MULE_COMPOSITION_END ();
2587               *charbuf++ = c;
2588               char_offset++;
2589             }
2590           else
2591             {
2592               c = -c;
2593               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2594               if (rule < 0)
2595                 goto invalid_code;
2596               *charbuf++ = -2;
2597               *charbuf++ = rule;
2598               cmp_status->length += 2;
2599               cmp_status->state = COMPOSING_CHAR;
2600             }
2601         }
2602       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2603         {
2604           *charbuf++ = c;
2605           cmp_status->length++;
2606           if (cmp_status->ncomps == 0)
2607             cmp_status->state = COMPOSING_CHAR;
2608           else if (cmp_status->ncomps > 0)
2609             {
2610               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2611                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2612             }
2613           else
2614             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2615         }
2616       else                      /* COMPOSING_COMPONENT_RULE */
2617         {
2618           int rule;
2619
2620           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2621           if (rule < 0)
2622             goto invalid_code;
2623           *charbuf++ = -2;
2624           *charbuf++ = rule;
2625           cmp_status->length += 2;
2626           cmp_status->ncomps--;
2627           if (cmp_status->ncomps > 0)
2628             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2629           else
2630             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2631         }
2632       continue;
2633
2634       src = src_base;
2635       consumed_chars = consumed_chars_base;
2636       continue;
2637
2638     invalid_code:
2639       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2640       src = src_base;
2641       consumed_chars = consumed_chars_base;
2642       ONE_MORE_BYTE (c);
2643       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2644       char_offset++;
2645       coding->errors++;
2646     }
2647
2648  no_more_source:
2649   if (cmp_status->state != COMPOSING_NO)
2650     {
2651       if (coding->mode & CODING_MODE_LAST_BLOCK)
2652         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2653       else
2654         {
2655           int i;
2656
2657           charbuf -= cmp_status->length;
2658           for (i = 0; i < cmp_status->length; i++)
2659             cmp_status->carryover[i] = charbuf[i];
2660         }
2661     }
2662   if (last_id != charset_ascii)
2663     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2664   coding->consumed_char += consumed_chars_base;
2665   coding->consumed = src_base - coding->source;
2666   coding->charbuf_used = charbuf - coding->charbuf;
2667 }
2668
2669
/* Store in CODES (an array of at least two elements) the leading
   codes for the emacs-mule charset ID:

     ID < 0xA0          ID itself; CODES[1] is 0, meaning "no second
                        leading code",
     0xA0 <= ID < 0xE0  0x9A followed by ID,
     0xE0 <= ID < 0xF0  0x9B followed by ID,
     0xF0 <= ID < 0xF5  0x9C followed by ID,
     0xF5 <= ID         0x9D followed by ID.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion is a single statement without a
   trailing semicolon, so the macro can be used anywhere a statement
   can, including before `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2683
2684
2685 static int
2686 encode_coding_emacs_mule (struct coding_system *coding)
2687 {
2688   int multibytep = coding->dst_multibyte;
2689   int *charbuf = coding->charbuf;
2690   int *charbuf_end = charbuf + coding->charbuf_used;
2691   unsigned char *dst = coding->destination + coding->produced;
2692   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2693   int safe_room = 8;
2694   int produced_chars = 0;
2695   Lisp_Object attrs, charset_list;
2696   int c;
2697   int preferred_charset_id = -1;
2698
2699   CODING_GET_INFO (coding, attrs, charset_list);
2700   if (! EQ (charset_list, Vemacs_mule_charset_list))
2701     {
2702       CODING_ATTR_CHARSET_LIST (attrs)
2703         = charset_list = Vemacs_mule_charset_list;
2704     }
2705
2706   while (charbuf < charbuf_end)
2707     {
2708       ASSURE_DESTINATION (safe_room);
2709       c = *charbuf++;
2710
2711       if (c < 0)
2712         {
2713           /* Handle an annotation.  */
2714           switch (*charbuf)
2715             {
2716             case CODING_ANNOTATE_COMPOSITION_MASK:
2717               /* Not yet implemented.  */
2718               break;
2719             case CODING_ANNOTATE_CHARSET_MASK:
2720               preferred_charset_id = charbuf[3];
2721               if (preferred_charset_id >= 0
2722                   && NILP (Fmemq (make_number (preferred_charset_id),
2723                                   charset_list)))
2724                 preferred_charset_id = -1;
2725               break;
2726             default:
2727               abort ();
2728             }
2729           charbuf += -c - 1;
2730           continue;
2731         }
2732
2733       if (ASCII_CHAR_P (c))
2734         EMIT_ONE_ASCII_BYTE (c);
2735       else if (CHAR_BYTE8_P (c))
2736         {
2737           c = CHAR_TO_BYTE8 (c);
2738           EMIT_ONE_BYTE (c);
2739         }
2740       else
2741         {
2742           struct charset *charset;
2743           unsigned code;
2744           int dimension;
2745           int emacs_mule_id;
2746           unsigned char leading_codes[2];
2747
2748           if (preferred_charset_id >= 0)
2749             {
2750               charset = CHARSET_FROM_ID (preferred_charset_id);
2751               if (CHAR_CHARSET_P (c, charset))
2752                 code = ENCODE_CHAR (charset, c);
2753               else
2754                 charset = char_charset (c, charset_list, &code);
2755             }
2756           else
2757             charset = char_charset (c, charset_list, &code);
2758           if (! charset)
2759             {
2760               c = coding->default_char;
2761               if (ASCII_CHAR_P (c))
2762                 {
2763                   EMIT_ONE_ASCII_BYTE (c);
2764                   continue;
2765                 }
2766               charset = char_charset (c, charset_list, &code);
2767             }
2768           dimension = CHARSET_DIMENSION (charset);
2769           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2770           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2771           EMIT_ONE_BYTE (leading_codes[0]);
2772           if (leading_codes[1])
2773             EMIT_ONE_BYTE (leading_codes[1]);
2774           if (dimension == 1)
2775             EMIT_ONE_BYTE (code | 0x80);
2776           else
2777             {
2778               code |= 0x8080;
2779               EMIT_ONE_BYTE (code >> 8);
2780               EMIT_ONE_BYTE (code & 0xFF);
2781             }
2782         }
2783     }
2784   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2785   coding->produced_char += produced_chars;
2786   coding->produced = dst - coding->destination;
2787   return 0;
2788 }
2789
2790 \f
2791 /*** 7. ISO2022 handlers ***/
2792
2793 /* The following note describes the coding system ISO2022 briefly.
2794    Since the intention of this note is to help understand the
2795    functions in this file, some parts are NOT ACCURATE or are OVERLY
2796    SIMPLIFIED.  For thorough understanding, please refer to the
2797    original document of ISO2022.  This is equivalent to the standard
2798    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2799
2800    ISO2022 provides many mechanisms to encode several character sets
2801    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2802    is encoded using bytes less than 128.  This may make the encoded
2803    text a little bit longer, but the text passes more easily through
2804    several types of gateway, some of which strip off the MSB (Most
2805    Significant Bit).
2806
2807    There are two kinds of character sets: control character sets and
2808    graphic character sets.  The former contain control characters such
2809    as `newline' and `escape' to provide control functions (control
2810    functions are also provided by escape sequences).  The latter
2811    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2812    two control character sets and many graphic character sets.
2813
2814    Graphic character sets are classified into one of the following
2815    four classes, according to the number of bytes (DIMENSION) and
2816    number of characters in one dimension (CHARS) of the set:
2817    - DIMENSION1_CHARS94
2818    - DIMENSION1_CHARS96
2819    - DIMENSION2_CHARS94
2820    - DIMENSION2_CHARS96
2821
2822    In addition, each character set is assigned an identification tag,
2823    unique for each set, called the "final character" (denoted as <F>
2824    hereafter).  The <F> of each character set is decided by ECMA(*)
2825    when it is registered in ISO.  The code range of <F> is 0x30..0x7E
2826    (0x30..0x3F are for private use only).
2827
2828    Note (*): ECMA = European Computer Manufacturers Association
2829
2830    Here are examples of graphic character sets [NAME(<F>)]:
2831         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2832         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2833         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2834         o DIMENSION2_CHARS96 -- none for the moment
2835
2836    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2837         C0 [0x00..0x1F] -- control character plane 0
2838         GL [0x20..0x7F] -- graphic character plane 0
2839         C1 [0x80..0x9F] -- control character plane 1
2840         GR [0xA0..0xFF] -- graphic character plane 1
2841
2842    A control character set is directly designated and invoked to C0 or
2843    C1 by an escape sequence.  The most common case is that:
2844    - ISO646's  control character set is designated/invoked to C0, and
2845    - ISO6429's control character set is designated/invoked to C1,
2846    and usually these designations/invocations are omitted in encoded
2847    text.  In a 7-bit environment, only C0 can be used, and a control
2848    character for C1 is encoded by an appropriate escape sequence to
2849    fit into the environment.  All control characters for C1 are
2850    defined to have corresponding escape sequences.
2851
2852    A graphic character set is at first designated to one of four
2853    graphic registers (G0 through G3), then these graphic registers are
2854    invoked to GL or GR.  These designations and invocations can be
2855    done independently.  The most common case is that G0 is invoked to
2856    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2857    these invocations and designations are omitted in encoded text.
2858    In a 7-bit environment, only GL can be used.
2859
2860    When a graphic character set of CHARS94 is invoked to GL, codes
2861    0x20 and 0x7F of the GL area work as control characters SPACE and
2862    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2863    be used.
2864
2865    There are two ways of invocation: locking-shift and single-shift.
2866    With locking-shift, the invocation lasts until the next different
2867    invocation, whereas with single-shift, the invocation affects the
2868    following character only and doesn't affect the locking-shift
2869    state.  Invocations are done by the following control characters or
2870    escape sequences:
2871
2872    ----------------------------------------------------------------------
2873    abbrev  function                  cntrl escape seq   description
2874    ----------------------------------------------------------------------
2875    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2876    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2877    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2878    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2879    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2880    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2881    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2882    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2883    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2884    ----------------------------------------------------------------------
2885    (*) These are not used by any known coding system.
2886
2887    Control characters for these functions are defined by macros
2888    ISO_CODE_XXX in `coding.h'.
2889
2890    Designations are done by the following escape sequences:
2891    ----------------------------------------------------------------------
2892    escape sequence      description
2893    ----------------------------------------------------------------------
2894    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2895    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2896    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2897    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2898    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2899    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2900    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2901    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2902    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2903    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2904    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2905    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2906    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2907    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2908    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2909    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2910    ----------------------------------------------------------------------
2911
2912    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2913    of dimension 1, chars 94, and final character <F>, etc...
2914
2915    Note (*): Although these designations are not allowed in ISO2022,
2916    Emacs accepts them on decoding, and produces them on encoding
2917    CHARS96 character sets in a coding system which is characterized as
2918    7-bit environment, non-locking-shift, and non-single-shift.
2919
2920    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2921    '(' must be omitted.  We refer to this as "short-form" hereafter.
2922
2923    Now you may notice that there are a lot of ways of encoding the
2924    same multilingual text in ISO2022.  Actually, there exist many
2925    coding systems such as Compound Text (used in X11's inter-client
2926    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2927    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2928    localized platforms), and all of these are variants of ISO2022.
2929
2930    In addition to the above, Emacs handles two more kinds of escape
2931    sequences: ISO6429's direction specification and Emacs' private
2932    sequence for specifying character composition.
2933
2934    ISO6429's direction specification takes the following form:
2935         o CSI ']'      -- end of the current direction
2936         o CSI '0' ']'  -- end of the current direction
2937         o CSI '1' ']'  -- start of left-to-right text
2938         o CSI '2' ']'  -- start of right-to-left text
2939    The control character CSI (0x9B: control sequence introducer) is
2940    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2941
2942    Character composition specification takes the following form:
2943         o ESC '0' -- start relative composition
2944         o ESC '1' -- end composition
2945         o ESC '2' -- start rule-base composition (*)
2946         o ESC '3' -- start relative composition with alternate chars  (**)
2947         o ESC '4' -- start rule-base composition with alternate chars  (**)
2948   Since these are not standard escape sequences of any ISO standard,
2949   the use of them with these meanings is restricted to Emacs only.
2950
2951   (*) This form is used only in Emacs 20.7 and older versions,
2952   but newer versions can safely decode it.
2953   (**) This form is used only in Emacs 21.1 and newer versions,
2954   and older versions can't decode it.
2955
2956   Here's a list of example usages of these composition escape
2957   sequences (categorized by `enum composition_method').
2958
2959   COMPOSITION_RELATIVE:
2960         ESC 0 CHAR [ CHAR ] ESC 1
2961   COMPOSITION_WITH_RULE:
2962         ESC 2 CHAR [ RULE CHAR ] ESC 1
2963   COMPOSITION_WITH_ALTCHARS:
2964         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2965   COMPOSITION_WITH_RULE_ALTCHARS:
2966         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2967
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value to classify each byte for the
   ISO2022 handlers -- confirm against the code that fills it in.  */
enum iso_code_class_type iso_code_class[256];
2969
/* Return non-zero if the charset ID is marked safe for CODING, i.e.
   ID lies within 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets is not 255.

   A negative ID is reported as not safe instead of being used as an
   array index: a charset table lookup can yield a negative ID for an
   unknown charset, and not every caller filters that out before
   testing it here.  ID is evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2973
2974
/* Non-zero if CODING_ISO_INITIAL for register 1 of the coding system
   stored in coding_categories[CATEGORY] is not negative.
   NOTE(review): presumably this means a charset is initially
   designated to G1, making a shift-out meaningful -- confirm.  */
#define SHIFT_OUT_OK(category)  \
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
2977
2978 static void
2979 setup_iso_safe_charsets (Lisp_Object attrs)
2980 {
2981   Lisp_Object charset_list, safe_charsets;
2982   Lisp_Object request;
2983   Lisp_Object reg_usage;
2984   Lisp_Object tail;
2985   int reg94, reg96;
2986   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2987   int max_charset_id;
2988
2989   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2990   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2991       && ! EQ (charset_list, Viso_2022_charset_list))
2992     {
2993       CODING_ATTR_CHARSET_LIST (attrs)
2994         = charset_list = Viso_2022_charset_list;
2995       ASET (attrs, coding_attr_safe_charsets, Qnil);
2996     }
2997
2998   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2999     return;
3000
3001   max_charset_id = 0;
3002   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3003     {
3004       int id = XINT (XCAR (tail));
3005       if (max_charset_id < id)
3006         max_charset_id = id;
3007     }
3008
3009   safe_charsets = make_uninit_string (max_charset_id + 1);
3010   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3011   request = AREF (attrs, coding_attr_iso_request);
3012   reg_usage = AREF (attrs, coding_attr_iso_usage);
3013   reg94 = XINT (XCAR (reg_usage));
3014   reg96 = XINT (XCDR (reg_usage));
3015
3016   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3017     {
3018       Lisp_Object id;
3019       Lisp_Object reg;
3020       struct charset *charset;
3021
3022       id = XCAR (tail);
3023       charset = CHARSET_FROM_ID (XINT (id));
3024       reg = Fcdr (Fassq (id, request));
3025       if (! NILP (reg))
3026         SSET (safe_charsets, XINT (id), XINT (reg));
3027       else if (charset->iso_chars_96)
3028         {
3029           if (reg96 < 4)
3030             SSET (safe_charsets, XINT (id), reg96);
3031         }
3032       else
3033         {
3034           if (reg94 < 4)
3035             SSET (safe_charsets, XINT (id), reg94);
3036         }
3037     }
3038   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3039 }
3040
3041
3042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3043    Check if a text is encoded in one of ISO-2022 based coding systems.
3044    If it is, return 1, else return 0.  */
3045
3046 static int
3047 detect_coding_iso_2022 (struct coding_system *coding,
3048                         struct coding_detection_info *detect_info)
3049 {
3050   const unsigned char *src = coding->source, *src_base = src;
3051   const unsigned char *src_end = coding->source + coding->src_bytes;
3052   int multibytep = coding->src_multibyte;
3053   int single_shifting = 0;
3054   int id;
3055   int c, c1;
3056   int consumed_chars = 0;
3057   int i;
3058   int rejected = 0;
3059   int found = 0;
3060   int composition_count = -1;
3061
3062   detect_info->checked |= CATEGORY_MASK_ISO;
3063
3064   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3065     {
3066       struct coding_system *this = &(coding_categories[i]);
3067       Lisp_Object attrs, val;
3068
3069       if (this->id < 0)
3070         continue;
3071       attrs = CODING_ID_ATTRS (this->id);
3072       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3073           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3074         setup_iso_safe_charsets (attrs);
3075       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3076       this->max_charset_id = SCHARS (val) - 1;
3077       this->safe_charsets = SDATA (val);
3078     }
3079
3080   /* A coding system of this category is always ASCII compatible.  */
3081   src += coding->head_ascii;
3082
3083   while (rejected != CATEGORY_MASK_ISO)
3084     {
3085       src_base = src;
3086       ONE_MORE_BYTE (c);
3087       switch (c)
3088         {
3089         case ISO_CODE_ESC:
3090           if (inhibit_iso_escape_detection)
3091             break;
3092           single_shifting = 0;
3093           ONE_MORE_BYTE (c);
3094           if (c >= '(' && c <= '/')
3095             {
3096               /* Designation sequence for a charset of dimension 1.  */
3097               ONE_MORE_BYTE (c1);
3098               if (c1 < ' ' || c1 >= 0x80
3099                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3100                 /* Invalid designation sequence.  Just ignore.  */
3101                 break;
3102             }
3103           else if (c == '$')
3104             {
3105               /* Designation sequence for a charset of dimension 2.  */
3106               ONE_MORE_BYTE (c);
3107               if (c >= '@' && c <= 'B')
3108                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3109                 id = iso_charset_table[1][0][c];
3110               else if (c >= '(' && c <= '/')
3111                 {
3112                   ONE_MORE_BYTE (c1);
3113                   if (c1 < ' ' || c1 >= 0x80
3114                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3115                     /* Invalid designation sequence.  Just ignore.  */
3116                     break;
3117                 }
3118               else
3119                 /* Invalid designation sequence.  Just ignore it.  */
3120                 break;
3121             }
3122           else if (c == 'N' || c == 'O')
3123             {
3124               /* ESC <Fe> for SS2 or SS3.  */
3125               single_shifting = 1;
3126               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3127               break;
3128             }
3129           else if (c == '1')
3130             {
3131               /* End of composition.  */
3132               if (composition_count < 0
3133                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3134                 /* Invalid */
3135                 break;
3136               composition_count = -1;
3137               found |= CATEGORY_MASK_ISO;
3138             }
3139           else if (c >= '0' && c <= '4')
3140             {
3141               /* ESC <Fp> for start/end composition.  */
3142               composition_count = 0;
3143               break;
3144             }
3145           else
3146             {
3147               /* Invalid escape sequence.  Just ignore it.  */
3148               break;
3149             }
3150
3151           /* We found a valid designation sequence for CHARSET.  */
3152           rejected |= CATEGORY_MASK_ISO_8BIT;
3153           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3154                               id))
3155             found |= CATEGORY_MASK_ISO_7;
3156           else
3157             rejected |= CATEGORY_MASK_ISO_7;
3158           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3159                               id))
3160             found |= CATEGORY_MASK_ISO_7_TIGHT;
3161           else
3162             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3163           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3164                               id))
3165             found |= CATEGORY_MASK_ISO_7_ELSE;
3166           else
3167             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3168           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3169                               id))
3170             found |= CATEGORY_MASK_ISO_8_ELSE;
3171           else
3172             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3173           break;
3174
3175         case ISO_CODE_SO:
3176         case ISO_CODE_SI:
3177           /* Locking shift out/in.  */
3178           if (inhibit_iso_escape_detection)
3179             break;
3180           single_shifting = 0;
3181           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3182           break;
3183
3184         case ISO_CODE_CSI:
3185           /* Control sequence introducer.  */
3186           single_shifting = 0;
3187           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3188           found |= CATEGORY_MASK_ISO_8_ELSE;
3189           goto check_extra_latin;
3190
3191         case ISO_CODE_SS2:
3192         case ISO_CODE_SS3:
3193           /* Single shift.   */
3194           if (inhibit_iso_escape_detection)
3195             break;
3196           single_shifting = 0;
3197           rejected |= CATEGORY_MASK_ISO_7BIT;
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_SINGLE_SHIFT)
3200             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3201           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3202               & CODING_ISO_FLAG_SINGLE_SHIFT)
3203             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3204           if (single_shifting)
3205             break;
3206           goto check_extra_latin;
3207
3208         default:
3209           if (c < 0)
3210             continue;
3211           if (c < 0x80)
3212             {
3213               if (composition_count >= 0)
3214                 composition_count++;
3215               single_shifting = 0;
3216               break;
3217             }
3218           if (c >= 0xA0)
3219             {
3220               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221               found |= CATEGORY_MASK_ISO_8_1;
3222               /* Check the length of succeeding codes of the range
3223                  0xA0..0xFF.  If the byte length is even, we include
3224                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3225                  only when we are not single shifting.  */
3226               if (! single_shifting
3227                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3228                 {
3229                   int i = 1;
3230                   while (src < src_end)
3231                     {
3232                       src_base = src;
3233                       ONE_MORE_BYTE (c);
3234                       if (c < 0xA0)
3235                         {
3236                           src = src_base;
3237                           break;
3238                         }
3239                       i++;
3240                     }
3241
3242                   if (i & 1 && src < src_end)
3243                     {
3244                       rejected |= CATEGORY_MASK_ISO_8_2;
3245                       if (composition_count >= 0)
3246                         composition_count += i;
3247                     }
3248                   else
3249                     {
3250                       found |= CATEGORY_MASK_ISO_8_2;
3251                       if (composition_count >= 0)
3252                         composition_count += i / 2;
3253                     }
3254                 }
3255               break;
3256             }
3257         check_extra_latin:
3258           single_shifting = 0;
3259           if (! VECTORP (Vlatin_extra_code_table)
3260               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3261             {
3262               rejected = CATEGORY_MASK_ISO;
3263               break;
3264             }
3265           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3266               & CODING_ISO_FLAG_LATIN_EXTRA)
3267             found |= CATEGORY_MASK_ISO_8_1;
3268           else
3269             rejected |= CATEGORY_MASK_ISO_8_1;
3270           rejected |= CATEGORY_MASK_ISO_8_2;
3271         }
3272     }
3273   detect_info->rejected |= CATEGORY_MASK_ISO;
3274   return 0;
3275
3276  no_more_source:
3277   detect_info->rejected |= rejected;
3278   detect_info->found |= (found & ~rejected);
3279   return 1;
3280 }
3281
3282
/* Record in CODING that the charset named by an ISO 2022 designation
   sequence is now designated to graphic register REG.

   DIM and CHARS_96 are handed to ISO_CHARSET_TABLE together with
   FINAL, the final byte of the escape sequence, to look the charset
   up.  CHARS_96 must be a modifiable lvalue: it is set to -1 to tell
   the invoking code that the escape sequence itself should be kept.
   That happens in two cases:

     o FINAL is outside '0'..127, ISO_CHARSET_TABLE yields a negative
       id for it, or SAFE_CHARSET_P rejects the charset for CODING.
       The designation of REG is then set to -2.
     o The designation of REG was -2 (left by an earlier invalid
       sequence) and this sequence designates ASCII to REG.

   JISX0201-Roman is replaced by ASCII when CODING has
   CODING_ISO_FLAG_USE_ROMAN set, and JISX0208.1978 by JISX0208 when
   it has CODING_ISO_FLAG_USE_OLDJIS set.

   `coding' is a variable of the invoking function.  FINAL may be any
   int expression; it is evaluated exactly once.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_final = (final);                                          \
    int desig_id, desig_prev;                                           \
                                                                        \
    if (desig_final < '0' || desig_final >= 128                         \
        || ((desig_id                                                   \
             = ISO_CHARSET_TABLE (dim, chars_96, desig_final)) < 0)     \
        || !SAFE_CHARSET_P (coding, desig_id))                          \
      {                                                                 \
        /* Invalid designation: mark REG and keep the sequence.  */     \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        desig_prev = CODING_ISO_DESIGNATION (coding, reg);              \
        if (desig_id == charset_jisx0201_roman)                         \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              desig_id = charset_ascii;                                 \
          }                                                             \
        else if (desig_id == charset_jisx0208_1978)                     \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208;                              \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id;                \
        /* If there was an invalid designation to REG previously, and   \
           this designation is ASCII to REG, we should keep this        \
           designation sequence.  */                                    \
        if (desig_prev == -2 && desig_id == charset_ascii)              \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3315
3316
3317 /* Handle these composition sequences (ALT: alternate char):
3318
3319    (1) relative composition: ESC 0 CHAR ... ESC 1
3320    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3321    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3322    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3323
3324    When the start sequence (ESC 0/2/3/4) is found, this annotation
3325    header is produced.
3326
3327         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3328
3329    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3330    produced until the end sequence (ESC 1) is found:
3331
3332    (1) CHAR ... CHAR
3333    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3334    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3335    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3336
3337    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3338    annotation header are updated as below:
3339
3340    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3341    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3342    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3343    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3344
3345    If an error is found while composing, the annotation header is
3346    changed to:
3347
3348         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3349
3350    and the sequence [ -2 DECODED-RULE ] is changed to the original
3351    byte sequence as below:
3352         o the original byte sequence is B: [ B -1 ]
3353         o the original byte sequence is B1 B2: [ B1 B2 ]
3354    and the sequence [ -1 -1 ] is changed to the original byte
3355    sequence:
3356         [ ESC '0' ]
3357 */
3358
/* Decode a composition rule whose first byte is already in C1 (a
   variable of the invoking function).  A first byte in 32..112 is a
   complete rule in the old one-byte format; a larger one introduces
   the new two-byte format, and the second byte is fetched from the
   source with ONE_MORE_BYTE.  Store the encoded rule in RULE and the
   number of source bytes it occupied in NBYTES.

   RULE and NBYTES must be modifiable lvalues.  RULE is left negative,
   and NBYTES untouched, when C1 is below 32; RULE is also negative
   whenever COMPOSITION_ENCODE_RULE yields a negative value.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int rule_byte2;                                                 \
                                                                        \
        ONE_MORE_BYTE (rule_byte2);                                     \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, rule_byte2 - 32);    \
        if (rule >= 0)                                                  \
          rule += 0x100;   /* to distinguish it from the old format */  \
        nbytes = 2;                                                     \
      }                                                                 \
    else if (rule >= 0)         /* old format (before ver.21) */        \
      {                                                                 \
        int old_gref = (rule) / 9;                                      \
        int old_nref = (rule) % 9;                                      \
                                                                        \
        /* Reference point 4 of the old format is stored as 10.  */     \
        if (old_gref == 4)                                              \
          old_gref = 10;                                                \
        if (old_nref == 4)                                              \
          old_nref = 10;                                                \
        rule = COMPOSITION_ENCODE_RULE (old_gref, old_nref);            \
        nbytes = 1;                                                     \
      }                                                                 \
  } while (0)
3389
/* Turn the decoded composition rule RULE back into the source bytes
   it was decoded from and store them in charbuf[idx] and
   charbuf[idx + 1], adding the number of restored bytes to new_chars.
   `charbuf', `idx' and `new_chars' are variables of the invoking
   function.

   A RULE below 0x100 is in the old one-byte format and is stored as
   [ BYTE -1 ]; otherwise it is in the new two-byte format and is
   stored as [ BYTE1 BYTE2 ].

   RULE may be any int expression.  It is evaluated exactly once,
   before either slot is written, so it may itself read
   charbuf[idx + 1].  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int enc_rule = (rule);                                      \
    int enc_gref = (enc_rule % 0x100) / 12;                     \
    int enc_nref = (enc_rule % 0x100) % 12;                     \
                                                                \
    if (enc_rule < 0x100)       /* old format */                \
      {                                                         \
        /* The old format stores reference point 10 as 4.  */   \
        if (enc_gref == 10) enc_gref = 4;                       \
        if (enc_nref == 10) enc_nref = 4;                       \
        charbuf[idx] = 32 + enc_gref * 9 + enc_nref;            \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + enc_gref;                      \
        charbuf[idx + 1] = 32 + enc_nref;                       \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3409
3410 /* Finish the current composition as invalid.  */
3411
3412 static int finish_composition (int *, struct composition_status *);
3413
3414 static int
3415 finish_composition (int *charbuf, struct composition_status *cmp_status)
3416 {
3417   int idx = - cmp_status->length;
3418   int new_chars;
3419
3420   /* Recover the original ESC sequence */
3421   charbuf[idx++] = ISO_CODE_ESC;
3422   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3423                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3424                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3425                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3426                     : '4');
3427   charbuf[idx++] = -2;
3428   charbuf[idx++] = 0;
3429   charbuf[idx++] = -1;
3430   new_chars = cmp_status->nchars;
3431   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3432     for (; idx < 0; idx++)
3433       {
3434         int elt = charbuf[idx];
3435
3436         if (elt == -2)
3437           {
3438             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3439             idx++;
3440           }
3441         else if (elt == -1)
3442           {
3443             charbuf[idx++] = ISO_CODE_ESC;
3444             charbuf[idx] = '0';
3445             new_chars += 2;
3446           }
3447       }
3448   cmp_status->state = COMPOSING_NO;
3449   return new_chars;
3450 }
3451
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it as invalid with finish_composition and add
   the count it returns to char_offset.  `cmp_status', `charbuf' and
   `char_offset' are variables of the invoking function.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;            /* nothing is being composed */         \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3458
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only separates them from the
   composed chars: [ -1 -1 ] is appended and the state becomes
   COMPOSING_CHAR.

   Any other start sequence first finishes (as invalid) a composition
   that is still in progress, and then starts a new one by producing
   this annotation header with ADD_COMPOSITION_DATA (layout as
   documented with the composition sequences earlier in this file):

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   `charbuf', `cmp_status', `char_offset' and `coding' are variables
   of the invoking function.  C1 may be any int expression; it is
   evaluated exactly once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int cmp_start = (c1);                                                  \
                                                                           \
    if (cmp_start == '0'                                                   \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the alternate chars; composed chars follow.  */          \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method                                                 \
          = (cmp_start == '0' ? COMPOSITION_RELATIVE                       \
             : cmp_start == '2' ? COMPOSITION_WITH_RULE                    \
             : cmp_start == '3' ? COMPOSITION_WITH_ALTCHARS                \
             : COMPOSITION_WITH_RULE_ALTCHARS);                            \
        /* ESC 0 and ESC 2 start with composed chars, ESC 3 and ESC 4      \
           with alternate (component) chars.  */                           \
        cmp_status->state                                                  \
          = (cmp_start <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);\
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3499
3500
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no composed char has been stored, or
   when the state does not fit the method: COMPOSITION_WITH_RULE must
   not end in state COMPOSING_CHAR, and every other method must end in
   it.  In that case the composition is finished as invalid and
   control jumps to the label `invalid_code' of the invoking function.

   Otherwise the annotation header, which starts cmp_status->length
   elements before `charbuf', is completed: for the two ALTCHARS
   methods the (negative) length in slot 0 is extended by ncomps + 2
   or ncomps * 3 respectively, and slot 2 receives nchars.  nchars is
   also added to char_offset, and the state becomes COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *cmp_header;                                                    \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state != COMPOSING_CHAR)                       \
            == (cmp_status->method != COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3520
/* Append the pair [ -2 RULE ] to charbuf, add 2 to
   cmp_status->length, and step cmp_status->state back by one
   (presumably from a ..._RULE state to the matching ..._CHAR state;
   this relies on the order of the state enum, which is defined
   elsewhere).  `charbuf' and `cmp_status' are variables of the
   invoking function.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3530
/* Append C, a composed char or a component (alternate) char, to
   charbuf and update cmp_status: length grows by one, and nchars is
   counted in state COMPOSING_CHAR, ncomps in any other state.  When a
   rule has to follow the char (method COMPOSITION_WITH_RULE, or
   method COMPOSITION_WITH_RULE_ALTCHARS while in state
   COMPOSING_COMPONENT_CHAR), the state is advanced by one.
   `charbuf' and `cmp_status' are variables of the invoking
   function.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3547
3548
3549 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3550
3551 static void
3552 decode_coding_iso_2022 (struct coding_system *coding)
3553 {
3554   const unsigned char *src = coding->source + coding->consumed;
3555   const unsigned char *src_end = coding->source + coding->src_bytes;
3556   const unsigned char *src_base;
3557   int *charbuf = coding->charbuf + coding->charbuf_used;
3558   /* We may produce two annotations (charset and composition) in one
3559      loop and one more charset annotation at the end.  */
3560   int *charbuf_end
3561     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3562   int consumed_chars = 0, consumed_chars_base;
3563   int multibytep = coding->src_multibyte;
3564   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3565   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3566   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3567   int charset_id_2, charset_id_3;
3568   struct charset *charset;
3569   int c;
3570   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3571   Lisp_Object attrs, charset_list;
3572   int char_offset = coding->produced_char;
3573   int last_offset = char_offset;
3574   int last_id = charset_ascii;
3575   int eol_crlf =
3576     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3577   int byte_after_cr = -1;
3578   int i;
3579
3580   CODING_GET_INFO (coding, attrs, charset_list);
3581   setup_iso_safe_charsets (attrs);
3582   /* Charset list may have been changed.  */
3583   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3584   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3585
3586   if (cmp_status->state != COMPOSING_NO)
3587     {
3588       for (i = 0; i < cmp_status->length; i++)
3589         *charbuf++ = cmp_status->carryover[i];
3590       coding->annotated = 1;
3591     }
3592
3593   while (1)
3594     {
3595       int c1, c2, c3;
3596
3597       src_base = src;
3598       consumed_chars_base = consumed_chars;
3599
3600       if (charbuf >= charbuf_end)
3601         {
3602           if (byte_after_cr >= 0)
3603             src_base--;
3604           break;
3605         }
3606
3607       if (byte_after_cr >= 0)
3608         c1 = byte_after_cr, byte_after_cr = -1;
3609       else
3610         ONE_MORE_BYTE (c1);
3611       if (c1 < 0)
3612         goto invalid_code;
3613
3614       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3615         {
3616           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3617           char_offset++;
3618           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3619           continue;
3620         }
3621
3622       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3623         {
3624           if (c1 == ISO_CODE_ESC)
3625             {
3626               if (src + 1 >= src_end)
3627                 goto no_more_source;
3628               *charbuf++ = ISO_CODE_ESC;
3629               char_offset++;
3630               if (src[0] == '%' && src[1] == '@')
3631                 {
3632                   src += 2;
3633                   consumed_chars += 2;
3634                   char_offset += 2;
3635                   /* We are sure charbuf can contain two more chars. */
3636                   *charbuf++ = '%';
3637                   *charbuf++ = '@';
3638                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3639                 }
3640             }
3641           else
3642             {
3643               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3644               char_offset++;
3645             }
3646           continue;
3647         }
3648
3649       if ((cmp_status->state == COMPOSING_RULE
3650            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3651           && c1 != ISO_CODE_ESC)
3652         {
3653           int rule, nbytes;
3654
3655           DECODE_COMPOSITION_RULE (rule, nbytes);
3656           if (rule < 0)
3657             goto invalid_code;
3658           STORE_COMPOSITION_RULE (rule);
3659           continue;
3660         }
3661
3662       /* We produce at most one character.  */
3663       switch (iso_code_class [c1])
3664         {
3665         case ISO_0x20_or_0x7F:
3666           if (charset_id_0 < 0
3667               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3668             /* This is SPACE or DEL.  */
3669             charset = CHARSET_FROM_ID (charset_ascii);
3670           else
3671             charset = CHARSET_FROM_ID (charset_id_0);
3672           break;
3673
3674         case ISO_graphic_plane_0:
3675           if (charset_id_0 < 0)
3676             charset = CHARSET_FROM_ID (charset_ascii);
3677           else
3678             charset = CHARSET_FROM_ID (charset_id_0);
3679           break;
3680
3681         case ISO_0xA0_or_0xFF:
3682           if (charset_id_1 < 0
3683               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3684               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3685             goto invalid_code;
3686           /* This is a graphic character, we fall down ... */
3687
3688         case ISO_graphic_plane_1:
3689           if (charset_id_1 < 0)
3690             goto invalid_code;
3691           charset = CHARSET_FROM_ID (charset_id_1);
3692           break;
3693
3694         case ISO_control_0:
3695           if (eol_crlf && c1 == '\r')
3696             ONE_MORE_BYTE (byte_after_cr);
3697           MAYBE_FINISH_COMPOSITION ();
3698           charset = CHARSET_FROM_ID (charset_ascii);
3699           break;
3700
3701         case ISO_control_1:
3702           goto invalid_code;
3703
3704         case ISO_shift_out:
3705           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3706               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3707             goto invalid_code;
3708           CODING_ISO_INVOCATION (coding, 0) = 1;
3709           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3710           continue;
3711
3712         case ISO_shift_in:
3713           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3714             goto invalid_code;
3715           CODING_ISO_INVOCATION (coding, 0) = 0;
3716           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3717           continue;
3718
3719         case ISO_single_shift_2_7:
3720           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3721             goto invalid_code;
3722         case ISO_single_shift_2:
3723           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3724             goto invalid_code;
3725           /* SS2 is handled as an escape sequence of ESC 'N' */
3726           c1 = 'N';
3727           goto label_escape_sequence;
3728
3729         case ISO_single_shift_3:
3730           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3731             goto invalid_code;
3732           /* SS3 is handled as an escape sequence of ESC 'O' */
3733           c1 = 'O';
3734           goto label_escape_sequence;
3735
3736         case ISO_control_sequence_introducer:
3737           /* CSI is handled as an escape sequence of ESC '[' ...  */
3738           c1 = '[';
3739           goto label_escape_sequence;
3740
3741         case ISO_escape:
3742           ONE_MORE_BYTE (c1);
3743         label_escape_sequence:
3744           /* Escape sequences handled here are invocation,
3745              designation, direction specification, and character
3746              composition specification.  */
3747           switch (c1)
3748             {
3749             case '&':           /* revision of following character set */
3750               ONE_MORE_BYTE (c1);
3751               if (!(c1 >= '@' && c1 <= '~'))
3752                 goto invalid_code;
3753               ONE_MORE_BYTE (c1);
3754               if (c1 != ISO_CODE_ESC)
3755                 goto invalid_code;
3756               ONE_MORE_BYTE (c1);
3757               goto label_escape_sequence;
3758
3759             case '$':           /* designation of 2-byte character set */
3760               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3761                 goto invalid_code;
3762               {
3763                 int reg, chars96;
3764
3765                 ONE_MORE_BYTE (c1);
3766                 if (c1 >= '@' && c1 <= 'B')
3767                   {     /* designation of JISX0208.1978, GB2312.1980,
3768                            or JISX0208.1980 */
3769                     reg = 0, chars96 = 0;
3770                   }
3771                 else if (c1 >= 0x28 && c1 <= 0x2B)
3772                   { /* designation of DIMENSION2_CHARS94 character set */
3773                     reg = c1 - 0x28, chars96 = 0;
3774                     ONE_MORE_BYTE (c1);
3775                   }
3776                 else if (c1 >= 0x2C && c1 <= 0x2F)
3777                   { /* designation of DIMENSION2_CHARS96 character set */
3778                     reg = c1 - 0x2C, chars96 = 1;
3779                     ONE_MORE_BYTE (c1);
3780                   }
3781                 else
3782                   goto invalid_code;
3783                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3784                 /* We must update these variables now.  */
3785                 if (reg == 0)
3786                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3787                 else if (reg == 1)
3788                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3789                 if (chars96 < 0)
3790                   goto invalid_code;
3791               }
3792               continue;
3793
3794             case 'n':           /* invocation of locking-shift-2 */
3795               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3796                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3797                 goto invalid_code;
3798               CODING_ISO_INVOCATION (coding, 0) = 2;
3799               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3800               continue;
3801
3802             case 'o':           /* invocation of locking-shift-3 */
3803               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3804                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3805                 goto invalid_code;
3806               CODING_ISO_INVOCATION (coding, 0) = 3;
3807               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3808               continue;
3809
3810             case 'N':           /* invocation of single-shift-2 */
3811               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3812                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3813                 goto invalid_code;
3814               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3815               if (charset_id_2 < 0)
3816                 charset = CHARSET_FROM_ID (charset_ascii);
3817               else
3818                 charset = CHARSET_FROM_ID (charset_id_2);
3819               ONE_MORE_BYTE (c1);
3820               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3821                 goto invalid_code;
3822               break;
3823
3824             case 'O':           /* invocation of single-shift-3 */
3825               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3826                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3827                 goto invalid_code;
3828               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3829               if (charset_id_3 < 0)
3830                 charset = CHARSET_FROM_ID (charset_ascii);
3831               else
3832                 charset = CHARSET_FROM_ID (charset_id_3);
3833               ONE_MORE_BYTE (c1);
3834               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3835                 goto invalid_code;
3836               break;
3837
3838             case '0': case '2': case '3': case '4': /* start composition */
3839               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3840                 goto invalid_code;
3841               if (last_id != charset_ascii)
3842                 {
3843                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3844                   last_id = charset_ascii;
3845                   last_offset = char_offset;
3846                 }
3847               DECODE_COMPOSITION_START (c1);
3848               continue;
3849
3850             case '1':           /* end composition */
3851               if (cmp_status->state == COMPOSING_NO)
3852                 goto invalid_code;
3853               DECODE_COMPOSITION_END ();
3854               continue;
3855
3856             case '[':           /* specification of direction */
3857               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3858                 goto invalid_code;
3859               /* For the moment, nested direction is not supported.
3860                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3861                  left-to-right, and nonzero means right-to-left.  */
3862               ONE_MORE_BYTE (c1);
3863               switch (c1)
3864                 {
3865                 case ']':       /* end of the current direction */
3866                   coding->mode &= ~CODING_MODE_DIRECTION;
3867
3868                 case '0':       /* end of the current direction */
3869                 case '1':       /* start of left-to-right direction */
3870                   ONE_MORE_BYTE (c1);
3871                   if (c1 == ']')
3872                     coding->mode &= ~CODING_MODE_DIRECTION;
3873                   else
3874                     goto invalid_code;
3875                   break;
3876
3877                 case '2':       /* start of right-to-left direction */
3878                   ONE_MORE_BYTE (c1);
3879                   if (c1 == ']')
3880                     coding->mode |= CODING_MODE_DIRECTION;
3881                   else
3882                     goto invalid_code;
3883                   break;
3884
3885                 default:
3886                   goto invalid_code;
3887                 }
3888               continue;
3889
3890             case '%':
3891               ONE_MORE_BYTE (c1);
3892               if (c1 == '/')
3893                 {
3894                   /* CTEXT extended segment:
3895                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3896                      We keep these bytes as is for the moment.
3897                      They may be decoded by post-read-conversion.  */
3898                   int dim, M, L;
3899                   int size;
3900
3901                   ONE_MORE_BYTE (dim);
3902                   if (dim < '0' || dim > '4')
3903                     goto invalid_code;
3904                   ONE_MORE_BYTE (M);
3905                   if (M < 128)
3906                     goto invalid_code;
3907                   ONE_MORE_BYTE (L);
3908                   if (L < 128)
3909                     goto invalid_code;
3910                   size = ((M - 128) * 128) + (L - 128);
3911                   if (charbuf + 6 > charbuf_end)
3912                     goto break_loop;
3913                   *charbuf++ = ISO_CODE_ESC;
3914                   *charbuf++ = '%';
3915                   *charbuf++ = '/';
3916                   *charbuf++ = dim;
3917                   *charbuf++ = BYTE8_TO_CHAR (M);
3918                   *charbuf++ = BYTE8_TO_CHAR (L);
3919                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3920                 }
3921               else if (c1 == 'G')
3922                 {
3923                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3924                      ESC % G --UTF-8-BYTES-- ESC % @
3925                      We keep these bytes as is for the moment.
3926                      They may be decoded by post-read-conversion.  */
3927                   if (charbuf + 3 > charbuf_end)
3928                     goto break_loop;
3929                   *charbuf++ = ISO_CODE_ESC;
3930                   *charbuf++ = '%';
3931                   *charbuf++ = 'G';
3932                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3933                 }
3934               else
3935                 goto invalid_code;
3936               continue;
3937               break;
3938
3939             default:
3940               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3941                 goto invalid_code;
3942               {
3943                 int reg, chars96;
3944
3945                 if (c1 >= 0x28 && c1 <= 0x2B)
3946                   { /* designation of DIMENSION1_CHARS94 character set */
3947                     reg = c1 - 0x28, chars96 = 0;
3948                     ONE_MORE_BYTE (c1);
3949                   }
3950                 else if (c1 >= 0x2C && c1 <= 0x2F)
3951                   { /* designation of DIMENSION1_CHARS96 character set */
3952                     reg = c1 - 0x2C, chars96 = 1;
3953                     ONE_MORE_BYTE (c1);
3954                   }
3955                 else
3956                   goto invalid_code;
3957                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3958                 /* We must update these variables now.  */
3959                 if (reg == 0)
3960                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3961                 else if (reg == 1)
3962                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3963                 if (chars96 < 0)
3964                   goto invalid_code;
3965               }
3966               continue;
3967             }
3968         }
3969
3970       if (cmp_status->state == COMPOSING_NO
3971           && charset->id != charset_ascii
3972           && last_id != charset->id)
3973         {
3974           if (last_id != charset_ascii)
3975             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3976           last_id = charset->id;
3977           last_offset = char_offset;
3978         }
3979
3980       /* Now we know CHARSET and 1st position code C1 of a character.
3981          Produce a decoded character while getting 2nd and 3rd
3982          position codes C2, C3 if necessary.  */
3983       if (CHARSET_DIMENSION (charset) > 1)
3984         {
3985           ONE_MORE_BYTE (c2);
3986           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3987               || ((c1 & 0x80) != (c2 & 0x80)))
3988             /* C2 is not in a valid range.  */
3989             goto invalid_code;
3990           if (CHARSET_DIMENSION (charset) == 2)
3991             c1 = (c1 << 8) | c2;
3992           else
3993             {
3994               ONE_MORE_BYTE (c3);
3995               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3996                   || ((c1 & 0x80) != (c3 & 0x80)))
3997                 /* C3 is not in a valid range.  */
3998                 goto invalid_code;
3999               c1 = (c1 << 16) | (c2 << 8) | c2;
4000             }
4001         }
4002       c1 &= 0x7F7F7F;
4003       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4004       if (c < 0)
4005         {
4006           MAYBE_FINISH_COMPOSITION ();
4007           for (; src_base < src; src_base++, char_offset++)
4008             {
4009               if (ASCII_BYTE_P (*src_base))
4010                 *charbuf++ = *src_base;
4011               else
4012                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4013             }
4014         }
4015       else if (cmp_status->state == COMPOSING_NO)
4016         {
4017           *charbuf++ = c;
4018           char_offset++;
4019         }
4020       else if ((cmp_status->state == COMPOSING_CHAR
4021                 ? cmp_status->nchars
4022                 : cmp_status->ncomps)
4023                >= MAX_COMPOSITION_COMPONENTS)
4024         {
4025           /* Too long composition.  */
4026           MAYBE_FINISH_COMPOSITION ();
4027           *charbuf++ = c;
4028           char_offset++;
4029         }
4030       else
4031         STORE_COMPOSITION_CHAR (c);
4032       continue;
4033
4034     invalid_code:
4035       MAYBE_FINISH_COMPOSITION ();
4036       src = src_base;
4037       consumed_chars = consumed_chars_base;
4038       ONE_MORE_BYTE (c);
4039       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4040       char_offset++;
4041       coding->errors++;
4042       continue;
4043
4044     break_loop:
4045       break;
4046     }
4047
4048  no_more_source:
4049   if (cmp_status->state != COMPOSING_NO)
4050     {
4051       if (coding->mode & CODING_MODE_LAST_BLOCK)
4052         MAYBE_FINISH_COMPOSITION ();
4053       else
4054         {
4055           charbuf -= cmp_status->length;
4056           for (i = 0; i < cmp_status->length; i++)
4057             cmp_status->carryover[i] = charbuf[i];
4058         }
4059     }
4060   else if (last_id != charset_ascii)
4061     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4062   coding->consumed_char += consumed_chars_base;
4063   coding->consumed = src_base - coding->source;
4064   coding->charbuf_used = charbuf - coding->charbuf;
4065 }
4066
4067
4068 /* ISO2022 encoding stuff.  */
4069
/*
   Saying just "ISO2022" is not enough when encoding; we have to
   specify more details.  In Emacs, each coding system of an ISO2022
   variant has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
4088
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  If CODING asks for revision numbers and CHARSET has one
   (>= 0), `ESC & <'@' + revision>' is emitted first.  For a
   multi-dimensional 94-character set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate character is
   omitted (short form) unless CODING requires the long form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    int revision = -1;                                                  \
    int c;                                                              \
                                                                        \
    /* The revision is looked up only when CODING wants it.  */         \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-dimensional set: `$' precedes the intermediate.  */    \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_LONG_FORM)                          \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* One-byte set: the intermediate alone selects REG and    */   \
        /* whether the set has 94 or 96 characters.                */   \
        c = (CHARSET_ISO_CHARS_96 (charset)                             \
             ? intermediate_char_96[reg]                                \
             : intermediate_char_94[reg]);                              \
        EMIT_ONE_ASCII_BYTE (c);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4136
4137
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  Each one also raises the single-shifting state of
   CODING so that the next character is emitted under the shift.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        /* Not a 7-bit environment: the SS2 control code itself.  */    \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* 7-bit environment: the escape-sequence form `ESC N'.  */     \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4150
4151
/* Produce the code for ISO2022 single-shift-3 and mark CODING as
   being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        /* Not a 7-bit environment: the SS3 control code itself.  */    \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* 7-bit environment: the escape-sequence form `ESC O'.  */     \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4160
4161
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  */

/* Shift-in: emit SI and note that register 0 is invoked to plane 0.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4171
4172
/* Shift-out: emit SO and note that register 1 is invoked to plane 0.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4178
4179
/* Locking-shift-2: emit `ESC n' and note that register 2 is invoked
   to plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4185
4186
/* Locking-shift-3: emit `ESC o' and note that register 3 is invoked
   to plane 0.  The final byte must be 'o'; `ESC n' is locking-shift-2
   and would make a decoder invoke register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4192
4193
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  C1 may be any expression; it is
   parenthesized at every use.  The macro loops until the character
   has actually been emitted.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just emitted; this character    */   \
        /* consumes it, so the state is cleared afterwards.        */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0: 7-bit code.  */       \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1: high bit set.  */     \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4236
4237
/* Emit a two-byte (DIMENSION2) character of CHARSET whose position
   codes are C1 and C2, first producing whatever designation and
   invocation codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by the JISX0208-1978 charset.  The body is a loop that
   exits only once the two bytes have been emitted.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to this character only.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 0: both bytes with the high bit off.  */ \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 1: both bytes with the high bit on.  */ \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Not invoked anywhere yet: invoke it (designating it to some  */  \
    /* graphic register first if needed), then go round again to    */  \
    /* emit the character itself.                                   */  \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4280
4281
/* Produce codes for character C of CHARSET.  C is converted to its
   code point in CHARSET with ENCODE_CHAR; a one-dimensional charset
   goes through ENCODE_ISO_CHARACTER_DIMENSION1, anything else through
   ENCODE_ISO_CHARACTER_DIMENSION2 with the code split into its high
   part (code >> 8) and low byte.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4291
4292
4293 /* Produce designation and invocation codes at a place pointed by DST
4294    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4295    Return new DST.  */
4296
4297 unsigned char *
4298 encode_invocation_designation (struct charset *charset,
4299                                struct coding_system *coding,
4300                                unsigned char *dst, int *p_nchars)
4301 {
4302   int multibytep = coding->dst_multibyte;
4303   int produced_chars = *p_nchars;
4304   int reg;                      /* graphic register number */
4305   int id = CHARSET_ID (charset);
4306
4307   /* At first, check designations.  */
4308   for (reg = 0; reg < 4; reg++)
4309     if (id == CODING_ISO_DESIGNATION (coding, reg))
4310       break;
4311
4312   if (reg >= 4)
4313     {
4314       /* CHARSET is not yet designated to any graphic registers.  */
4315       /* At first check the requested designation.  */
4316       reg = CODING_ISO_REQUEST (coding, id);
4317       if (reg < 0)
4318         /* Since CHARSET requests no special designation, designate it
4319            to graphic register 0.  */
4320         reg = 0;
4321
4322       ENCODE_DESIGNATION (charset, reg, coding);
4323     }
4324
4325   if (CODING_ISO_INVOCATION (coding, 0) != reg
4326       && CODING_ISO_INVOCATION (coding, 1) != reg)
4327     {
4328       /* Since the graphic register REG is not invoked to any graphic
4329          planes, invoke it to graphic plane 0.  */
4330       switch (reg)
4331         {
4332         case 0:                 /* graphic register 0 */
4333           ENCODE_SHIFT_IN;
4334           break;
4335
4336         case 1:                 /* graphic register 1 */
4337           ENCODE_SHIFT_OUT;
4338           break;
4339
4340         case 2:                 /* graphic register 2 */
4341           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4342             ENCODE_SINGLE_SHIFT_2;
4343           else
4344             ENCODE_LOCKING_SHIFT_2;
4345           break;
4346
4347         case 3:                 /* graphic register 3 */
4348           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4349             ENCODE_SINGLE_SHIFT_3;
4350           else
4351             ENCODE_LOCKING_SHIFT_3;
4352           break;
4353         }
4354     }
4355
4356   *p_nchars = produced_chars;
4357   return dst;
4358 }
4359
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: the two-byte form `ESC [' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set, the single byte
   ISO_CODE_CSI otherwise.  The flag is tested as a bit (`&'), as in
   the single-shift macros; comparing the whole flag word for equality
   would pick the 8-bit form whenever any other flag is also set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4369
4370
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by `2 ]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4376
4377
/* Emit the sequence that ends the current direction (back to
   left-to-right): a control sequence introducer followed by `0 ]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4383
4384
/* Bring the graphic planes and registers back to their initial
   state: shift in if plane 0 does not currently invoke register 0,
   then re-designate every register whose initial charset is defined
   (>= 0) but differs from what the register holds now.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* Nothing to restore for a register without an initial    */   \
        /* charset, or one that already holds it.                  */   \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4403
4404
4405 /* Produce designation sequences of charsets in the line started from
4406    SRC to a place pointed by DST, and return updated DST.
4407
4408    If the current block ends before any end-of-line, we may fail to
4409    find all the necessary designations.  */
4410
4411 static unsigned char *
4412 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4413                            int *charbuf_end, unsigned char *dst)
4414 {
4415   struct charset *charset;
4416   /* Table of charsets to be designated to each graphic register.  */
4417   int r[4];
4418   int c, found = 0, reg;
4419   int produced_chars = 0;
4420   int multibytep = coding->dst_multibyte;
4421   Lisp_Object attrs;
4422   Lisp_Object charset_list;
4423
4424   attrs = CODING_ID_ATTRS (coding->id);
4425   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4426   if (EQ (charset_list, Qiso_2022))
4427     charset_list = Viso_2022_charset_list;
4428
4429   for (reg = 0; reg < 4; reg++)
4430     r[reg] = -1;
4431
4432   while (found < 4)
4433     {
4434       int id;
4435
4436       c = *charbuf++;
4437       if (c == '\n')
4438         break;
4439       charset = char_charset (c, charset_list, NULL);
4440       id = CHARSET_ID (charset);
4441       reg = CODING_ISO_REQUEST (coding, id);
4442       if (reg >= 0 && r[reg] < 0)
4443         {
4444           found++;
4445           r[reg] = id;
4446         }
4447     }
4448
4449   if (found)
4450     {
4451       for (reg = 0; reg < 4; reg++)
4452         if (r[reg] >= 0
4453             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4454           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4455     }
4456
4457   return dst;
4458 }
4459
4460 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4461
4462 static int
4463 encode_coding_iso_2022 (struct coding_system *coding)
4464 {
4465   int multibytep = coding->dst_multibyte;
4466   int *charbuf = coding->charbuf;
4467   int *charbuf_end = charbuf + coding->charbuf_used;
4468   unsigned char *dst = coding->destination + coding->produced;
4469   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4470   int safe_room = 16;
4471   int bol_designation
4472     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4473        && CODING_ISO_BOL (coding));
4474   int produced_chars = 0;
4475   Lisp_Object attrs, eol_type, charset_list;
4476   int ascii_compatible;
4477   int c;
4478   int preferred_charset_id = -1;
4479
4480   CODING_GET_INFO (coding, attrs, charset_list);
4481   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4482   if (VECTORP (eol_type))
4483     eol_type = Qunix;
4484
4485   setup_iso_safe_charsets (attrs);
4486   /* Charset list may have been changed.  */
4487   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4488   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4489
4490   ascii_compatible
4491     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4492        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4493                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4494
4495   while (charbuf < charbuf_end)
4496     {
4497       ASSURE_DESTINATION (safe_room);
4498
4499       if (bol_designation)
4500         {
4501           unsigned char *dst_prev = dst;
4502
4503           /* We have to produce designation sequences if any now.  */
4504           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4505           bol_designation = 0;
4506           /* We are sure that designation sequences are all ASCII bytes.  */
4507           produced_chars += dst - dst_prev;
4508         }
4509
4510       c = *charbuf++;
4511
4512       if (c < 0)
4513         {
4514           /* Handle an annotation.  */
4515           switch (*charbuf)
4516             {
4517             case CODING_ANNOTATE_COMPOSITION_MASK:
4518               /* Not yet implemented.  */
4519               break;
4520             case CODING_ANNOTATE_CHARSET_MASK:
4521               preferred_charset_id = charbuf[2];
4522               if (preferred_charset_id >= 0
4523                   && NILP (Fmemq (make_number (preferred_charset_id),
4524                                   charset_list)))
4525                 preferred_charset_id = -1;
4526               break;
4527             default:
4528               abort ();
4529             }
4530           charbuf += -c - 1;
4531           continue;
4532         }
4533
4534       /* Now encode the character C.  */
4535       if (c < 0x20 || c == 0x7F)
4536         {
4537           if (c == '\n'
4538               || (c == '\r' && EQ (eol_type, Qmac)))
4539             {
4540               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4541                 ENCODE_RESET_PLANE_AND_REGISTER ();
4542               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4543                 {
4544                   int i;
4545
4546                   for (i = 0; i < 4; i++)
4547                     CODING_ISO_DESIGNATION (coding, i)
4548                       = CODING_ISO_INITIAL (coding, i);
4549                 }
4550               bol_designation
4551                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4552             }
4553           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4554             ENCODE_RESET_PLANE_AND_REGISTER ();
4555           EMIT_ONE_ASCII_BYTE (c);
4556         }
4557       else if (ASCII_CHAR_P (c))
4558         {
4559           if (ascii_compatible)
4560             EMIT_ONE_ASCII_BYTE (c);
4561           else
4562             {
4563               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4564               ENCODE_ISO_CHARACTER (charset, c);
4565             }
4566         }
4567       else if (CHAR_BYTE8_P (c))
4568         {
4569           c = CHAR_TO_BYTE8 (c);
4570           EMIT_ONE_BYTE (c);
4571         }
4572       else
4573         {
4574           struct charset *charset;
4575
4576           if (preferred_charset_id >= 0)
4577             {
4578               charset = CHARSET_FROM_ID (preferred_charset_id);
4579               if (! CHAR_CHARSET_P (c, charset))
4580                 charset = char_charset (c, charset_list, NULL);
4581             }
4582           else
4583             charset = char_charset (c, charset_list, NULL);
4584           if (!charset)
4585             {
4586               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4587                 {
4588                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4589                   charset = CHARSET_FROM_ID (charset_ascii);
4590                 }
4591               else
4592                 {
4593                   c = coding->default_char;
4594                   charset = char_charset (c, charset_list, NULL);
4595                 }
4596             }
4597           ENCODE_ISO_CHARACTER (charset, c);
4598         }
4599     }
4600
4601   if (coding->mode & CODING_MODE_LAST_BLOCK
4602       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4603     {
4604       ASSURE_DESTINATION (safe_room);
4605       ENCODE_RESET_PLANE_AND_REGISTER ();
4606     }
4607   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4608   CODING_ISO_BOL (coding) = bol_designation;
4609   coding->produced_char += produced_chars;
4610   coding->produced = dst - coding->destination;
4611   return 0;
4612 }
4613
4614 \f
4615 /*** 8. SJIS and BIG5 handlers ***/
4616
4617 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4618    quite widely.  So, for the moment, Emacs supports them in the bare
4619    C code.  But, in the future, they may be supported only by CCL.  */
4620
4621 /* SJIS is a coding system encoding three character sets: ASCII, right
4622    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4623    as is.  A character of charset katakana-jisx0201 is encoded by
4624    "position-code + 0x80".  A character of charset japanese-jisx0208
4625    is encoded in two bytes, but the two position-codes are divided and
4626    shifted so that they fit in the ranges below.
4627
4628    --- CODE RANGE of SJIS ---
4629    (character set)      (range)
4630    ASCII                0x00 .. 0x7F
4631    KATAKANA-JISX0201    0xA0 .. 0xDF
4632    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4633             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4634    -------------------------------
4635
4636 */
4637
4638 /* BIG5 is a coding system encoding two character sets: ASCII and
4639    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4640    character set and is encoded in two bytes.
4641
4642    --- CODE RANGE of BIG5 ---
4643    (character set)      (range)
4644    ASCII                0x00 .. 0x7F
4645    Big5 (1st byte)      0xA1 .. 0xFE
4646         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4647    --------------------------
4648
4649   */
4650
4651 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4652    Check if a text is encoded in SJIS.  If it is, return
4653    CATEGORY_MASK_SJIS, else return 0.  */
4654
4655 static int
4656 detect_coding_sjis (struct coding_system *coding,
4657                     struct coding_detection_info *detect_info)
4658 {
4659   const unsigned char *src = coding->source, *src_base;
4660   const unsigned char *src_end = coding->source + coding->src_bytes;
4661   int multibytep = coding->src_multibyte;
4662   int consumed_chars = 0;
4663   int found = 0;
4664   int c;
4665   Lisp_Object attrs, charset_list;
4666   int max_first_byte_of_2_byte_code;
4667
4668   CODING_GET_INFO (coding, attrs, charset_list);
4669   max_first_byte_of_2_byte_code
4670     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4671
4672   detect_info->checked |= CATEGORY_MASK_SJIS;
4673   /* A coding system of this category is always ASCII compatible.  */
4674   src += coding->head_ascii;
4675
4676   while (1)
4677     {
4678       src_base = src;
4679       ONE_MORE_BYTE (c);
4680       if (c < 0x80)
4681         continue;
4682       if ((c >= 0x81 && c <= 0x9F)
4683           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4684         {
4685           ONE_MORE_BYTE (c);
4686           if (c < 0x40 || c == 0x7F || c > 0xFC)
4687             break;
4688           found = CATEGORY_MASK_SJIS;
4689         }
4690       else if (c >= 0xA0 && c < 0xE0)
4691         found = CATEGORY_MASK_SJIS;
4692       else
4693         break;
4694     }
4695   detect_info->rejected |= CATEGORY_MASK_SJIS;
4696   return 0;
4697
4698  no_more_source:
4699   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4700     {
4701       detect_info->rejected |= CATEGORY_MASK_SJIS;
4702       return 0;
4703     }
4704   detect_info->found |= found;
4705   return 1;
4706 }
4707
4708 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4709    Check if a text is encoded in BIG5.  If it is, return
4710    CATEGORY_MASK_BIG5, else return 0.  */
4711
4712 static int
4713 detect_coding_big5 (struct coding_system *coding,
4714                     struct coding_detection_info *detect_info)
4715 {
4716   const unsigned char *src = coding->source, *src_base;
4717   const unsigned char *src_end = coding->source + coding->src_bytes;
4718   int multibytep = coding->src_multibyte;
4719   int consumed_chars = 0;
4720   int found = 0;
4721   int c;
4722
4723   detect_info->checked |= CATEGORY_MASK_BIG5;
4724   /* A coding system of this category is always ASCII compatible.  */
4725   src += coding->head_ascii;
4726
4727   while (1)
4728     {
4729       src_base = src;
4730       ONE_MORE_BYTE (c);
4731       if (c < 0x80)
4732         continue;
4733       if (c >= 0xA1)
4734         {
4735           ONE_MORE_BYTE (c);
4736           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4737             return 0;
4738           found = CATEGORY_MASK_BIG5;
4739         }
4740       else
4741         break;
4742     }
4743   detect_info->rejected |= CATEGORY_MASK_BIG5;
4744   return 0;
4745
4746  no_more_source:
4747   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4748     {
4749       detect_info->rejected |= CATEGORY_MASK_BIG5;
4750       return 0;
4751     }
4752   detect_info->found |= found;
4753   return 1;
4754 }
4755
4756 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4757    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4758
4759 static void
4760 decode_coding_sjis (struct coding_system *coding)
4761 {
4762   const unsigned char *src = coding->source + coding->consumed;
4763   const unsigned char *src_end = coding->source + coding->src_bytes;
4764   const unsigned char *src_base;
4765   int *charbuf = coding->charbuf + coding->charbuf_used;
4766   /* We may produce one charset annotation in one loop and one more at
4767      the end.  */
4768   int *charbuf_end
4769     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4770   int consumed_chars = 0, consumed_chars_base;
4771   int multibytep = coding->src_multibyte;
4772   struct charset *charset_roman, *charset_kanji, *charset_kana;
4773   struct charset *charset_kanji2;
4774   Lisp_Object attrs, charset_list, val;
4775   int char_offset = coding->produced_char;
4776   int last_offset = char_offset;
4777   int last_id = charset_ascii;
4778   int eol_crlf =
4779     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4780   int byte_after_cr = -1;
4781
4782   CODING_GET_INFO (coding, attrs, charset_list);
4783
4784   val = charset_list;
4785   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4786   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4787   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4788   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4789
4790   while (1)
4791     {
4792       int c, c1;
4793       struct charset *charset;
4794
4795       src_base = src;
4796       consumed_chars_base = consumed_chars;
4797
4798       if (charbuf >= charbuf_end)
4799         {
4800           if (byte_after_cr >= 0)
4801             src_base--;
4802           break;
4803         }
4804
4805       if (byte_after_cr >= 0)
4806         c = byte_after_cr, byte_after_cr = -1;
4807       else
4808         ONE_MORE_BYTE (c);
4809       if (c < 0)
4810         goto invalid_code;
4811       if (c < 0x80)
4812         {
4813           if (eol_crlf && c == '\r')
4814             ONE_MORE_BYTE (byte_after_cr);
4815           charset = charset_roman;
4816         }
4817       else if (c == 0x80 || c == 0xA0)
4818         goto invalid_code;
4819       else if (c >= 0xA1 && c <= 0xDF)
4820         {
4821           /* SJIS -> JISX0201-Kana */
4822           c &= 0x7F;
4823           charset = charset_kana;
4824         }
4825       else if (c <= 0xEF)
4826         {
4827           /* SJIS -> JISX0208 */
4828           ONE_MORE_BYTE (c1);
4829           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4830             goto invalid_code;
4831           c = (c << 8) | c1;
4832           SJIS_TO_JIS (c);
4833           charset = charset_kanji;
4834         }
4835       else if (c <= 0xFC && charset_kanji2)
4836         {
4837           /* SJIS -> JISX0213-2 */
4838           ONE_MORE_BYTE (c1);
4839           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4840             goto invalid_code;
4841           c = (c << 8) | c1;
4842           SJIS_TO_JIS2 (c);
4843           charset = charset_kanji2;
4844         }
4845       else
4846         goto invalid_code;
4847       if (charset->id != charset_ascii
4848           && last_id != charset->id)
4849         {
4850           if (last_id != charset_ascii)
4851             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4852           last_id = charset->id;
4853           last_offset = char_offset;
4854         }
4855       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4856       *charbuf++ = c;
4857       char_offset++;
4858       continue;
4859
4860     invalid_code:
4861       src = src_base;
4862       consumed_chars = consumed_chars_base;
4863       ONE_MORE_BYTE (c);
4864       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4865       char_offset++;
4866       coding->errors++;
4867     }
4868
4869  no_more_source:
4870   if (last_id != charset_ascii)
4871     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4872   coding->consumed_char += consumed_chars_base;
4873   coding->consumed = src_base - coding->source;
4874   coding->charbuf_used = charbuf - coding->charbuf;
4875 }
4876
4877 static void
4878 decode_coding_big5 (struct coding_system *coding)
4879 {
4880   const unsigned char *src = coding->source + coding->consumed;
4881   const unsigned char *src_end = coding->source + coding->src_bytes;
4882   const unsigned char *src_base;
4883   int *charbuf = coding->charbuf + coding->charbuf_used;
4884   /* We may produce one charset annotation in one loop and one more at
4885      the end.  */
4886   int *charbuf_end
4887     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4888   int consumed_chars = 0, consumed_chars_base;
4889   int multibytep = coding->src_multibyte;
4890   struct charset *charset_roman, *charset_big5;
4891   Lisp_Object attrs, charset_list, val;
4892   int char_offset = coding->produced_char;
4893   int last_offset = char_offset;
4894   int last_id = charset_ascii;
4895   int eol_crlf =
4896     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4897   int byte_after_cr = -1;
4898
4899   CODING_GET_INFO (coding, attrs, charset_list);
4900   val = charset_list;
4901   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4902   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4903
4904   while (1)
4905     {
4906       int c, c1;
4907       struct charset *charset;
4908
4909       src_base = src;
4910       consumed_chars_base = consumed_chars;
4911
4912       if (charbuf >= charbuf_end)
4913         {
4914           if (byte_after_cr >= 0)
4915             src_base--;
4916           break;
4917         }
4918
4919       if (byte_after_cr >= 0)
4920         c = byte_after_cr, byte_after_cr = -1;
4921       else
4922         ONE_MORE_BYTE (c);
4923
4924       if (c < 0)
4925         goto invalid_code;
4926       if (c < 0x80)
4927         {
4928           if (eol_crlf && c == '\r')
4929             ONE_MORE_BYTE (byte_after_cr);
4930           charset = charset_roman;
4931         }
4932       else
4933         {
4934           /* BIG5 -> Big5 */
4935           if (c < 0xA1 || c > 0xFE)
4936             goto invalid_code;
4937           ONE_MORE_BYTE (c1);
4938           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4939             goto invalid_code;
4940           c = c << 8 | c1;
4941           charset = charset_big5;
4942         }
4943       if (charset->id != charset_ascii
4944           && last_id != charset->id)
4945         {
4946           if (last_id != charset_ascii)
4947             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4948           last_id = charset->id;
4949           last_offset = char_offset;
4950         }
4951       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4952       *charbuf++ = c;
4953       char_offset++;
4954       continue;
4955
4956     invalid_code:
4957       src = src_base;
4958       consumed_chars = consumed_chars_base;
4959       ONE_MORE_BYTE (c);
4960       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4961       char_offset++;
4962       coding->errors++;
4963     }
4964
4965  no_more_source:
4966   if (last_id != charset_ascii)
4967     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4968   coding->consumed_char += consumed_chars_base;
4969   coding->consumed = src_base - coding->source;
4970   coding->charbuf_used = charbuf - coding->charbuf;
4971 }
4972
4973 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4974    This function can encode charsets `ascii', `katakana-jisx0201',
4975    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4976    are sure that all these charsets are registered as official charset
4977    (i.e. do not have extended leading-codes).  Characters of other
4978    charsets are produced without any encoding.  If SJIS_P is 1, encode
4979    SJIS text, else encode BIG5 text.  */
4980
4981 static int
4982 encode_coding_sjis (struct coding_system *coding)
4983 {
4984   int multibytep = coding->dst_multibyte;
4985   int *charbuf = coding->charbuf;
4986   int *charbuf_end = charbuf + coding->charbuf_used;
4987   unsigned char *dst = coding->destination + coding->produced;
4988   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4989   int safe_room = 4;
4990   int produced_chars = 0;
4991   Lisp_Object attrs, charset_list, val;
4992   int ascii_compatible;
4993   struct charset *charset_roman, *charset_kanji, *charset_kana;
4994   struct charset *charset_kanji2;
4995   int c;
4996
4997   CODING_GET_INFO (coding, attrs, charset_list);
4998   val = charset_list;
4999   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5000   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5001   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5002   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5003
5004   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5005
5006   while (charbuf < charbuf_end)
5007     {
5008       ASSURE_DESTINATION (safe_room);
5009       c = *charbuf++;
5010       /* Now encode the character C.  */
5011       if (ASCII_CHAR_P (c) && ascii_compatible)
5012         EMIT_ONE_ASCII_BYTE (c);
5013       else if (CHAR_BYTE8_P (c))
5014         {
5015           c = CHAR_TO_BYTE8 (c);
5016           EMIT_ONE_BYTE (c);
5017         }
5018       else
5019         {
5020           unsigned code;
5021           struct charset *charset = char_charset (c, charset_list, &code);
5022
5023           if (!charset)
5024             {
5025               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5026                 {
5027                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5028                   charset = CHARSET_FROM_ID (charset_ascii);
5029                 }
5030               else
5031                 {
5032                   c = coding->default_char;
5033                   charset = char_charset (c, charset_list, &code);
5034                 }
5035             }
5036           if (code == CHARSET_INVALID_CODE (charset))
5037             abort ();
5038           if (charset == charset_kanji)
5039             {
5040               int c1, c2;
5041               JIS_TO_SJIS (code);
5042               c1 = code >> 8, c2 = code & 0xFF;
5043               EMIT_TWO_BYTES (c1, c2);
5044             }
5045           else if (charset == charset_kana)
5046             EMIT_ONE_BYTE (code | 0x80);
5047           else if (charset_kanji2 && charset == charset_kanji2)
5048             {
5049               int c1, c2;
5050
5051               c1 = code >> 8;
5052               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5053                   || c1 == 0x28
5054                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5055                 {
5056                   JIS_TO_SJIS2 (code);
5057                   c1 = code >> 8, c2 = code & 0xFF;
5058                   EMIT_TWO_BYTES (c1, c2);
5059                 }
5060               else
5061                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5062             }
5063           else
5064             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5065         }
5066     }
5067   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5068   coding->produced_char += produced_chars;
5069   coding->produced = dst - coding->destination;
5070   return 0;
5071 }
5072
5073 static int
5074 encode_coding_big5 (struct coding_system *coding)
5075 {
5076   int multibytep = coding->dst_multibyte;
5077   int *charbuf = coding->charbuf;
5078   int *charbuf_end = charbuf + coding->charbuf_used;
5079   unsigned char *dst = coding->destination + coding->produced;
5080   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5081   int safe_room = 4;
5082   int produced_chars = 0;
5083   Lisp_Object attrs, charset_list, val;
5084   int ascii_compatible;
5085   struct charset *charset_roman, *charset_big5;
5086   int c;
5087
5088   CODING_GET_INFO (coding, attrs, charset_list);
5089   val = charset_list;
5090   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5091   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5092   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5093
5094   while (charbuf < charbuf_end)
5095     {
5096       ASSURE_DESTINATION (safe_room);
5097       c = *charbuf++;
5098       /* Now encode the character C.  */
5099       if (ASCII_CHAR_P (c) && ascii_compatible)
5100         EMIT_ONE_ASCII_BYTE (c);
5101       else if (CHAR_BYTE8_P (c))
5102         {
5103           c = CHAR_TO_BYTE8 (c);
5104           EMIT_ONE_BYTE (c);
5105         }
5106       else
5107         {
5108           unsigned code;
5109           struct charset *charset = char_charset (c, charset_list, &code);
5110
5111           if (! charset)
5112             {
5113               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5114                 {
5115                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5116                   charset = CHARSET_FROM_ID (charset_ascii);
5117                 }
5118               else
5119                 {
5120                   c = coding->default_char;
5121                   charset = char_charset (c, charset_list, &code);
5122                 }
5123             }
5124           if (code == CHARSET_INVALID_CODE (charset))
5125             abort ();
5126           if (charset == charset_big5)
5127             {
5128               int c1, c2;
5129
5130               c1 = code >> 8, c2 = code & 0xFF;
5131               EMIT_TWO_BYTES (c1, c2);
5132             }
5133           else
5134             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5135         }
5136     }
5137   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5138   coding->produced_char += produced_chars;
5139   coding->produced = dst - coding->destination;
5140   return 0;
5141 }
5142
5143 \f
5144 /*** 9. CCL handlers ***/
5145
5146 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5147    Check if a text is encoded in a coding system of which
5148    encoder/decoder are written in CCL program.  If it is, return
5149    CATEGORY_MASK_CCL, else return 0.  */
5150
5151 static int
5152 detect_coding_ccl (struct coding_system *coding,
5153                    struct coding_detection_info *detect_info)
5154 {
5155   const unsigned char *src = coding->source, *src_base;
5156   const unsigned char *src_end = coding->source + coding->src_bytes;
5157   int multibytep = coding->src_multibyte;
5158   int consumed_chars = 0;
5159   int found = 0;
5160   unsigned char *valids;
5161   int head_ascii = coding->head_ascii;
5162   Lisp_Object attrs;
5163
5164   detect_info->checked |= CATEGORY_MASK_CCL;
5165
5166   coding = &coding_categories[coding_category_ccl];
5167   valids = CODING_CCL_VALIDS (coding);
5168   attrs = CODING_ID_ATTRS (coding->id);
5169   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5170     src += head_ascii;
5171
5172   while (1)
5173     {
5174       int c;
5175
5176       src_base = src;
5177       ONE_MORE_BYTE (c);
5178       if (c < 0 || ! valids[c])
5179         break;
5180       if ((valids[c] > 1))
5181         found = CATEGORY_MASK_CCL;
5182     }
5183   detect_info->rejected |= CATEGORY_MASK_CCL;
5184   return 0;
5185
5186  no_more_source:
5187   detect_info->found |= found;
5188   return 1;
5189 }
5190
5191 static void
5192 decode_coding_ccl (struct coding_system *coding)
5193 {
5194   const unsigned char *src = coding->source + coding->consumed;
5195   const unsigned char *src_end = coding->source + coding->src_bytes;
5196   int *charbuf = coding->charbuf + coding->charbuf_used;
5197   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5198   int consumed_chars = 0;
5199   int multibytep = coding->src_multibyte;
5200   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5201   int source_charbuf[1024];
5202   int source_byteidx[1025];
5203   Lisp_Object attrs, charset_list;
5204
5205   CODING_GET_INFO (coding, attrs, charset_list);
5206
5207   while (1)
5208     {
5209       const unsigned char *p = src;
5210       int i = 0;
5211
5212       if (multibytep)
5213         {
5214           while (i < 1024 && p < src_end)
5215             {
5216               source_byteidx[i] = p - src;
5217               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5218             }
5219           source_byteidx[i] = p - src;
5220         }
5221       else
5222         while (i < 1024 && p < src_end)
5223           source_charbuf[i++] = *p++;
5224
5225       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5226         ccl->last_block = 1;
5227       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5228                   charset_list);
5229       charbuf += ccl->produced;
5230       if (multibytep)
5231         src += source_byteidx[ccl->consumed];
5232       else
5233         src += ccl->consumed;
5234       consumed_chars += ccl->consumed;
5235       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5236         break;
5237     }
5238
5239   switch (ccl->status)
5240     {
5241     case CCL_STAT_SUSPEND_BY_SRC:
5242       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5243       break;
5244     case CCL_STAT_SUSPEND_BY_DST:
5245       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5246       break;
5247     case CCL_STAT_QUIT:
5248     case CCL_STAT_INVALID_CMD:
5249       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5250       break;
5251     default:
5252       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5253       break;
5254     }
5255   coding->consumed_char += consumed_chars;
5256   coding->consumed = src - coding->source;
5257   coding->charbuf_used = charbuf - coding->charbuf;
5258 }
5259
5260 static int
5261 encode_coding_ccl (struct coding_system *coding)
5262 {
5263   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5264   int multibytep = coding->dst_multibyte;
5265   int *charbuf = coding->charbuf;
5266   int *charbuf_end = charbuf + coding->charbuf_used;
5267   unsigned char *dst = coding->destination + coding->produced;
5268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5269   int destination_charbuf[1024];
5270   int i, produced_chars = 0;
5271   Lisp_Object attrs, charset_list;
5272
5273   CODING_GET_INFO (coding, attrs, charset_list);
5274   if (coding->consumed_char == coding->src_chars
5275       && coding->mode & CODING_MODE_LAST_BLOCK)
5276     ccl->last_block = 1;
5277
5278   while (charbuf < charbuf_end)
5279     {
5280       ccl_driver (ccl, charbuf, destination_charbuf,
5281                   charbuf_end - charbuf, 1024, charset_list);
5282       if (multibytep)
5283         {
5284           ASSURE_DESTINATION (ccl->produced * 2);
5285           for (i = 0; i < ccl->produced; i++)
5286             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5287         }
5288       else
5289         {
5290           ASSURE_DESTINATION (ccl->produced);
5291           for (i = 0; i < ccl->produced; i++)
5292             *dst++ = destination_charbuf[i] & 0xFF;
5293           produced_chars += ccl->produced;
5294         }
5295       charbuf += ccl->consumed;
5296       if (ccl->status == CCL_STAT_QUIT
5297           || ccl->status == CCL_STAT_INVALID_CMD)
5298         break;
5299     }
5300
5301   switch (ccl->status)
5302     {
5303     case CCL_STAT_SUSPEND_BY_SRC:
5304       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5305       break;
5306     case CCL_STAT_SUSPEND_BY_DST:
5307       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5308       break;
5309     case CCL_STAT_QUIT:
5310     case CCL_STAT_INVALID_CMD:
5311       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5312       break;
5313     default:
5314       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5315       break;
5316     }
5317
5318   coding->produced_char += produced_chars;
5319   coding->produced = dst - coding->destination;
5320   return 0;
5321 }
5322
5323
5324 \f
5325 /*** 10, 11. no-conversion handlers ***/
5326
5327 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5328
5329 static void
5330 decode_coding_raw_text (struct coding_system *coding)
5331 {
5332   int eol_crlf =
5333     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5334
5335   coding->chars_at_source = 1;
5336   coding->consumed_char = coding->src_chars;
5337   coding->consumed = coding->src_bytes;
5338   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5339     {
5340       coding->consumed_char--;
5341       coding->consumed--;
5342       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5343     }
5344   else
5345     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5346 }
5347
5348 static int
5349 encode_coding_raw_text (struct coding_system *coding)
5350 {
5351   int multibytep = coding->dst_multibyte;
5352   int *charbuf = coding->charbuf;
5353   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5354   unsigned char *dst = coding->destination + coding->produced;
5355   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5356   int produced_chars = 0;
5357   int c;
5358
5359   if (multibytep)
5360     {
5361       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5362
5363       if (coding->src_multibyte)
5364         while (charbuf < charbuf_end)
5365           {
5366             ASSURE_DESTINATION (safe_room);
5367             c = *charbuf++;
5368             if (ASCII_CHAR_P (c))
5369               EMIT_ONE_ASCII_BYTE (c);
5370             else if (CHAR_BYTE8_P (c))
5371               {
5372                 c = CHAR_TO_BYTE8 (c);
5373                 EMIT_ONE_BYTE (c);
5374               }
5375             else
5376               {
5377                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5378
5379                 CHAR_STRING_ADVANCE (c, p1);
5380                 while (p0 < p1)
5381                   {
5382                     EMIT_ONE_BYTE (*p0);
5383                     p0++;
5384                   }
5385               }
5386           }
5387       else
5388         while (charbuf < charbuf_end)
5389           {
5390             ASSURE_DESTINATION (safe_room);
5391             c = *charbuf++;
5392             EMIT_ONE_BYTE (c);
5393           }
5394     }
5395   else
5396     {
5397       if (coding->src_multibyte)
5398         {
5399           int safe_room = MAX_MULTIBYTE_LENGTH;
5400
5401           while (charbuf < charbuf_end)
5402             {
5403               ASSURE_DESTINATION (safe_room);
5404               c = *charbuf++;
5405               if (ASCII_CHAR_P (c))
5406                 *dst++ = c;
5407               else if (CHAR_BYTE8_P (c))
5408                 *dst++ = CHAR_TO_BYTE8 (c);
5409               else
5410                 CHAR_STRING_ADVANCE (c, dst);
5411             }
5412         }
5413       else
5414         {
5415           ASSURE_DESTINATION (charbuf_end - charbuf);
5416           while (charbuf < charbuf_end && dst < dst_end)
5417             *dst++ = *charbuf++;
5418         }
5419       produced_chars = dst - (coding->destination + coding->produced);
5420     }
5421   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5422   coding->produced_char += produced_chars;
5423   coding->produced = dst - coding->destination;
5424   return 0;
5425 }
5426
5427 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5428    Check if a text is encoded in a charset-based coding system.  If it
5429    is, return 1, else return 0.  */
5430
5431 static int
5432 detect_coding_charset (struct coding_system *coding,
5433                        struct coding_detection_info *detect_info)
5434 {
5435   const unsigned char *src = coding->source, *src_base;
5436   const unsigned char *src_end = coding->source + coding->src_bytes;
5437   int multibytep = coding->src_multibyte;
5438   int consumed_chars = 0;
5439   Lisp_Object attrs, valids, name;
5440   int found = 0;
5441   int head_ascii = coding->head_ascii;
5442   int check_latin_extra = 0;
5443
5444   detect_info->checked |= CATEGORY_MASK_CHARSET;
5445
5446   coding = &coding_categories[coding_category_charset];
5447   attrs = CODING_ID_ATTRS (coding->id);
5448   valids = AREF (attrs, coding_attr_charset_valids);
5449   name = CODING_ID_NAME (coding->id);
5450   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5451                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5452       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5453                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5454     check_latin_extra = 1;
5455
5456   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5457     src += head_ascii;
5458
5459   while (1)
5460     {
5461       int c;
5462       Lisp_Object val;
5463       struct charset *charset;
5464       int dim, idx;
5465
5466       src_base = src;
5467       ONE_MORE_BYTE (c);
5468       if (c < 0)
5469         continue;
5470       val = AREF (valids, c);
5471       if (NILP (val))
5472         break;
5473       if (c >= 0x80)
5474         {
5475           if (c < 0xA0
5476               && check_latin_extra
5477               && (!VECTORP (Vlatin_extra_code_table)
5478                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5479             break;
5480           found = CATEGORY_MASK_CHARSET;
5481         }
5482       if (INTEGERP (val))
5483         {
5484           charset = CHARSET_FROM_ID (XFASTINT (val));
5485           dim = CHARSET_DIMENSION (charset);
5486           for (idx = 1; idx < dim; idx++)
5487             {
5488               if (src == src_end)
5489                 goto too_short;
5490               ONE_MORE_BYTE (c);
5491               if (c < charset->code_space[(dim - 1 - idx) * 2]
5492                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5493                 break;
5494             }
5495           if (idx < dim)
5496             break;
5497         }
5498       else
5499         {
5500           idx = 1;
5501           for (; CONSP (val); val = XCDR (val))
5502             {
5503               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5504               dim = CHARSET_DIMENSION (charset);
5505               while (idx < dim)
5506                 {
5507                   if (src == src_end)
5508                     goto too_short;
5509                   ONE_MORE_BYTE (c);
5510                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5511                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5512                     break;
5513                   idx++;
5514                 }
5515               if (idx == dim)
5516                 {
5517                   val = Qnil;
5518                   break;
5519                 }
5520             }
5521           if (CONSP (val))
5522             break;
5523         }
5524     }
5525  too_short:
5526   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5527   return 0;
5528
5529  no_more_source:
5530   detect_info->found |= found;
5531   return 1;
5532 }
5533
5534 static void
5535 decode_coding_charset (struct coding_system *coding)
5536 {
5537   const unsigned char *src = coding->source + coding->consumed;
5538   const unsigned char *src_end = coding->source + coding->src_bytes;
5539   const unsigned char *src_base;
5540   int *charbuf = coding->charbuf + coding->charbuf_used;
5541   /* We may produce one charset annotation in one loop and one more at
5542      the end.  */
5543   int *charbuf_end
5544     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5545   int consumed_chars = 0, consumed_chars_base;
5546   int multibytep = coding->src_multibyte;
5547   Lisp_Object attrs, charset_list, valids;
5548   int char_offset = coding->produced_char;
5549   int last_offset = char_offset;
5550   int last_id = charset_ascii;
5551   int eol_crlf =
5552     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5553   int byte_after_cr = -1;
5554
5555   CODING_GET_INFO (coding, attrs, charset_list);
5556   valids = AREF (attrs, coding_attr_charset_valids);
5557
5558   while (1)
5559     {
5560       int c;
5561       Lisp_Object val;
5562       struct charset *charset;
5563       int dim;
5564       int len = 1;
5565       unsigned code;
5566
5567       src_base = src;
5568       consumed_chars_base = consumed_chars;
5569
5570       if (charbuf >= charbuf_end)
5571         {
5572           if (byte_after_cr >= 0)
5573             src_base--;
5574           break;
5575         }
5576
5577       if (byte_after_cr >= 0)
5578         {
5579           c = byte_after_cr;
5580           byte_after_cr = -1;
5581         }
5582       else
5583         {
5584           ONE_MORE_BYTE (c);
5585           if (eol_crlf && c == '\r')
5586             ONE_MORE_BYTE (byte_after_cr);
5587         }
5588       if (c < 0)
5589         goto invalid_code;
5590       code = c;
5591
5592       val = AREF (valids, c);
5593       if (! INTEGERP (val) && ! CONSP (val))
5594         goto invalid_code;
5595       if (INTEGERP (val))
5596         {
5597           charset = CHARSET_FROM_ID (XFASTINT (val));
5598           dim = CHARSET_DIMENSION (charset);
5599           while (len < dim)
5600             {
5601               ONE_MORE_BYTE (c);
5602               code = (code << 8) | c;
5603               len++;
5604             }
5605           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5606                               charset, code, c);
5607         }
5608       else
5609         {
5610           /* VAL is a list of charset IDs.  It is assured that the
5611              list is sorted by charset dimensions (smaller one
5612              comes first).  */
5613           while (CONSP (val))
5614             {
5615               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5616               dim = CHARSET_DIMENSION (charset);
5617               while (len < dim)
5618                 {
5619                   ONE_MORE_BYTE (c);
5620                   code = (code << 8) | c;
5621                   len++;
5622                 }
5623               CODING_DECODE_CHAR (coding, src, src_base,
5624                                   src_end, charset, code, c);
5625               if (c >= 0)
5626                 break;
5627               val = XCDR (val);
5628             }
5629         }
5630       if (c < 0)
5631         goto invalid_code;
5632       if (charset->id != charset_ascii
5633           && last_id != charset->id)
5634         {
5635           if (last_id != charset_ascii)
5636             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5637           last_id = charset->id;
5638           last_offset = char_offset;
5639         }
5640
5641       *charbuf++ = c;
5642       char_offset++;
5643       continue;
5644
5645     invalid_code:
5646       src = src_base;
5647       consumed_chars = consumed_chars_base;
5648       ONE_MORE_BYTE (c);
5649       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5650       char_offset++;
5651       coding->errors++;
5652     }
5653
5654  no_more_source:
5655   if (last_id != charset_ascii)
5656     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5657   coding->consumed_char += consumed_chars_base;
5658   coding->consumed = src_base - coding->source;
5659   coding->charbuf_used = charbuf - coding->charbuf;
5660 }
5661
5662 static int
5663 encode_coding_charset (struct coding_system *coding)
5664 {
5665   int multibytep = coding->dst_multibyte;
5666   int *charbuf = coding->charbuf;
5667   int *charbuf_end = charbuf + coding->charbuf_used;
5668   unsigned char *dst = coding->destination + coding->produced;
5669   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5670   int safe_room = MAX_MULTIBYTE_LENGTH;
5671   int produced_chars = 0;
5672   Lisp_Object attrs, charset_list;
5673   int ascii_compatible;
5674   int c;
5675
5676   CODING_GET_INFO (coding, attrs, charset_list);
5677   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5678
5679   while (charbuf < charbuf_end)
5680     {
5681       struct charset *charset;
5682       unsigned code;
5683
5684       ASSURE_DESTINATION (safe_room);
5685       c = *charbuf++;
5686       if (ascii_compatible && ASCII_CHAR_P (c))
5687         EMIT_ONE_ASCII_BYTE (c);
5688       else if (CHAR_BYTE8_P (c))
5689         {
5690           c = CHAR_TO_BYTE8 (c);
5691           EMIT_ONE_BYTE (c);
5692         }
5693       else
5694         {
5695           charset = char_charset (c, charset_list, &code);
5696           if (charset)
5697             {
5698               if (CHARSET_DIMENSION (charset) == 1)
5699                 EMIT_ONE_BYTE (code);
5700               else if (CHARSET_DIMENSION (charset) == 2)
5701                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5702               else if (CHARSET_DIMENSION (charset) == 3)
5703                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5704               else
5705                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5706                                  (code >> 8) & 0xFF, code & 0xFF);
5707             }
5708           else
5709             {
5710               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5711                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5712               else
5713                 c = coding->default_char;
5714               EMIT_ONE_BYTE (c);
5715             }
5716         }
5717     }
5718
5719   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5720   coding->produced_char += produced_chars;
5721   coding->produced = dst - coding->destination;
5722   return 0;
5723 }
5724
5725 \f
5726 /*** 10. C library functions ***/
5727
5728 /* Setup coding context CODING from information about CODING_SYSTEM.
5729    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5730    CODING_SYSTEM is invalid, signal an error.  */
5731
5732 void
5733 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5734 {
5735   Lisp_Object attrs;
5736   Lisp_Object eol_type;
5737   Lisp_Object coding_type;
5738   Lisp_Object val;
5739
5740   if (NILP (coding_system))
5741     coding_system = Qundecided;
5742
5743   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5744
5745   attrs = CODING_ID_ATTRS (coding->id);
5746   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5747
5748   coding->mode = 0;
5749   coding->head_ascii = -1;
5750   if (VECTORP (eol_type))
5751     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5752                             | CODING_REQUIRE_DETECTION_MASK);
5753   else if (! EQ (eol_type, Qunix))
5754     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5755                             | CODING_REQUIRE_ENCODING_MASK);
5756   else
5757     coding->common_flags = 0;
5758   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5759     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5760   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5761     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5762   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5763     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5764
5765   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5766   coding->max_charset_id = SCHARS (val) - 1;
5767   coding->safe_charsets = SDATA (val);
5768   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5769   coding->carryover_bytes = 0;
5770
5771   coding_type = CODING_ATTR_TYPE (attrs);
5772   if (EQ (coding_type, Qundecided))
5773     {
5774       coding->detector = NULL;
5775       coding->decoder = decode_coding_raw_text;
5776       coding->encoder = encode_coding_raw_text;
5777       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5778     }
5779   else if (EQ (coding_type, Qiso_2022))
5780     {
5781       int i;
5782       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5783
5784       /* Invoke graphic register 0 to plane 0.  */
5785       CODING_ISO_INVOCATION (coding, 0) = 0;
5786       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5787       CODING_ISO_INVOCATION (coding, 1)
5788         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5789       /* Setup the initial status of designation.  */
5790       for (i = 0; i < 4; i++)
5791         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5792       /* Not single shifting initially.  */
5793       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5794       /* Beginning of buffer should also be regarded as bol. */
5795       CODING_ISO_BOL (coding) = 1;
5796       coding->detector = detect_coding_iso_2022;
5797       coding->decoder = decode_coding_iso_2022;
5798       coding->encoder = encode_coding_iso_2022;
5799       if (flags & CODING_ISO_FLAG_SAFE)
5800         coding->mode |= CODING_MODE_SAFE_ENCODING;
5801       coding->common_flags
5802         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5803             | CODING_REQUIRE_FLUSHING_MASK);
5804       if (flags & CODING_ISO_FLAG_COMPOSITION)
5805         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5806       if (flags & CODING_ISO_FLAG_DESIGNATION)
5807         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5808       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5809         {
5810           setup_iso_safe_charsets (attrs);
5811           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5812           coding->max_charset_id = SCHARS (val) - 1;
5813           coding->safe_charsets = SDATA (val);
5814         }
5815       CODING_ISO_FLAGS (coding) = flags;
5816       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5817       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5818       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5819       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5820     }
5821   else if (EQ (coding_type, Qcharset))
5822     {
5823       coding->detector = detect_coding_charset;
5824       coding->decoder = decode_coding_charset;
5825       coding->encoder = encode_coding_charset;
5826       coding->common_flags
5827         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5828     }
5829   else if (EQ (coding_type, Qutf_8))
5830     {
5831       val = AREF (attrs, coding_attr_utf_bom);
5832       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5833                                    : EQ (val, Qt) ? utf_with_bom
5834                                    : utf_without_bom);
5835       coding->detector = detect_coding_utf_8;
5836       coding->decoder = decode_coding_utf_8;
5837       coding->encoder = encode_coding_utf_8;
5838       coding->common_flags
5839         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5840       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5841         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5842     }
5843   else if (EQ (coding_type, Qutf_16))
5844     {
5845       val = AREF (attrs, coding_attr_utf_bom);
5846       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5847                                     : EQ (val, Qt) ? utf_with_bom
5848                                     : utf_without_bom);
5849       val = AREF (attrs, coding_attr_utf_16_endian);
5850       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5851                                        : utf_16_little_endian);
5852       CODING_UTF_16_SURROGATE (coding) = 0;
5853       coding->detector = detect_coding_utf_16;
5854       coding->decoder = decode_coding_utf_16;
5855       coding->encoder = encode_coding_utf_16;
5856       coding->common_flags
5857         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5858       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5859         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5860     }
5861   else if (EQ (coding_type, Qccl))
5862     {
5863       coding->detector = detect_coding_ccl;
5864       coding->decoder = decode_coding_ccl;
5865       coding->encoder = encode_coding_ccl;
5866       coding->common_flags
5867         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5868             | CODING_REQUIRE_FLUSHING_MASK);
5869     }
5870   else if (EQ (coding_type, Qemacs_mule))
5871     {
5872       coding->detector = detect_coding_emacs_mule;
5873       coding->decoder = decode_coding_emacs_mule;
5874       coding->encoder = encode_coding_emacs_mule;
5875       coding->common_flags
5876         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5877       coding->spec.emacs_mule.full_support = 1;
5878       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5879           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5880         {
5881           Lisp_Object tail, safe_charsets;
5882           int max_charset_id = 0;
5883
5884           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5885                tail = XCDR (tail))
5886             if (max_charset_id < XFASTINT (XCAR (tail)))
5887               max_charset_id = XFASTINT (XCAR (tail));
5888           safe_charsets = make_uninit_string (max_charset_id + 1);
5889           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5890           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5891                tail = XCDR (tail))
5892             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5893           coding->max_charset_id = max_charset_id;
5894           coding->safe_charsets = SDATA (safe_charsets);
5895           coding->spec.emacs_mule.full_support = 1;
5896         }
5897       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5898       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5899     }
5900   else if (EQ (coding_type, Qshift_jis))
5901     {
5902       coding->detector = detect_coding_sjis;
5903       coding->decoder = decode_coding_sjis;
5904       coding->encoder = encode_coding_sjis;
5905       coding->common_flags
5906         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5907     }
5908   else if (EQ (coding_type, Qbig5))
5909     {
5910       coding->detector = detect_coding_big5;
5911       coding->decoder = decode_coding_big5;
5912       coding->encoder = encode_coding_big5;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915     }
5916   else                          /* EQ (coding_type, Qraw_text) */
5917     {
5918       coding->detector = NULL;
5919       coding->decoder = decode_coding_raw_text;
5920       coding->encoder = encode_coding_raw_text;
5921       if (! EQ (eol_type, Qunix))
5922         {
5923           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5924           if (! VECTORP (eol_type))
5925             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5926         }
5927
5928     }
5929
5930   return;
5931 }
5932
5933 /* Return a list of charsets supported by CODING.  */
5934
5935 Lisp_Object
5936 coding_charset_list (struct coding_system *coding)
5937 {
5938   Lisp_Object attrs, charset_list;
5939
5940   CODING_GET_INFO (coding, attrs, charset_list);
5941   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5942     {
5943       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5944
5945       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5946         charset_list = Viso_2022_charset_list;
5947     }
5948   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5949     {
5950       charset_list = Vemacs_mule_charset_list;
5951     }
5952   return charset_list;
5953 }
5954
5955
5956 /* Return a list of charsets supported by CODING-SYSTEM.  */
5957
5958 Lisp_Object
5959 coding_system_charset_list (Lisp_Object coding_system)
5960 {
5961   int id;
5962   Lisp_Object attrs, charset_list;
5963
5964   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5965   attrs = CODING_ID_ATTRS (id);
5966
5967   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5968     {
5969       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5970
5971       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5972         charset_list = Viso_2022_charset_list;
5973       else
5974         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5975     }
5976   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5977     {
5978       charset_list = Vemacs_mule_charset_list;
5979     }
5980   else
5981     {
5982       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5983     }
5984   return charset_list;
5985 }
5986
5987
5988 /* Return raw-text or one of its subsidiaries that has the same
5989    eol_type as CODING-SYSTEM.  */
5990
5991 Lisp_Object
5992 raw_text_coding_system (Lisp_Object coding_system)
5993 {
5994   Lisp_Object spec, attrs;
5995   Lisp_Object eol_type, raw_text_eol_type;
5996
5997   if (NILP (coding_system))
5998     return Qraw_text;
5999   spec = CODING_SYSTEM_SPEC (coding_system);
6000   attrs = AREF (spec, 0);
6001
6002   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6003     return coding_system;
6004
6005   eol_type = AREF (spec, 2);
6006   if (VECTORP (eol_type))
6007     return Qraw_text;
6008   spec = CODING_SYSTEM_SPEC (Qraw_text);
6009   raw_text_eol_type = AREF (spec, 2);
6010   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6011           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6012           : AREF (raw_text_eol_type, 2));
6013 }
6014
6015
6016 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6017    the subsidiary that has the same eol-spec as PARENT (if it is not
6018    nil and specifies end-of-line format) or the system's setting
6019    (system_eol_type).  */
6020
6021 Lisp_Object
6022 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6023 {
6024   Lisp_Object spec, eol_type;
6025
6026   if (NILP (coding_system))
6027     coding_system = Qraw_text;
6028   spec = CODING_SYSTEM_SPEC (coding_system);
6029   eol_type = AREF (spec, 2);
6030   if (VECTORP (eol_type))
6031     {
6032       Lisp_Object parent_eol_type;
6033
6034       if (! NILP (parent))
6035         {
6036           Lisp_Object parent_spec;
6037
6038           parent_spec = CODING_SYSTEM_SPEC (parent);
6039           parent_eol_type = AREF (parent_spec, 2);
6040           if (VECTORP (parent_eol_type))
6041             parent_eol_type = system_eol_type;
6042         }
6043       else
6044         parent_eol_type = system_eol_type;
6045       if (EQ (parent_eol_type, Qunix))
6046         coding_system = AREF (eol_type, 0);
6047       else if (EQ (parent_eol_type, Qdos))
6048         coding_system = AREF (eol_type, 1);
6049       else if (EQ (parent_eol_type, Qmac))
6050         coding_system = AREF (eol_type, 2);
6051     }
6052   return coding_system;
6053 }
6054
6055
6056 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6057    decided for writing to a process.  If not, complement them, and
6058    return a new coding system.  */
6059
6060 Lisp_Object
6061 complement_process_encoding_system (Lisp_Object coding_system)
6062 {
6063   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6064   Lisp_Object spec, attrs;
6065   int i;
6066
6067   for (i = 0; i < 3; i++)
6068     {
6069       if (i == 1)
6070         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6071       else if (i == 2)
6072         coding_system = preferred_coding_system ();
6073       spec = CODING_SYSTEM_SPEC (coding_system);
6074       if (NILP (spec))
6075         continue;
6076       attrs = AREF (spec, 0);
6077       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6078         coding_base = CODING_ATTR_BASE_NAME (attrs);
6079       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6080         eol_base = coding_system;
6081       if (! NILP (coding_base) && ! NILP (eol_base))
6082         break;
6083     }
6084
6085   if (i > 0)
6086     /* The original CODING_SYSTEM didn't specify text-conversion or
6087        eol-conversion.  Be sure that we return a fully complemented
6088        coding system.  */
6089     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6090   return coding_system;
6091 }
6092
6093
6094 /* Emacs has a mechanism to automatically detect a coding system if it
6095    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6096    it's impossible to distinguish some coding systems accurately
6097    because they use the same range of codes.  So, at first, coding
6098    systems are grouped into the following categories:
6099
6100    o coding-category-emacs-mule
6101
6102         The category for a coding system which has the same code range
6103         as Emacs' internal format.  Assigned the coding-system (Lisp
6104         symbol) `emacs-mule' by default.
6105
6106    o coding-category-sjis
6107
6108         The category for a coding system which has the same code range
6109         as SJIS.  Assigned the coding-system (Lisp
6110         symbol) `japanese-shift-jis' by default.
6111
6112    o coding-category-iso-7
6113
6114         The category for a coding system which has the same code range
6115         as ISO2022 of 7-bit environment.  This doesn't use any locking
6116         shift and single shift functions.  This can encode/decode all
6117         charsets.  Assigned the coding-system (Lisp symbol)
6118         `iso-2022-7bit' by default.
6119
6120    o coding-category-iso-7-tight
6121
6122         Same as coding-category-iso-7 except that this can
6123         encode/decode only the specified charsets.
6124
6125    o coding-category-iso-8-1
6126
6127         The category for a coding system which has the same code range
6128         as ISO2022 of 8-bit environment and graphic plane 1 used only
6129         for DIMENSION1 charset.  This doesn't use any locking shift
6130         and single shift functions.  Assigned the coding-system (Lisp
6131         symbol) `iso-latin-1' by default.
6132
6133    o coding-category-iso-8-2
6134
6135         The category for a coding system which has the same code range
6136         as ISO2022 of 8-bit environment and graphic plane 1 used only
6137         for DIMENSION2 charset.  This doesn't use any locking shift
6138         and single shift functions.  Assigned the coding-system (Lisp
6139         symbol) `japanese-iso-8bit' by default.
6140
6141    o coding-category-iso-7-else
6142
6143         The category for a coding system which has the same code range
6144         as ISO2022 of 7-bit environment but uses locking shift or
6145         single shift functions.  Assigned the coding-system (Lisp
6146         symbol) `iso-2022-7bit-lock' by default.
6147
6148    o coding-category-iso-8-else
6149
6150         The category for a coding system which has the same code range
6151         as ISO2022 of 8-bit environment but uses locking shift or
6152         single shift functions.  Assigned the coding-system (Lisp
6153         symbol) `iso-2022-8bit-ss2' by default.
6154
6155    o coding-category-big5
6156
6157         The category for a coding system which has the same code range
6158         as BIG5.  Assigned the coding-system (Lisp symbol)
6159         `cn-big5' by default.
6160
6161    o coding-category-utf-8
6162
6163         The category for a coding system which has the same code range
6164         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6165         symbol) `utf-8' by default.
6166
6167    o coding-category-utf-16-be
6168
6169         The category for a coding system in which a text has a
6170         Unicode signature (cf. Unicode Standard) in the order of BIG
6171         endian at the head.  Assigned the coding-system (Lisp symbol)
6172         `utf-16-be' by default.
6173
6174    o coding-category-utf-16-le
6175
6176         The category for a coding system in which a text has an
6177         Unicode signature (cf. Unicode Standard) in the order of
6178         LITTLE endian at the head.  Assigned the coding-system (Lisp
6179         symbol) `utf-16-le' by default.
6180
6181    o coding-category-ccl
6182
6183         The category for a coding system of which encoder/decoder is
6184         written in CCL programs.  The default value is nil, i.e., no
6185         coding system is assigned.
6186
6187    o coding-category-binary
6188
6189         The category for a coding system not categorized in any of the
6190         above.  Assigned the coding-system (Lisp symbol)
6191         `no-conversion' by default.
6192
6193    Each of them is a Lisp symbol and the value is an actual
6194    `coding-system's (this is also a Lisp symbol) assigned by a user.
6195    What Emacs does actually is to detect a category of coding system.
6196    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6197    decide only one possible category, it selects a category of the
6198    highest priority.  Priorities of categories are also specified by a
6199    user in a Lisp variable `coding-category-list'.
6200
6201 */
6202
6203 #define EOL_SEEN_NONE   0  /* No end-of-line has been seen yet.  */
6204 #define EOL_SEEN_LF     1  /* A '\n' was seen.  */
6205 #define EOL_SEEN_CR     2  /* A '\r' not followed by '\n' was seen.  */
6206 #define EOL_SEEN_CRLF   4  /* A '\r' followed by '\n' was seen.  */
6207
6208 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6209    SOURCE is encoded.  If CATEGORY is one of
6210    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6211    two-byte, else they are encoded by one-byte.
6212
6213    Return one of EOL_SEEN_XXX.  */
6214
6215 #define MAX_EOL_CHECK_COUNT 3  /* detect_eol stops scanning once this many end-of-lines have been counted.  */
6216
6217 static int
6218 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6219             enum coding_category category)
6220 {
6221   const unsigned char *src = source, *src_end = src + src_bytes;
6222   unsigned char c;
6223   int total  = 0;
6224   int eol_seen = EOL_SEEN_NONE;
6225
6226   if ((1 << category) & CATEGORY_MASK_UTF_16)
6227     {
6228       int msb, lsb;
6229
6230       msb = category == (coding_category_utf_16_le
6231                          | coding_category_utf_16_le_nosig);
6232       lsb = 1 - msb;
6233
6234       while (src + 1 < src_end)
6235         {
6236           c = src[lsb];
6237           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6238             {
6239               int this_eol;
6240
6241               if (c == '\n')
6242                 this_eol = EOL_SEEN_LF;
6243               else if (src + 3 >= src_end
6244                        || src[msb + 2] != 0
6245                        || src[lsb + 2] != '\n')
6246                 this_eol = EOL_SEEN_CR;
6247               else
6248                 {
6249                   this_eol = EOL_SEEN_CRLF;
6250                   src += 2;
6251                 }
6252
6253               if (eol_seen == EOL_SEEN_NONE)
6254                 /* This is the first end-of-line.  */
6255                 eol_seen = this_eol;
6256               else if (eol_seen != this_eol)
6257                 {
6258                   /* The found type is different from what found before.
6259                      Allow for stray ^M characters in DOS EOL files.  */
6260                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6261                       || (eol_seen == EOL_SEEN_CRLF
6262                           && this_eol == EOL_SEEN_CR))
6263                     eol_seen = EOL_SEEN_CRLF;
6264                   else
6265                     {
6266                       eol_seen = EOL_SEEN_LF;
6267                       break;
6268                     }
6269                 }
6270               if (++total == MAX_EOL_CHECK_COUNT)
6271                 break;
6272             }
6273           src += 2;
6274         }
6275     }
6276   else
6277     while (src < src_end)
6278       {
6279         c = *src++;
6280         if (c == '\n' || c == '\r')
6281           {
6282             int this_eol;
6283
6284             if (c == '\n')
6285               this_eol = EOL_SEEN_LF;
6286             else if (src >= src_end || *src != '\n')
6287               this_eol = EOL_SEEN_CR;
6288             else
6289               this_eol = EOL_SEEN_CRLF, src++;
6290
6291             if (eol_seen == EOL_SEEN_NONE)
6292               /* This is the first end-of-line.  */
6293               eol_seen = this_eol;
6294             else if (eol_seen != this_eol)
6295               {
6296                 /* The found type is different from what found before.
6297                    Allow for stray ^M characters in DOS EOL files.  */
6298                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6299                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6300                   eol_seen = EOL_SEEN_CRLF;
6301                 else
6302                   {
6303                     eol_seen = EOL_SEEN_LF;
6304                     break;
6305                   }
6306               }
6307             if (++total == MAX_EOL_CHECK_COUNT)
6308               break;
6309           }
6310       }
6311   return eol_seen;
6312 }
6313
6314
6315 static Lisp_Object
6316 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6317 {
6318   Lisp_Object eol_type;
6319
6320   eol_type = CODING_ID_EOL_TYPE (coding->id);
6321   if (eol_seen & EOL_SEEN_LF)
6322     {
6323       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6324       eol_type = Qunix;
6325     }
6326   else if (eol_seen & EOL_SEEN_CRLF)
6327     {
6328       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6329       eol_type = Qdos;
6330     }
6331   else if (eol_seen & EOL_SEEN_CR)
6332     {
6333       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6334       eol_type = Qmac;
6335     }
6336   return eol_type;
6337 }
6338
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   Three cases are handled, depending on the coding system currently
   set in CODING:
     - its type is `undecided': scan the source and run the category
       detectors in priority order;
     - its category is utf-8-auto: choose between the two coding
       systems stored in the `utf-bom' attribute;
     - its category is utf-16-auto: likewise, by detected byte order.
   In all cases the consumed/produced counters are reset and
   CODING->mode is restored to its value on entry.  */

void
detect_coding (struct coding_system *coding)
{
  const unsigned char *src, *src_end;
  /* Restored on exit, after any setup_coding_system call below.  */
  int saved_mode = coding->mode;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Pre-scan the source.  CODING->head_ascii counts the bytes
         below 0x80 seen before the first 8-bit byte.  The scan stops
         early once both a null byte and an 8-bit byte were seen, or
         when the ISO-2022 detector reports a result.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried at most once
                 (detect_info.checked is zero only before it ran).  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      /* Reject every category outside the ISO escape
                         group.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* Nothing more to do when the text is plain 7-bit data with no
         null byte and no category was found.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, only the UTF-16
                     categories remain candidates.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Walk the categories in priority order and stop at the
                 first one that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is not read again
                         after this loop (THIS is used instead), so
                         the reassignment below looks like a dead
                         store -- confirm.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* I is below coding_category_raw_text exactly when one of
             the loops above stopped on a found category, in which
             case THIS points at it.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            /* Use the highest-priority category that was not
               rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* When the `utf-bom' attribute is a cons, its car is used if
         the UTF-8 signature category is found and its cdr
         otherwise.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      /* When the `utf-bom' attribute is a cons, its car is used for
         little endian and its cdr for big endian.  CODING is left
         untouched if neither byte order is found.  */
      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  coding->mode = saved_mode;
}
6520
6521
6522 static void
6523 decode_eol (struct coding_system *coding)
6524 {
6525   Lisp_Object eol_type;
6526   unsigned char *p, *pbeg, *pend;
6527
6528   eol_type = CODING_ID_EOL_TYPE (coding->id);
6529   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6530     return;
6531
6532   if (NILP (coding->dst_object))
6533     pbeg = coding->destination;
6534   else
6535     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6536   pend = pbeg + coding->produced;
6537
6538   if (VECTORP (eol_type))
6539     {
6540       int eol_seen = EOL_SEEN_NONE;
6541
6542       for (p = pbeg; p < pend; p++)
6543         {
6544           if (*p == '\n')
6545             eol_seen |= EOL_SEEN_LF;
6546           else if (*p == '\r')
6547             {
6548               if (p + 1 < pend && *(p + 1) == '\n')
6549                 {
6550                   eol_seen |= EOL_SEEN_CRLF;
6551                   p++;
6552                 }
6553               else
6554                 eol_seen |= EOL_SEEN_CR;
6555             }
6556         }
6557       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6558       if ((eol_seen & EOL_SEEN_CRLF) != 0
6559           && (eol_seen & EOL_SEEN_CR) != 0
6560           && (eol_seen & EOL_SEEN_LF) == 0)
6561         eol_seen = EOL_SEEN_CRLF;
6562       else if (eol_seen != EOL_SEEN_NONE
6563           && eol_seen != EOL_SEEN_LF
6564           && eol_seen != EOL_SEEN_CRLF
6565           && eol_seen != EOL_SEEN_CR)
6566         eol_seen = EOL_SEEN_LF;
6567       if (eol_seen != EOL_SEEN_NONE)
6568         eol_type = adjust_coding_eol_type (coding, eol_seen);
6569     }
6570
6571   if (EQ (eol_type, Qmac))
6572     {
6573       for (p = pbeg; p < pend; p++)
6574         if (*p == '\r')
6575           *p = '\n';
6576     }
6577   else if (EQ (eol_type, Qdos))
6578     {
6579       int n = 0;
6580
6581       if (NILP (coding->dst_object))
6582         {
6583           /* Start deleting '\r' from the tail to minimize the memory
6584              movement.  */
6585           for (p = pend - 2; p >= pbeg; p--)
6586             if (*p == '\r')
6587               {
6588                 memmove (p, p + 1, pend-- - p - 1);
6589                 n++;
6590               }
6591         }
6592       else
6593         {
6594           int pos_byte = coding->dst_pos_byte;
6595           int pos = coding->dst_pos;
6596           int pos_end = pos + coding->produced_char - 1;
6597
6598           while (pos < pos_end)
6599             {
6600               p = BYTE_POS_ADDR (pos_byte);
6601               if (*p == '\r' && p[1] == '\n')
6602                 {
6603                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6604                   n++;
6605                   pos_end--;
6606                 }
6607               pos++;
6608               if (coding->dst_multibyte)
6609                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6610               else
6611                 pos_byte++;
6612             }
6613         }
6614       coding->produced -= n;
6615       coding->produced_char -= n;
6616     }
6617 }
6618
6619
6620 /* Return a translation table (or list of them) from coding system
6621    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6622    decoding (ENCODEP is zero). */
6623
6624 static Lisp_Object
6625 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6626 {
6627   Lisp_Object standard, translation_table;
6628   Lisp_Object val;
6629
6630   if (NILP (Venable_character_translation))
6631     {
6632       if (max_lookup)
6633         *max_lookup = 0;
6634       return Qnil;
6635     }
6636   if (encodep)
6637     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6638       standard = Vstandard_translation_table_for_encode;
6639   else
6640     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6641       standard = Vstandard_translation_table_for_decode;
6642   if (NILP (translation_table))
6643     translation_table = standard;
6644   else
6645     {
6646       if (SYMBOLP (translation_table))
6647         translation_table = Fget (translation_table, Qtranslation_table);
6648       else if (CONSP (translation_table))
6649         {
6650           translation_table = Fcopy_sequence (translation_table);
6651           for (val = translation_table; CONSP (val); val = XCDR (val))
6652             if (SYMBOLP (XCAR (val)))
6653               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6654         }
6655       if (CHAR_TABLE_P (standard))
6656         {
6657           if (CONSP (translation_table))
6658             translation_table = nconc2 (translation_table,
6659                                         Fcons (standard, Qnil));
6660           else
6661             translation_table = Fcons (translation_table,
6662                                        Fcons (standard, Qnil));
6663         }
6664     }
6665
6666   if (max_lookup)
6667     {
6668       *max_lookup = 1;
6669       if (CHAR_TABLE_P (translation_table)
6670           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6671         {
6672           val = XCHAR_TABLE (translation_table)->extras[1];
6673           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6674             *max_lookup = XFASTINT (val);
6675         }
6676       else if (CONSP (translation_table))
6677         {
6678           Lisp_Object tail, val;
6679
6680           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6681             if (CHAR_TABLE_P (XCAR (tail))
6682                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6683               {
6684                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6685                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6686                   *max_lookup = XFASTINT (val);
6687               }
6688         }
6689     }
6690   return translation_table;
6691 }
6692
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set C and TRANS accordingly.

   TRANS is first set to Qnil.  Each time a lookup yields a character,
   C is replaced by that character and TRANS is reset to Qnil.  With a
   single char-table, any other lookup result is left in TRANS.  With
   a list, the char-tables are consulted in order, each one with the
   current (possibly already replaced) value of C, and the walk stops
   at the first result that is neither nil nor a character, which is
   left in TRANS; elements that are not char-tables are skipped.

   C and TRANS are assigned to, so both must be lvalues; TABLE and C
   are evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6717
6718
6719 /* Return a translation of character(s) at BUF according to TRANS.
6720    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6721    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6722    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6723    translation is found, and Qnil if not found..
6724    If BUF is too short to lookup characters in FROM, return Qt.  */
6725
6726 static Lisp_Object
6727 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6728 {
6729
6730   if (INTEGERP (trans))
6731     return trans;
6732   for (; CONSP (trans); trans = XCDR (trans))
6733     {
6734       Lisp_Object val = XCAR (trans);
6735       Lisp_Object from = XCAR (val);
6736       int len = ASIZE (from);
6737       int i;
6738
6739       for (i = 0; i < len; i++)
6740         {
6741           if (buf + i == buf_end)
6742             return Qt;
6743           if (XINT (AREF (from, i)) != buf[i])
6744             break;
6745         }
6746       if (i == len)
6747         return val;
6748     }
6749   return Qnil;
6750 }
6751
6752
/* Write decoded characters to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf, translated through TRANSLATION_TABLE, and stored
   as text; negative elements of the buffer introduce annotation data
   and are skipped.  Otherwise the CODING->consumed bytes at
   CODING->source are copied, converting between the multibyte and
   unibyte representations when source and destination differ in
   that respect.

   LAST_BLOCK nonzero means no more source follows: a translation
   that would need characters beyond the end of the buffer is then
   not waited for.

   CODING->produced and CODING->produced_char are advanced, and if
   the destination is a buffer the new bytes are made part of it with
   insert_from_gap.  Return the number of elements left unprocessed
   at the end of CODING->charbuf (always 0 in the chars_at_source
   case).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      if (EQ (coding->src_object, coding->dst_object))
        {
          /* Source and destination are the same object: limit the
             writable area to the part of the source that has already
             been consumed.  */
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* FROM_NCHARS source characters are replaced by
                 TO_NCHARS destination characters.  */
              int from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  else if (EQ (trans, Qt) && ! last_block)
                    /* More source is needed to decide; leave the
                       rest of the buffer as carryover.  */
                    break;
                }

              /* Make sure there is room for TO_NCHARS characters of
                 maximum length.  */
              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* C already holds the first character; the others
                     come from the TO vector.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination.
                 NOTE(review): MULTIBYTEP, CONSUMED_CHARS, SRC_BASE
                 and the label no_more_source are not referenced
                 directly here; presumably ONE_MORE_BYTE uses them
                 and jumps to the label when the source is exhausted
                 -- confirm against the macro definition.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* When converting in place, the bytes just
                         read become available for output.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* Keep SRC as an offset because the calls
                             below may change CODING->source.  */
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  */
            while (src < src_end)
              {
                int multibytep = 1;
                int c = *src++;

                /* Room for two bytes is required for each source
                   byte.  */
                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Source and destination use the same representation:
             plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6946
6947 /* Compose text in CODING->object according to the annotation data at
6948    CHARBUF.  CHARBUF is an array:
6949      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6950  */
6951
6952 static INLINE void
6953 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6954 {
6955   int len;
6956   EMACS_INT to;
6957   enum composition_method method;
6958   Lisp_Object components;
6959
6960   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6961   to = pos + charbuf[2];
6962   method = (enum composition_method) (charbuf[4]);
6963
6964   if (method == COMPOSITION_RELATIVE)
6965     components = Qnil;
6966   else
6967     {
6968       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6969       int i, j;
6970
6971       if (method == COMPOSITION_WITH_RULE)
6972         len = charbuf[2] * 3 - 2;
6973       charbuf += MAX_ANNOTATION_LENGTH;
6974       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6975       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6976         {
6977           if (charbuf[i] >= 0)
6978             args[j] = make_number (charbuf[i]);
6979           else
6980             {
6981               i++;
6982               args[j] = make_number (charbuf[i] % 0x100);
6983             }
6984         }
6985       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6986     }
6987   compose_text (pos, to, components, Qnil, coding->dst_object);
6988 }
6989
6990
6991 /* Put `charset' property on text in CODING->object according to
6992    the annotation data at CHARBUF.  CHARBUF is an array:
6993      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6994  */
6995
6996 static INLINE void
6997 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6998 {
6999   EMACS_INT from = pos - charbuf[2];
7000   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7001
7002   Fput_text_property (make_number (from), make_number (pos),
7003                       Qcharset, CHARSET_NAME (charset),
7004                       coding->dst_object);
7005 }
7006
7007
/* Number of `int' elements requested first for the conversion work
   buffer CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work buffer CODING->charbuf with alloca and record
   its size, in elements, in CODING->charbuf_size.

   The request starts at CHARBUF_SIZE elements and is halved after
   each failure for as long as the size stays above 1024.  If no
   buffer was obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the *calling function* returns CODING->result, so this macro
   may only be used in a function returning that type.  Because the
   memory comes from alloca, it belongs to the stack frame of the
   function that expands the macro.

   NOTE(review): alloca is normally not documented to report failure
   by returning NULL, in which case the retry loop and the
   failure branch would never run -- confirm for the supported
   platforms.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
7029
7030
7031 static void
7032 produce_annotation (struct coding_system *coding, EMACS_INT pos)
7033 {
7034   int *charbuf = coding->charbuf;
7035   int *charbuf_end = charbuf + coding->charbuf_used;
7036
7037   if (NILP (coding->dst_object))
7038     return;
7039
7040   while (charbuf < charbuf_end)
7041     {
7042       if (*charbuf >= 0)
7043         pos++, charbuf++;
7044       else
7045         {
7046           int len = -*charbuf;
7047
7048           if (len > 2)
7049             switch (charbuf[1])
7050               {
7051               case CODING_ANNOTATE_COMPOSITION_MASK:
7052                 produce_composition (coding, charbuf, pos);
7053                 break;
7054               case CODING_ANNOTATE_CHARSET_MASK:
7055                 produce_charset (coding, charbuf, pos);
7056                 break;
7057               }
7058           charbuf += len;
7059         }
7060     }
7061 }
7062
7063 /* Decode the data at CODING->src_object into CODING->dst_object.
7064    CODING->src_object is a buffer, a string, or nil.
7065    CODING->dst_object is a buffer.
7066
7067    If CODING->src_object is a buffer, it must be the current buffer.
7068    In this case, if CODING->src_pos is positive, it is a position of
7069    the source text in the buffer, otherwise, the source text is in the
7070    gap area of the buffer, and CODING->src_pos specifies the offset of
7071    the text from GPT (which must be the same as PT).  If this is the
7072    same buffer as CODING->dst_object, CODING->src_pos must be
7073    negative.
7074
7075    If CODING->src_object is a string, CODING->src_pos is an index to
7076    that string.
7077
7078    If CODING->src_object is nil, CODING->source must already point to
7079    the non-relocatable memory area.  In this case, CODING->src_pos is
7080    an offset from CODING->source.
7081
7082    The decoded data is inserted at the current point of the buffer
7083    CODING->dst_object.
7084 */
7085
7086 static int
7087 decode_coding (struct coding_system *coding)
7088 {
7089   Lisp_Object attrs;
7090   Lisp_Object undo_list;
7091   Lisp_Object translation_table;
7092   struct ccl_spec cclspec;
7093   int carryover;
7094   int i;
7095
7096   if (BUFFERP (coding->src_object)
7097       && coding->src_pos > 0
7098       && coding->src_pos < GPT
7099       && coding->src_pos + coding->src_chars > GPT)
7100     move_gap_both (coding->src_pos, coding->src_pos_byte);
7101
7102   undo_list = Qt;
7103   if (BUFFERP (coding->dst_object))
7104     {
7105       if (current_buffer != XBUFFER (coding->dst_object))
7106         set_buffer_internal (XBUFFER (coding->dst_object));
7107       if (GPT != PT)
7108         move_gap_both (PT, PT_BYTE);
7109       undo_list = current_buffer->undo_list;
7110       current_buffer->undo_list = Qt;
7111     }
7112
7113   coding->consumed = coding->consumed_char = 0;
7114   coding->produced = coding->produced_char = 0;
7115   coding->chars_at_source = 0;
7116   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7117   coding->errors = 0;
7118
7119   ALLOC_CONVERSION_WORK_AREA (coding);
7120
7121   attrs = CODING_ID_ATTRS (coding->id);
7122   translation_table = get_translation_table (attrs, 0, NULL);
7123
7124   carryover = 0;
7125   if (coding->decoder == decode_coding_ccl)
7126     {
7127       coding->spec.ccl = &cclspec;
7128       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7129     }
7130   do
7131     {
7132       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7133
7134       coding_set_source (coding);
7135       coding->annotated = 0;
7136       coding->charbuf_used = carryover;
7137       (*(coding->decoder)) (coding);
7138       coding_set_destination (coding);
7139       carryover = produce_chars (coding, translation_table, 0);
7140       if (coding->annotated)
7141         produce_annotation (coding, pos);
7142       for (i = 0; i < carryover; i++)
7143         coding->charbuf[i]
7144           = coding->charbuf[coding->charbuf_used - carryover + i];
7145     }
7146   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7147          || (coding->consumed < coding->src_bytes
7148              && (coding->result == CODING_RESULT_SUCCESS
7149                  || coding->result == CODING_RESULT_INVALID_SRC)));
7150
7151   if (carryover > 0)
7152     {
7153       coding_set_destination (coding);
7154       coding->charbuf_used = carryover;
7155       produce_chars (coding, translation_table, 1);
7156     }
7157
7158   coding->carryover_bytes = 0;
7159   if (coding->consumed < coding->src_bytes)
7160     {
7161       int nbytes = coding->src_bytes - coding->consumed;
7162       const unsigned char *src;
7163
7164       coding_set_source (coding);
7165       coding_set_destination (coding);
7166       src = coding->source + coding->consumed;
7167
7168       if (coding->mode & CODING_MODE_LAST_BLOCK)
7169         {
7170           /* Flush out unprocessed data as binary chars.  We are sure
7171              that the number of data is less than the size of
7172              coding->charbuf.  */
7173           coding->charbuf_used = 0;
7174           coding->chars_at_source = 0;
7175
7176           while (nbytes-- > 0)
7177             {
7178               int c = *src++;
7179
7180               if (c & 0x80)
7181                 c = BYTE8_TO_CHAR (c);
7182               coding->charbuf[coding->charbuf_used++] = c;
7183             }
7184           produce_chars (coding, Qnil, 1);
7185         }
7186       else
7187         {
7188           /* Record unprocessed bytes in coding->carryover.  We are
7189              sure that the number of data is less than the size of
7190              coding->carryover.  */
7191           unsigned char *p = coding->carryover;
7192
7193           if (nbytes > sizeof coding->carryover)
7194             nbytes = sizeof coding->carryover;
7195           coding->carryover_bytes = nbytes;
7196           while (nbytes-- > 0)
7197             *p++ = *src++;
7198         }
7199       coding->consumed = coding->src_bytes;
7200     }
7201
7202   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7203       && !inhibit_eol_conversion)
7204     decode_eol (coding);
7205   if (BUFFERP (coding->dst_object))
7206     {
7207       current_buffer->undo_list = undo_list;
7208       record_insert (coding->dst_pos, coding->produced_char);
7209     }
7210   return coding->result;
7211 }
7212
7213
7214 /* Extract an annotation datum from a composition starting at POS and
7215    ending before LIMIT of CODING->src_object (buffer or string), store
7216    the data in BUF, set *STOP to a starting position of the next
7217    composition (if any) or to LIMIT, and return the address of the
7218    next element of BUF.
7219
7220    If such an annotation is not found, set *STOP to a starting
7221    position of a composition after POS (if any) or to LIMIT, and
7222    return BUF.  */
7223
7224 static INLINE int *
7225 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7226                                struct coding_system *coding, int *buf,
7227                                EMACS_INT *stop)
7228 {
7229   EMACS_INT start, end;
7230   Lisp_Object prop;
7231
7232   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7233       || end > limit)
7234     *stop = limit;
7235   else if (start > pos)
7236     *stop = start;
7237   else
7238     {
7239       if (start == pos)
7240         {
7241           /* We found a composition.  Store the corresponding
7242              annotation data in BUF.  */
7243           int *head = buf;
7244           enum composition_method method = COMPOSITION_METHOD (prop);
7245           int nchars = COMPOSITION_LENGTH (prop);
7246
7247           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7248           if (method != COMPOSITION_RELATIVE)
7249             {
7250               Lisp_Object components;
7251               int len, i, i_byte;
7252
7253               components = COMPOSITION_COMPONENTS (prop);
7254               if (VECTORP (components))
7255                 {
7256                   len = XVECTOR (components)->size;
7257                   for (i = 0; i < len; i++)
7258                     *buf++ = XINT (AREF (components, i));
7259                 }
7260               else if (STRINGP (components))
7261                 {
7262                   len = SCHARS (components);
7263                   i = i_byte = 0;
7264                   while (i < len)
7265                     {
7266                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7267                       buf++;
7268                     }
7269                 }
7270               else if (INTEGERP (components))
7271                 {
7272                   len = 1;
7273                   *buf++ = XINT (components);
7274                 }
7275               else if (CONSP (components))
7276                 {
7277                   for (len = 0; CONSP (components);
7278                        len++, components = XCDR (components))
7279                     *buf++ = XINT (XCAR (components));
7280                 }
7281               else
7282                 abort ();
7283               *head -= len;
7284             }
7285         }
7286
7287       if (find_composition (end, limit, &start, &end, &prop,
7288                             coding->src_object)
7289           && end <= limit)
7290         *stop = start;
7291       else
7292         *stop = limit;
7293     }
7294   return buf;
7295 }
7296
7297
7298 /* Extract an annotation datum from a text property `charset' at POS of
7299    CODING->src_object (buffer of string), store the data in BUF, set
7300    *STOP to the position where the value of `charset' property changes
7301    (limiting by LIMIT), and return the address of the next element of
7302    BUF.
7303
7304    If the property value is nil, set *STOP to the position where the
7305    property value is non-nil (limiting by LIMIT), and return BUF.  */
7306
7307 static INLINE int *
7308 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7309                            struct coding_system *coding, int *buf,
7310                            EMACS_INT *stop)
7311 {
7312   Lisp_Object val, next;
7313   int id;
7314
7315   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7316   if (! NILP (val) && CHARSETP (val))
7317     id = XINT (CHARSET_SYMBOL_ID (val));
7318   else
7319     id = -1;
7320   ADD_CHARSET_DATA (buf, 0, id);
7321   next = Fnext_single_property_change (make_number (pos), Qcharset,
7322                                        coding->src_object,
7323                                        make_number (limit));
7324   *stop = XINT (next);
7325   return buf;
7326 }
7327
7328
/* Fill CODING->charbuf with characters read from CODING->source,
   starting at byte offset CODING->consumed (character position
   CODING->src_pos + CODING->consumed_char).

   While reading: store a charset annotation each time POS reaches a
   charset stop position, apply end-of-line conversion according to the
   eol-type of the coding system (unless inhibit_eol_conversion is
   set), and, if TRANSLATION_TABLE is non-nil, translate characters
   through it, looking ahead at most MAX_LOOKUP characters.

   Reading stops when the end of the source text is reached, when the
   work buffer (less a reserve of 1 + MAX_ANNOTATION_LENGTH elements)
   is full, or when a translation result does not fit or has an
   unexpected type.  On return CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated, and CODING->chars_at_source
   is 0.  */
7329 static void
7330 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7331                int max_lookup)
7332 {
7333   int *buf = coding->charbuf;
7334   int *buf_end = coding->charbuf + coding->charbuf_size;
7335   const unsigned char *src = coding->source + coding->consumed;
7336   const unsigned char *src_end = coding->source + coding->src_bytes;
7337   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7338   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7339   int multibytep = coding->src_multibyte;
7340   Lisp_Object eol_type;
7341   int c;
7342   EMACS_INT stop, stop_composition, stop_charset;
7343   int *lookup_buf = NULL;
7344
       /* Stack-allocated scratch area for the look-ahead characters
          handed to get_translation; only needed with a translation
          table.  */
7345   if (! NILP (translation_table))
7346     lookup_buf = alloca (sizeof (int) * max_lookup);
7347
       /* An eol-type that is still a vector is treated like `unix',
          i.e. no end-of-line conversion is done.  */
7348   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7349   if (VECTORP (eol_type))
7350     eol_type = Qunix;
7351
7352   /* Note: composition handling is not yet implemented.  */
7353   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7354
       /* STOP is the next position at which an annotation has to be
          looked up.  With a nil source object there are no annotations,
          so every stop is END_POS.  Because the composition flag was
          cleared just above, the composition test below is always
          false here.  */
7355   if (NILP (coding->src_object))
7356     stop = stop_composition = stop_charset = end_pos;
7357   else
7358     {
7359       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7360         stop = stop_composition = pos;
7361       else
7362         stop = stop_composition = end_pos;
7363       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7364         stop = stop_charset = pos;
7365       else
7366         stop_charset = end_pos;
7367     }
7368
7369   /* Compensate for CRLF and conversion.  */
7370   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7371   while (buf < buf_end)
7372     {
7373       Lisp_Object trans;
7374
7375       if (pos == stop)
7376         {
               /* End of the source text: nothing more to read.  */
7377           if (pos == end_pos)
7378             break;
7379           if (pos == stop_composition)
7380             buf = handle_composition_annotation (pos, end_pos, coding,
7381                                                  buf, &stop_composition);
7382           if (pos == stop_charset)
7383             buf = handle_charset_annotation (pos, end_pos, coding,
7384                                              buf, &stop_charset);
               /* The next stop is the nearer of the two.  */
7385           stop = (stop_composition < stop_charset
7386                   ? stop_composition : stop_charset);
7387         }
7388
           /* Fetch the next character C.  For a unibyte source: the
              raw-text and CCL encoders take each byte as is; otherwise
              a byte sequence that MULTIBYTE_LENGTH accepts is read as
              one character (POS advances by its byte length), and any
              other byte is converted with BYTE8_TO_CHAR.  */
7389       if (! multibytep)
7390         {
7391           EMACS_INT bytes;
7392
7393           if (coding->encoder == encode_coding_raw_text
7394               || coding->encoder == encode_coding_ccl)
7395             c = *src++, pos++;
7396           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7397             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7398           else
7399             c = BYTE8_TO_CHAR (*src), src++, pos++;
7400         }
7401       else
7402         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* In selective-display mode a carriage return is treated as
              a newline.  */
7403       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7404         c = '\n';
           /* End-of-line conversion: for `dos' emit a '\r' in front of
              the newline; for any other non-unix type replace the
              newline by '\r'.  */
7405       if (! EQ (eol_type, Qunix))
7406         {
7407           if (c == '\n')
7408             {
7409               if (EQ (eol_type, Qdos))
7410                 *buf++ = '\r';
7411               else
7412                 c = '\r';
7413             }
7414         }
7415
7416       trans = Qnil;
7417       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7418       if (NILP (trans))
7419         *buf++ = c;
7420       else
7421         {
7422           int from_nchars = 1, to_nchars = 1;
7423           int *lookup_buf_end;
7424           const unsigned char *p = src;
7425           int i;
7426
               /* Collect C plus up to MAX_LOOKUP - 1 following
                  characters (without consuming them) as the key for
                  get_translation.  */
7427           lookup_buf[0] = c;
7428           for (i = 1; i < max_lookup && p < src_end; i++)
7429             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7430           lookup_buf_end = lookup_buf + i;
7431           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* An integer result is a single replacement character.
                  A cons is (FROM . TO): FROM is a vector whose length is
                  the number of source characters covered, TO is either
                  an integer or a vector of replacement characters.  Any
                  other result stops the loop.  */
7432           if (INTEGERP (trans))
7433             c = XINT (trans);
7434           else if (CONSP (trans))
7435             {
7436               from_nchars = ASIZE (XCAR (trans));
7437               trans = XCDR (trans);
7438               if (INTEGERP (trans))
7439                 c = XINT (trans);
7440               else
7441                 {
7442                   to_nchars = ASIZE (trans);
                       /* Stop if the replacement does not fit into the
                          remaining work buffer.  */
7443                   if (buf + to_nchars > buf_end)
7444                     break;
7445                   c = XINT (AREF (trans, 0));
7446                 }
7447             }
7448           else
7449             break;
7450           *buf++ = c;
7451           for (i = 1; i < to_nchars; i++)
7452             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining FROM_NCHARS - 1 source characters
                  that the translation has covered.  */
7453           for (i = 1; i < from_nchars; i++, pos++)
7454             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7455         }
7456     }
7457
       /* Publish how far we got in the source and how much of the work
          buffer is filled.  */
7458   coding->consumed = src - coding->source;
7459   coding->consumed_char = pos - coding->src_pos;
7460   coding->charbuf_used = buf - coding->charbuf;
7461   coding->chars_at_source = 0;
7462 }
7463
7464
7465 /* Encode the text at CODING->src_object into CODING->dst_object.
7466    CODING->src_object is a buffer or a string.
7467    CODING->dst_object is a buffer or nil.
7468
7469    If CODING->src_object is a buffer, it must be the current buffer.
7470    In this case, if CODING->src_pos is positive, it is a position of
7471    the source text in the buffer, otherwise. the source text is in the
7472    gap area of the buffer, and coding->src_pos specifies the offset of
7473    the text from GPT (which must be the same as PT).  If this is the
7474    same buffer as CODING->dst_object, CODING->src_pos must be
7475    negative and CODING should not have `pre-write-conversion'.
7476
7477    If CODING->src_object is a string, CODING should not have
7478    `pre-write-conversion'.
7479
7480    If CODING->dst_object is a buffer, the encoded data is inserted at
7481    the current point of that buffer.
7482
7483    If CODING->dst_object is nil, the encoded data is placed at the
7484    memory area specified by CODING->destination.  */
7485
7486 static int
7487 encode_coding (struct coding_system *coding)
7488 {
7489   Lisp_Object attrs;
7490   Lisp_Object translation_table;
7491   int max_lookup;
7492   struct ccl_spec cclspec;
7493
7494   attrs = CODING_ID_ATTRS (coding->id);
7495   if (coding->encoder == encode_coding_raw_text)
7496     translation_table = Qnil, max_lookup = 0;
7497   else
7498     translation_table = get_translation_table (attrs, 1, &max_lookup);
7499
7500   if (BUFFERP (coding->dst_object))
7501     {
7502       set_buffer_internal (XBUFFER (coding->dst_object));
7503       coding->dst_multibyte
7504         = ! NILP (current_buffer->enable_multibyte_characters);
7505     }
7506
7507   coding->consumed = coding->consumed_char = 0;
7508   coding->produced = coding->produced_char = 0;
7509   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7510   coding->errors = 0;
7511
7512   ALLOC_CONVERSION_WORK_AREA (coding);
7513
7514   if (coding->encoder == encode_coding_ccl)
7515     {
7516       coding->spec.ccl = &cclspec;
7517       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7518     }
7519   do {
7520     coding_set_source (coding);
7521     consume_chars (coding, translation_table, max_lookup);
7522     coding_set_destination (coding);
7523     (*(coding->encoder)) (coding);
7524   } while (coding->consumed_char < coding->src_chars);
7525
7526   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7527     insert_from_gap (coding->produced_char, coding->produced);
7528
7529   return (coding->result);
7530 }
7531
7532
7533 /* Name (or base name) of work buffer for code conversion.  */
7534 static Lisp_Object Vcode_conversion_workbuf_name;
7535
7536 /* A working buffer used by the top level conversion.  Once it is
7537    created, it is never destroyed by the conversion code itself (if
        it is found not to be live, make_conversion_work_buffer creates
        it again).  It has the name
7538    Vcode_conversion_workbuf_name.  The other working buffers are
7539    destroyed after the use is finished, and their names are modified
7540    versions of Vcode_conversion_workbuf_name.  */
7541 static Lisp_Object Vcode_conversion_reused_workbuf;
7542
7543 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        This is a counter rather than a 0/1 flag:
        make_conversion_work_buffer increments it on every call, and
        code_conversion_restore resets it to 0 when the reused buffer
        is released.  */
7544 static int reused_workbuf_in_use;
7545
7546
7547 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7548    multibyteness of returning buffer.  */
7549
7550 static Lisp_Object
7551 make_conversion_work_buffer (int multibyte)
7552 {
7553   Lisp_Object name, workbuf;
7554   struct buffer *current;
7555
7556   if (reused_workbuf_in_use++)
7557     {
7558       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7559       workbuf = Fget_buffer_create (name);
7560     }
7561   else
7562     {
7563       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7564         Vcode_conversion_reused_workbuf
7565           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7566       workbuf = Vcode_conversion_reused_workbuf;
7567     }
7568   current = current_buffer;
7569   set_buffer_internal (XBUFFER (workbuf));
7570   /* We can't allow modification hooks to run in the work buffer.  For
7571      instance, directory_files_internal assumes that file decoding
7572      doesn't compile new regexps.  */
7573   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7574   Ferase_buffer ();
7575   current_buffer->undo_list = Qt;
7576   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7577   set_buffer_internal (current);
7578   return workbuf;
7579 }
7580
7581
7582 static Lisp_Object
7583 code_conversion_restore (Lisp_Object arg)
7584 {
7585   Lisp_Object current, workbuf;
7586   struct gcpro gcpro1;
7587
7588   GCPRO1 (arg);
7589   current = XCAR (arg);
7590   workbuf = XCDR (arg);
7591   if (! NILP (workbuf))
7592     {
7593       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7594         reused_workbuf_in_use = 0;
7595       else if (! NILP (Fbuffer_live_p (workbuf)))
7596         Fkill_buffer (workbuf);
7597     }
7598   set_buffer_internal (XBUFFER (current));
7599   UNGCPRO;
7600   return Qnil;
7601 }
7602
7603 Lisp_Object
7604 code_conversion_save (int with_work_buf, int multibyte)
7605 {
7606   Lisp_Object workbuf = Qnil;
7607
7608   if (with_work_buf)
7609     workbuf = make_conversion_work_buffer (multibyte);
7610   record_unwind_protect (code_conversion_restore,
7611                          Fcons (Fcurrent_buffer (), workbuf));
7612   return workbuf;
7613 }
7614
7615 int
7616 decode_coding_gap (struct coding_system *coding,
7617                    EMACS_INT chars, EMACS_INT bytes)
7618 {
7619   int count = SPECPDL_INDEX ();
7620   Lisp_Object attrs;
7621
7622   code_conversion_save (0, 0);
7623
7624   coding->src_object = Fcurrent_buffer ();
7625   coding->src_chars = chars;
7626   coding->src_bytes = bytes;
7627   coding->src_pos = -chars;
7628   coding->src_pos_byte = -bytes;
7629   coding->src_multibyte = chars < bytes;
7630   coding->dst_object = coding->src_object;
7631   coding->dst_pos = PT;
7632   coding->dst_pos_byte = PT_BYTE;
7633   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7634
7635   if (CODING_REQUIRE_DETECTION (coding))
7636     detect_coding (coding);
7637
7638   coding->mode |= CODING_MODE_LAST_BLOCK;
7639   current_buffer->text->inhibit_shrinking = 1;
7640   decode_coding (coding);
7641   current_buffer->text->inhibit_shrinking = 0;
7642
7643   attrs = CODING_ID_ATTRS (coding->id);
7644   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7645     {
7646       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7647       Lisp_Object val;
7648
7649       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7650       val = call1 (CODING_ATTR_POST_READ (attrs),
7651                    make_number (coding->produced_char));
7652       CHECK_NATNUM (val);
7653       coding->produced_char += Z - prev_Z;
7654       coding->produced += Z_BYTE - prev_Z_BYTE;
7655     }
7656
7657   unbind_to (count, Qnil);
7658   return coding->result;
7659 }
7660
7661 int
7662 encode_coding_gap (struct coding_system *coding,
7663                    EMACS_INT chars, EMACS_INT bytes)
7664 {
7665   int count = SPECPDL_INDEX ();
7666
7667   code_conversion_save (0, 0);
7668
7669   coding->src_object = Fcurrent_buffer ();
7670   coding->src_chars = chars;
7671   coding->src_bytes = bytes;
7672   coding->src_pos = -chars;
7673   coding->src_pos_byte = -bytes;
7674   coding->src_multibyte = chars < bytes;
7675   coding->dst_object = coding->src_object;
7676   coding->dst_pos = PT;
7677   coding->dst_pos_byte = PT_BYTE;
7678
7679   encode_coding (coding);
7680
7681   unbind_to (count, Qnil);
7682   return coding->result;
7683 }
7684
7685
7686 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7687    SRC_OBJECT into DST_OBJECT by coding context CODING.
7688
7689    SRC_OBJECT is a buffer, a string, or Qnil.
7690
7691    If it is a buffer, the text is at point of the buffer.  FROM and TO
7692    are positions in the buffer.
7693
7694    If it is a string, the text is at the beginning of the string.
7695    FROM and TO are indices to the string.
7696
7697    If it is nil, the text is at coding->source.  FROM and TO are
7698    indices to coding->source.
7699
7700    DST_OBJECT is a buffer, Qt, or Qnil.
7701
7702    If it is a buffer, the decoded text is inserted at point of the
7703    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7704    is deleted.
7705
7706    If it is Qt, a string is made from the decoded text, and
7707    set in CODING->dst_object.
7708
7709    If it is Qnil, the decoded text is stored at CODING->destination.
7710    The caller must allocate CODING->dst_bytes bytes at
7711    CODING->destination by xmalloc.  If the decoded text is longer than
7712    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7713  */
7714
7715 void
7716 decode_coding_object (struct coding_system *coding,
7717                       Lisp_Object src_object,
7718                       EMACS_INT from, EMACS_INT from_byte,
7719                       EMACS_INT to, EMACS_INT to_byte,
7720                       Lisp_Object dst_object)
7721 {
7722   int count = SPECPDL_INDEX ();
7723   unsigned char *destination;
7724   EMACS_INT dst_bytes;
7725   EMACS_INT chars = to - from;
7726   EMACS_INT bytes = to_byte - from_byte;
7727   Lisp_Object attrs;
7728   int saved_pt = -1, saved_pt_byte;
7729   int need_marker_adjustment = 0;
7730   Lisp_Object old_deactivate_mark;
7731
7732   old_deactivate_mark = Vdeactivate_mark;
7733
7734   if (NILP (dst_object))
7735     {
7736       destination = coding->destination;
7737       dst_bytes = coding->dst_bytes;
7738     }
7739
7740   coding->src_object = src_object;
7741   coding->src_chars = chars;
7742   coding->src_bytes = bytes;
7743   coding->src_multibyte = chars < bytes;
7744
7745   if (STRINGP (src_object))
7746     {
7747       coding->src_pos = from;
7748       coding->src_pos_byte = from_byte;
7749     }
7750   else if (BUFFERP (src_object))
7751     {
7752       set_buffer_internal (XBUFFER (src_object));
7753       if (from != GPT)
7754         move_gap_both (from, from_byte);
7755       if (EQ (src_object, dst_object))
7756         {
7757           struct Lisp_Marker *tail;
7758
7759           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7760             {
7761               tail->need_adjustment
7762                 = tail->charpos == (tail->insertion_type ? from : to);
7763               need_marker_adjustment |= tail->need_adjustment;
7764             }
7765           saved_pt = PT, saved_pt_byte = PT_BYTE;
7766           TEMP_SET_PT_BOTH (from, from_byte);
7767           current_buffer->text->inhibit_shrinking = 1;
7768           del_range_both (from, from_byte, to, to_byte, 1);
7769           coding->src_pos = -chars;
7770           coding->src_pos_byte = -bytes;
7771         }
7772       else
7773         {
7774           coding->src_pos = from;
7775           coding->src_pos_byte = from_byte;
7776         }
7777     }
7778
7779   if (CODING_REQUIRE_DETECTION (coding))
7780     detect_coding (coding);
7781   attrs = CODING_ID_ATTRS (coding->id);
7782
7783   if (EQ (dst_object, Qt)
7784       || (! NILP (CODING_ATTR_POST_READ (attrs))
7785           && NILP (dst_object)))
7786     {
7787       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7788       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7789       coding->dst_pos = BEG;
7790       coding->dst_pos_byte = BEG_BYTE;
7791     }
7792   else if (BUFFERP (dst_object))
7793     {
7794       code_conversion_save (0, 0);
7795       coding->dst_object = dst_object;
7796       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7797       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7798       coding->dst_multibyte
7799         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7800     }
7801   else
7802     {
7803       code_conversion_save (0, 0);
7804       coding->dst_object = Qnil;
7805       /* Most callers presume this will return a multibyte result, and they
7806          won't use `binary' or `raw-text' anyway, so let's not worry about
7807          CODING_FOR_UNIBYTE.  */
7808       coding->dst_multibyte = 1;
7809     }
7810
7811   decode_coding (coding);
7812
7813   if (BUFFERP (coding->dst_object))
7814     set_buffer_internal (XBUFFER (coding->dst_object));
7815
7816   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7817     {
7818       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7819       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7820       Lisp_Object val;
7821
7822       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7823       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7824               old_deactivate_mark);
7825       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7826                         make_number (coding->produced_char));
7827       UNGCPRO;
7828       CHECK_NATNUM (val);
7829       coding->produced_char += Z - prev_Z;
7830       coding->produced += Z_BYTE - prev_Z_BYTE;
7831     }
7832
7833   if (EQ (dst_object, Qt))
7834     {
7835       coding->dst_object = Fbuffer_string ();
7836     }
7837   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7838     {
7839       set_buffer_internal (XBUFFER (coding->dst_object));
7840       if (dst_bytes < coding->produced)
7841         {
7842           destination = xrealloc (destination, coding->produced);
7843           if (! destination)
7844             {
7845               record_conversion_result (coding,
7846                                         CODING_RESULT_INSUFFICIENT_MEM);
7847               unbind_to (count, Qnil);
7848               return;
7849             }
7850           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7851             move_gap_both (BEGV, BEGV_BYTE);
7852           memcpy (destination, BEGV_ADDR, coding->produced);
7853           coding->destination = destination;
7854         }
7855     }
7856
7857   if (saved_pt >= 0)
7858     {
7859       /* This is the case of:
7860          (BUFFERP (src_object) && EQ (src_object, dst_object))
7861          As we have moved PT while replacing the original buffer
7862          contents, we must recover it now.  */
7863       set_buffer_internal (XBUFFER (src_object));
7864       current_buffer->text->inhibit_shrinking = 0;
7865       if (saved_pt < from)
7866         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7867       else if (saved_pt < from + chars)
7868         TEMP_SET_PT_BOTH (from, from_byte);
7869       else if (! NILP (current_buffer->enable_multibyte_characters))
7870         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7871                           saved_pt_byte + (coding->produced - bytes));
7872       else
7873         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7874                           saved_pt_byte + (coding->produced - bytes));
7875
7876       if (need_marker_adjustment)
7877         {
7878           struct Lisp_Marker *tail;
7879
7880           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7881             if (tail->need_adjustment)
7882               {
7883                 tail->need_adjustment = 0;
7884                 if (tail->insertion_type)
7885                   {
7886                     tail->bytepos = from_byte;
7887                     tail->charpos = from;
7888                   }
7889                 else
7890                   {
7891                     tail->bytepos = from_byte + coding->produced;
7892                     tail->charpos
7893                       = (NILP (current_buffer->enable_multibyte_characters)
7894                          ? tail->bytepos : from + coding->produced_char);
7895                   }
7896               }
7897         }
7898     }
7899
7900   Vdeactivate_mark = old_deactivate_mark;
7901   unbind_to (count, coding->dst_object);
7902 }
7903
7904
7905 void
7906 encode_coding_object (struct coding_system *coding,
7907                       Lisp_Object src_object,
7908                       EMACS_INT from, EMACS_INT from_byte,
7909                       EMACS_INT to, EMACS_INT to_byte,
7910                       Lisp_Object dst_object)
7911 {
7912   int count = SPECPDL_INDEX ();
7913   EMACS_INT chars = to - from;
7914   EMACS_INT bytes = to_byte - from_byte;
7915   Lisp_Object attrs;
7916   int saved_pt = -1, saved_pt_byte;
7917   int need_marker_adjustment = 0;
7918   int kill_src_buffer = 0;
7919   Lisp_Object old_deactivate_mark;
7920
7921   old_deactivate_mark = Vdeactivate_mark;
7922
7923   coding->src_object = src_object;
7924   coding->src_chars = chars;
7925   coding->src_bytes = bytes;
7926   coding->src_multibyte = chars < bytes;
7927
7928   attrs = CODING_ID_ATTRS (coding->id);
7929
7930   if (EQ (src_object, dst_object))
7931     {
7932       struct Lisp_Marker *tail;
7933
7934       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7935         {
7936           tail->need_adjustment
7937             = tail->charpos == (tail->insertion_type ? from : to);
7938           need_marker_adjustment |= tail->need_adjustment;
7939         }
7940     }
7941
7942   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7943     {
7944       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7945       set_buffer_internal (XBUFFER (coding->src_object));
7946       if (STRINGP (src_object))
7947         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7948       else if (BUFFERP (src_object))
7949         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7950       else
7951         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7952
7953       if (EQ (src_object, dst_object))
7954         {
7955           set_buffer_internal (XBUFFER (src_object));
7956           saved_pt = PT, saved_pt_byte = PT_BYTE;
7957           del_range_both (from, from_byte, to, to_byte, 1);
7958           set_buffer_internal (XBUFFER (coding->src_object));
7959         }
7960
7961       {
7962         Lisp_Object args[3];
7963         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7964
7965         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7966                 old_deactivate_mark);
7967         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7968         args[1] = make_number (BEG);
7969         args[2] = make_number (Z);
7970         safe_call (3, args);
7971         UNGCPRO;
7972       }
7973       if (XBUFFER (coding->src_object) != current_buffer)
7974         kill_src_buffer = 1;
7975       coding->src_object = Fcurrent_buffer ();
7976       if (BEG != GPT)
7977         move_gap_both (BEG, BEG_BYTE);
7978       coding->src_chars = Z - BEG;
7979       coding->src_bytes = Z_BYTE - BEG_BYTE;
7980       coding->src_pos = BEG;
7981       coding->src_pos_byte = BEG_BYTE;
7982       coding->src_multibyte = Z < Z_BYTE;
7983     }
7984   else if (STRINGP (src_object))
7985     {
7986       code_conversion_save (0, 0);
7987       coding->src_pos = from;
7988       coding->src_pos_byte = from_byte;
7989     }
7990   else if (BUFFERP (src_object))
7991     {
7992       code_conversion_save (0, 0);
7993       set_buffer_internal (XBUFFER (src_object));
7994       if (EQ (src_object, dst_object))
7995         {
7996           saved_pt = PT, saved_pt_byte = PT_BYTE;
7997           coding->src_object = del_range_1 (from, to, 1, 1);
7998           coding->src_pos = 0;
7999           coding->src_pos_byte = 0;
8000         }
8001       else
8002         {
8003           if (from < GPT && to >= GPT)
8004             move_gap_both (from, from_byte);
8005           coding->src_pos = from;
8006           coding->src_pos_byte = from_byte;
8007         }
8008     }
8009   else
8010     code_conversion_save (0, 0);
8011
8012   if (BUFFERP (dst_object))
8013     {
8014       coding->dst_object = dst_object;
8015       if (EQ (src_object, dst_object))
8016         {
8017           coding->dst_pos = from;
8018           coding->dst_pos_byte = from_byte;
8019         }
8020       else
8021         {
8022           struct buffer *current = current_buffer;
8023
8024           set_buffer_temp (XBUFFER (dst_object));
8025           coding->dst_pos = PT;
8026           coding->dst_pos_byte = PT_BYTE;
8027           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8028           set_buffer_temp (current);
8029         }
8030       coding->dst_multibyte
8031         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8032     }
8033   else if (EQ (dst_object, Qt))
8034     {
8035       coding->dst_object = Qnil;
8036       coding->dst_bytes = coding->src_chars;
8037       if (coding->dst_bytes == 0)
8038         coding->dst_bytes = 1;
8039       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8040       coding->dst_multibyte = 0;
8041     }
8042   else
8043     {
8044       coding->dst_object = Qnil;
8045       coding->dst_multibyte = 0;
8046     }
8047
8048   encode_coding (coding);
8049
8050   if (EQ (dst_object, Qt))
8051     {
8052       if (BUFFERP (coding->dst_object))
8053         coding->dst_object = Fbuffer_string ();
8054       else
8055         {
8056           coding->dst_object
8057             = make_unibyte_string ((char *) coding->destination,
8058                                    coding->produced);
8059           xfree (coding->destination);
8060         }
8061     }
8062
8063   if (saved_pt >= 0)
8064     {
8065       /* This is the case of:
8066          (BUFFERP (src_object) && EQ (src_object, dst_object))
8067          As we have moved PT while replacing the original buffer
8068          contents, we must recover it now.  */
8069       set_buffer_internal (XBUFFER (src_object));
8070       if (saved_pt < from)
8071         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8072       else if (saved_pt < from + chars)
8073         TEMP_SET_PT_BOTH (from, from_byte);
8074       else if (! NILP (current_buffer->enable_multibyte_characters))
8075         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8076                           saved_pt_byte + (coding->produced - bytes));
8077       else
8078         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8079                           saved_pt_byte + (coding->produced - bytes));
8080
8081       if (need_marker_adjustment)
8082         {
8083           struct Lisp_Marker *tail;
8084
8085           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8086             if (tail->need_adjustment)
8087               {
8088                 tail->need_adjustment = 0;
8089                 if (tail->insertion_type)
8090                   {
8091                     tail->bytepos = from_byte;
8092                     tail->charpos = from;
8093                   }
8094                 else
8095                   {
8096                     tail->bytepos = from_byte + coding->produced;
8097                     tail->charpos
8098                       = (NILP (current_buffer->enable_multibyte_characters)
8099                          ? tail->bytepos : from + coding->produced_char);
8100                   }
8101               }
8102         }
8103     }
8104
8105   if (kill_src_buffer)
8106     Fkill_buffer (coding->src_object);
8107
8108   Vdeactivate_mark = old_deactivate_mark;
8109   unbind_to (count, Qnil);
8110 }
8111
8112
8113 Lisp_Object
8114 preferred_coding_system (void)
8115 {
8116   int id = coding_categories[coding_priorities[0]].id;
8117
8118   return CODING_ID_NAME (id);
8119 }
8120
8121 \f
8122 #ifdef emacs
8123 /*** 8. Emacs Lisp library functions ***/
8124
8125 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8126        doc: /* Return t if OBJECT is nil or a coding-system.
8127 See the documentation of `define-coding-system' for information
8128 about coding-system objects.  */)
8129   (Lisp_Object object)
8130 {
8131   if (NILP (object)
8132       || CODING_SYSTEM_ID (object) >= 0)
8133     return Qt;
8134   if (! SYMBOLP (object)
8135       || NILP (Fget (object, Qcoding_system_define_form)))
8136     return Qnil;
8137   return Qt;
8138 }
8139
8140 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8141        Sread_non_nil_coding_system, 1, 1, 0,
8142        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8143   (Lisp_Object prompt)
8144 {
8145   Lisp_Object val;
8146   do
8147     {
8148       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8149                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8150     }
8151   while (SCHARS (val) == 0);
8152   return (Fintern (val, Qnil));
8153 }
8154
8155 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8156        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8157 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8158 Ignores case when completing coding systems (all Emacs coding systems
8159 are lower-case).  */)
8160   (Lisp_Object prompt, Lisp_Object default_coding_system)
8161 {
8162   Lisp_Object val;
8163   int count = SPECPDL_INDEX ();
8164
8165   if (SYMBOLP (default_coding_system))
8166     default_coding_system = SYMBOL_NAME (default_coding_system);
8167   specbind (Qcompletion_ignore_case, Qt);
8168   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8169                           Qt, Qnil, Qcoding_system_history,
8170                           default_coding_system, Qnil);
8171   unbind_to (count, Qnil);
8172   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8173 }
8174
8175 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8176        1, 1, 0,
8177        doc: /* Check validity of CODING-SYSTEM.
8178 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8179 It is valid if it is nil or a symbol defined as a coding system by the
8180 function `define-coding-system'.  */)
8181   (Lisp_Object coding_system)
8182 {
8183   Lisp_Object define_form;
8184
8185   define_form = Fget (coding_system, Qcoding_system_define_form);
8186   if (! NILP (define_form))
8187     {
8188       Fput (coding_system, Qcoding_system_define_form, Qnil);
8189       safe_eval (define_form);
8190     }
8191   if (!NILP (Fcoding_system_p (coding_system)))
8192     return coding_system;
8193   xsignal1 (Qcoding_system_error, coding_system);
8194 }
8195
8196 \f
8197 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8198    HIGHEST is nonzero, return the coding system of the highest
8199    priority among the detected coding systems.  Otherwise return a
8200    list of detected coding systems sorted by their priorities.  If
8201    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8202    multibyte form but contains only ASCII and eight-bit chars.
8203    Otherwise, the bytes are raw bytes.
8204
8205    CODING-SYSTEM controls the detection as below:
8206
8207    If it is nil, detect both text-format and eol-format.  If the
8208    text-format part of CODING-SYSTEM is already specified
8209    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8210    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8211    detect only text-format.  */
8212
8213 Lisp_Object
8214 detect_coding_system (const unsigned char *src,
8215                       EMACS_INT src_chars, EMACS_INT src_bytes,
8216                       int highest, int multibytep,
8217                       Lisp_Object coding_system)
8218 {
8219   const unsigned char *src_end = src + src_bytes;
8220   Lisp_Object attrs, eol_type;
8221   Lisp_Object val = Qnil;
8222   struct coding_system coding;
8223   int id;
8224   struct coding_detection_info detect_info;
8225   enum coding_category base_category;
8226   int null_byte_found = 0, eight_bit_found = 0;
8227
8228   if (NILP (coding_system))
8229     coding_system = Qundecided;
8230   setup_coding_system (coding_system, &coding);
8231   attrs = CODING_ID_ATTRS (coding.id);
8232   eol_type = CODING_ID_EOL_TYPE (coding.id);
8233   coding_system = CODING_ATTR_BASE_NAME (attrs);
8234
8235   coding.source = src;
8236   coding.src_chars = src_chars;
8237   coding.src_bytes = src_bytes;
8238   coding.src_multibyte = multibytep;
8239   coding.consumed = 0;
8240   coding.mode |= CODING_MODE_LAST_BLOCK;
8241   coding.head_ascii = 0;
8242
8243   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8244
8245   /* At first, detect text-format if necessary.  */
8246   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8247   if (base_category == coding_category_undecided)
8248     {
8249       enum coding_category category;
8250       struct coding_system *this;
8251       int c, i;
8252
8253       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8254       for (; src < src_end; src++)
8255         {
8256           c = *src;
8257           if (c & 0x80)
8258             {
8259               eight_bit_found = 1;
8260               if (null_byte_found)
8261                 break;
8262             }
8263           else if (c < 0x20)
8264             {
8265               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8266                   && ! inhibit_iso_escape_detection
8267                   && ! detect_info.checked)
8268                 {
8269                   if (detect_coding_iso_2022 (&coding, &detect_info))
8270                     {
8271                       /* We have scanned the whole data.  */
8272                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8273                         {
8274                           /* We didn't find an 8-bit code.  We may
8275                              have found a null-byte, but it's very
8276                              rare that a binary file confirm to
8277                              ISO-2022.  */
8278                           src = src_end;
8279                           coding.head_ascii = src - coding.source;
8280                         }
8281                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8282                       break;
8283                     }
8284                 }
8285               else if (! c && !inhibit_null_byte_detection)
8286                 {
8287                   null_byte_found = 1;
8288                   if (eight_bit_found)
8289                     break;
8290                 }
8291               if (! eight_bit_found)
8292                 coding.head_ascii++;
8293             }
8294           else if (! eight_bit_found)
8295             coding.head_ascii++;
8296         }
8297
8298       if (null_byte_found || eight_bit_found
8299           || coding.head_ascii < coding.src_bytes
8300           || detect_info.found)
8301         {
8302           if (coding.head_ascii == coding.src_bytes)
8303             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8304             for (i = 0; i < coding_category_raw_text; i++)
8305               {
8306                 category = coding_priorities[i];
8307                 this = coding_categories + category;
8308                 if (detect_info.found & (1 << category))
8309                   break;
8310               }
8311           else
8312             {
8313               if (null_byte_found)
8314                 {
8315                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8316                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8317                 }
8318               for (i = 0; i < coding_category_raw_text; i++)
8319                 {
8320                   category = coding_priorities[i];
8321                   this = coding_categories + category;
8322
8323                   if (this->id < 0)
8324                     {
8325                       /* No coding system of this category is defined.  */
8326                       detect_info.rejected |= (1 << category);
8327                     }
8328                   else if (category >= coding_category_raw_text)
8329                     continue;
8330                   else if (detect_info.checked & (1 << category))
8331                     {
8332                       if (highest
8333                           && (detect_info.found & (1 << category)))
8334                         break;
8335                     }
8336                   else if ((*(this->detector)) (&coding, &detect_info)
8337                            && highest
8338                            && (detect_info.found & (1 << category)))
8339                     {
8340                       if (category == coding_category_utf_16_auto)
8341                         {
8342                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8343                             category = coding_category_utf_16_le;
8344                           else
8345                             category = coding_category_utf_16_be;
8346                         }
8347                       break;
8348                     }
8349                 }
8350             }
8351         }
8352
8353       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8354           || null_byte_found)
8355         {
8356           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8357           id = CODING_SYSTEM_ID (Qno_conversion);
8358           val = Fcons (make_number (id), Qnil);
8359         }
8360       else if (! detect_info.rejected && ! detect_info.found)
8361         {
8362           detect_info.found = CATEGORY_MASK_ANY;
8363           id = coding_categories[coding_category_undecided].id;
8364           val = Fcons (make_number (id), Qnil);
8365         }
8366       else if (highest)
8367         {
8368           if (detect_info.found)
8369             {
8370               detect_info.found = 1 << category;
8371               val = Fcons (make_number (this->id), Qnil);
8372             }
8373           else
8374             for (i = 0; i < coding_category_raw_text; i++)
8375               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8376                 {
8377                   detect_info.found = 1 << coding_priorities[i];
8378                   id = coding_categories[coding_priorities[i]].id;
8379                   val = Fcons (make_number (id), Qnil);
8380                   break;
8381                 }
8382         }
8383       else
8384         {
8385           int mask = detect_info.rejected | detect_info.found;
8386           int found = 0;
8387
8388           for (i = coding_category_raw_text - 1; i >= 0; i--)
8389             {
8390               category = coding_priorities[i];
8391               if (! (mask & (1 << category)))
8392                 {
8393                   found |= 1 << category;
8394                   id = coding_categories[category].id;
8395                   if (id >= 0)
8396                     val = Fcons (make_number (id), val);
8397                 }
8398             }
8399           for (i = coding_category_raw_text - 1; i >= 0; i--)
8400             {
8401               category = coding_priorities[i];
8402               if (detect_info.found & (1 << category))
8403                 {
8404                   id = coding_categories[category].id;
8405                   val = Fcons (make_number (id), val);
8406                 }
8407             }
8408           detect_info.found |= found;
8409         }
8410     }
8411   else if (base_category == coding_category_utf_8_auto)
8412     {
8413       if (detect_coding_utf_8 (&coding, &detect_info))
8414         {
8415           struct coding_system *this;
8416
8417           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8418             this = coding_categories + coding_category_utf_8_sig;
8419           else
8420             this = coding_categories + coding_category_utf_8_nosig;
8421           val = Fcons (make_number (this->id), Qnil);
8422         }
8423     }
8424   else if (base_category == coding_category_utf_16_auto)
8425     {
8426       if (detect_coding_utf_16 (&coding, &detect_info))
8427         {
8428           struct coding_system *this;
8429
8430           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8431             this = coding_categories + coding_category_utf_16_le;
8432           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8433             this = coding_categories + coding_category_utf_16_be;
8434           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8435             this = coding_categories + coding_category_utf_16_be_nosig;
8436           else
8437             this = coding_categories + coding_category_utf_16_le_nosig;
8438           val = Fcons (make_number (this->id), Qnil);
8439         }
8440     }
8441   else
8442     {
8443       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8444       val = Fcons (make_number (coding.id), Qnil);
8445     }
8446
8447   /* Then, detect eol-format if necessary.  */
8448   {
8449     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8450     Lisp_Object tail;
8451
8452     if (VECTORP (eol_type))
8453       {
8454         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8455           {
8456             if (null_byte_found)
8457               normal_eol = EOL_SEEN_LF;
8458             else
8459               normal_eol = detect_eol (coding.source, src_bytes,
8460                                        coding_category_raw_text);
8461           }
8462         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8463                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8464           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8465                                       coding_category_utf_16_be);
8466         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8467                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8468           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8469                                       coding_category_utf_16_le);
8470       }
8471     else
8472       {
8473         if (EQ (eol_type, Qunix))
8474           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8475         else if (EQ (eol_type, Qdos))
8476           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8477         else
8478           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8479       }
8480
8481     for (tail = val; CONSP (tail); tail = XCDR (tail))
8482       {
8483         enum coding_category category;
8484         int this_eol;
8485
8486         id = XINT (XCAR (tail));
8487         attrs = CODING_ID_ATTRS (id);
8488         category = XINT (CODING_ATTR_CATEGORY (attrs));
8489         eol_type = CODING_ID_EOL_TYPE (id);
8490         if (VECTORP (eol_type))
8491           {
8492             if (category == coding_category_utf_16_be
8493                 || category == coding_category_utf_16_be_nosig)
8494               this_eol = utf_16_be_eol;
8495             else if (category == coding_category_utf_16_le
8496                      || category == coding_category_utf_16_le_nosig)
8497               this_eol = utf_16_le_eol;
8498             else
8499               this_eol = normal_eol;
8500
8501             if (this_eol == EOL_SEEN_LF)
8502               XSETCAR (tail, AREF (eol_type, 0));
8503             else if (this_eol == EOL_SEEN_CRLF)
8504               XSETCAR (tail, AREF (eol_type, 1));
8505             else if (this_eol == EOL_SEEN_CR)
8506               XSETCAR (tail, AREF (eol_type, 2));
8507             else
8508               XSETCAR (tail, CODING_ID_NAME (id));
8509           }
8510         else
8511           XSETCAR (tail, CODING_ID_NAME (id));
8512       }
8513   }
8514
8515   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8516 }
8517
8518
8519 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8520        2, 3, 0,
8521        doc: /* Detect coding system of the text in the region between START and END.
8522 Return a list of possible coding systems ordered by priority.
8523 The coding systems to try and their priorities follows what
8524 the function `coding-system-priority-list' (which see) returns.
8525
8526 If only ASCII characters are found (except for such ISO-2022 control
8527 characters as ESC), it returns a list of single element `undecided'
8528 or its subsidiary coding system according to a detected end-of-line
8529 format.
8530
8531 If optional argument HIGHEST is non-nil, return the coding system of
8532 highest priority.  */)
8533   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8534 {
8535   int from, to;
8536   int from_byte, to_byte;
8537
8538   CHECK_NUMBER_COERCE_MARKER (start);
8539   CHECK_NUMBER_COERCE_MARKER (end);
8540
8541   validate_region (&start, &end);
8542   from = XINT (start), to = XINT (end);
8543   from_byte = CHAR_TO_BYTE (from);
8544   to_byte = CHAR_TO_BYTE (to);
8545
8546   if (from < GPT && to >= GPT)
8547     move_gap_both (to, to_byte);
8548
8549   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8550                                to - from, to_byte - from_byte,
8551                                !NILP (highest),
8552                                !NILP (current_buffer
8553                                       ->enable_multibyte_characters),
8554                                Qnil);
8555 }
8556
8557 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8558        1, 2, 0,
8559        doc: /* Detect coding system of the text in STRING.
8560 Return a list of possible coding systems ordered by priority.
8561 The coding systems to try and their priorities follows what
8562 the function `coding-system-priority-list' (which see) returns.
8563
8564 If only ASCII characters are found (except for such ISO-2022 control
8565 characters as ESC), it returns a list of single element `undecided'
8566 or its subsidiary coding system according to a detected end-of-line
8567 format.
8568
8569 If optional argument HIGHEST is non-nil, return the coding system of
8570 highest priority.  */)
8571   (Lisp_Object string, Lisp_Object highest)
8572 {
8573   CHECK_STRING (string);
8574
8575   return detect_coding_system (SDATA (string),
8576                                SCHARS (string), SBYTES (string),
8577                                !NILP (highest), STRING_MULTIBYTE (string),
8578                                Qnil);
8579 }
8580
8581
8582 static INLINE int
8583 char_encodable_p (int c, Lisp_Object attrs)
8584 {
8585   Lisp_Object tail;
8586   struct charset *charset;
8587   Lisp_Object translation_table;
8588
8589   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8590   if (! NILP (translation_table))
8591     c = translate_char (translation_table, c);
8592   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8593        CONSP (tail); tail = XCDR (tail))
8594     {
8595       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8596       if (CHAR_CHARSET_P (c, charset))
8597         break;
8598     }
8599   return (! NILP (tail));
8600 }
8601
8602
8603 /* Return a list of coding systems that safely encode the text between
8604    START and END.  If EXCLUDE is non-nil, it is a list of coding
8605    systems not to check.  The returned list doesn't contain any such
8606    coding systems.  In any case, if the text contains only ASCII or is
8607    unibyte, return t.  */
8608
8609 DEFUN ("find-coding-systems-region-internal",
8610        Ffind_coding_systems_region_internal,
8611        Sfind_coding_systems_region_internal, 2, 3, 0,
8612        doc: /* Internal use only.  */)
8613   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8614 {
8615   Lisp_Object coding_attrs_list, safe_codings;
8616   EMACS_INT start_byte, end_byte;
8617   const unsigned char *p, *pbeg, *pend;
8618   int c;
8619   Lisp_Object tail, elt, work_table;
8620
8621   if (STRINGP (start))
8622     {
8623       if (!STRING_MULTIBYTE (start)
8624           || SCHARS (start) == SBYTES (start))
8625         return Qt;
8626       start_byte = 0;
8627       end_byte = SBYTES (start);
8628     }
8629   else
8630     {
8631       CHECK_NUMBER_COERCE_MARKER (start);
8632       CHECK_NUMBER_COERCE_MARKER (end);
8633       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8634         args_out_of_range (start, end);
8635       if (NILP (current_buffer->enable_multibyte_characters))
8636         return Qt;
8637       start_byte = CHAR_TO_BYTE (XINT (start));
8638       end_byte = CHAR_TO_BYTE (XINT (end));
8639       if (XINT (end) - XINT (start) == end_byte - start_byte)
8640         return Qt;
8641
8642       if (XINT (start) < GPT && XINT (end) > GPT)
8643         {
8644           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8645             move_gap_both (XINT (start), start_byte);
8646           else
8647             move_gap_both (XINT (end), end_byte);
8648         }
8649     }
8650
8651   coding_attrs_list = Qnil;
8652   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8653     if (NILP (exclude)
8654         || NILP (Fmemq (XCAR (tail), exclude)))
8655       {
8656         Lisp_Object attrs;
8657
8658         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8659         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8660             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8661           {
8662             ASET (attrs, coding_attr_trans_tbl,
8663                   get_translation_table (attrs, 1, NULL));
8664             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8665           }
8666       }
8667
8668   if (STRINGP (start))
8669     p = pbeg = SDATA (start);
8670   else
8671     p = pbeg = BYTE_POS_ADDR (start_byte);
8672   pend = p + (end_byte - start_byte);
8673
8674   while (p < pend && ASCII_BYTE_P (*p)) p++;
8675   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8676
8677   work_table = Fmake_char_table (Qnil, Qnil);
8678   while (p < pend)
8679     {
8680       if (ASCII_BYTE_P (*p))
8681         p++;
8682       else
8683         {
8684           c = STRING_CHAR_ADVANCE (p);
8685           if (!NILP (char_table_ref (work_table, c)))
8686             /* This character was already checked.  Ignore it.  */
8687             continue;
8688
8689           charset_map_loaded = 0;
8690           for (tail = coding_attrs_list; CONSP (tail);)
8691             {
8692               elt = XCAR (tail);
8693               if (NILP (elt))
8694                 tail = XCDR (tail);
8695               else if (char_encodable_p (c, elt))
8696                 tail = XCDR (tail);
8697               else if (CONSP (XCDR (tail)))
8698                 {
8699                   XSETCAR (tail, XCAR (XCDR (tail)));
8700                   XSETCDR (tail, XCDR (XCDR (tail)));
8701                 }
8702               else
8703                 {
8704                   XSETCAR (tail, Qnil);
8705                   tail = XCDR (tail);
8706                 }
8707             }
8708           if (charset_map_loaded)
8709             {
8710               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8711
8712               if (STRINGP (start))
8713                 pbeg = SDATA (start);
8714               else
8715                 pbeg = BYTE_POS_ADDR (start_byte);
8716               p = pbeg + p_offset;
8717               pend = pbeg + pend_offset;
8718             }
8719           char_table_set (work_table, c, Qt);
8720         }
8721     }
8722
8723   safe_codings = list2 (Qraw_text, Qno_conversion);
8724   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8725     if (! NILP (XCAR (tail)))
8726       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8727
8728   return safe_codings;
8729 }
8730
8731
8732 DEFUN ("unencodable-char-position", Funencodable_char_position,
8733        Sunencodable_char_position, 3, 5, 0,
8734        doc: /*
8735 Return position of first un-encodable character in a region.
8736 START and END specify the region and CODING-SYSTEM specifies the
8737 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8738
8739 If optional 4th argument COUNT is non-nil, it specifies at most how
8740 many un-encodable characters to search.  In this case, the value is a
8741 list of positions.
8742
8743 If optional 5th argument STRING is non-nil, it is a string to search
8744 for un-encodable characters.  In that case, START and END are indexes
8745 to the string.  */)
8746   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8747 {
8748   int n;
8749   struct coding_system coding;
8750   Lisp_Object attrs, charset_list, translation_table;
8751   Lisp_Object positions;
8752   int from, to;
8753   const unsigned char *p, *stop, *pend;
8754   int ascii_compatible;
8755
8756   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8757   attrs = CODING_ID_ATTRS (coding.id);
8758   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8759     return Qnil;
8760   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8761   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8762   translation_table = get_translation_table (attrs, 1, NULL);
8763
8764   if (NILP (string))
8765     {
8766       validate_region (&start, &end);
8767       from = XINT (start);
8768       to = XINT (end);
8769       if (NILP (current_buffer->enable_multibyte_characters)
8770           || (ascii_compatible
8771               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8772         return Qnil;
8773       p = CHAR_POS_ADDR (from);
8774       pend = CHAR_POS_ADDR (to);
8775       if (from < GPT && to >= GPT)
8776         stop = GPT_ADDR;
8777       else
8778         stop = pend;
8779     }
8780   else
8781     {
8782       CHECK_STRING (string);
8783       CHECK_NATNUM (start);
8784       CHECK_NATNUM (end);
8785       from = XINT (start);
8786       to = XINT (end);
8787       if (from > to
8788           || to > SCHARS (string))
8789         args_out_of_range_3 (string, start, end);
8790       if (! STRING_MULTIBYTE (string))
8791         return Qnil;
8792       p = SDATA (string) + string_char_to_byte (string, from);
8793       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8794       if (ascii_compatible && (to - from) == (pend - p))
8795         return Qnil;
8796     }
8797
8798   if (NILP (count))
8799     n = 1;
8800   else
8801     {
8802       CHECK_NATNUM (count);
8803       n = XINT (count);
8804     }
8805
8806   positions = Qnil;
8807   while (1)
8808     {
8809       int c;
8810
8811       if (ascii_compatible)
8812         while (p < stop && ASCII_BYTE_P (*p))
8813           p++, from++;
8814       if (p >= stop)
8815         {
8816           if (p >= pend)
8817             break;
8818           stop = pend;
8819           p = GAP_END_ADDR;
8820         }
8821
8822       c = STRING_CHAR_ADVANCE (p);
8823       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8824           && ! char_charset (translate_char (translation_table, c),
8825                              charset_list, NULL))
8826         {
8827           positions = Fcons (make_number (from), positions);
8828           n--;
8829           if (n == 0)
8830             break;
8831         }
8832
8833       from++;
8834     }
8835
8836   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8837 }
8838
8839
8840 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8841        Scheck_coding_systems_region, 3, 3, 0,
8842        doc: /* Check if the region is encodable by coding systems.
8843
8844 START and END are buffer positions specifying the region.
8845 CODING-SYSTEM-LIST is a list of coding systems to check.
8846
8847 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8848 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8849 whole region, POS0, POS1, ... are buffer positions where non-encodable
8850 characters are found.
8851
8852 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8853 value is nil.
8854
8855 START may be a string.  In that case, check if the string is
8856 encodable, and the value contains indices to the string instead of
8857 buffer positions.  END is ignored.
8858
8859 If the current buffer (or START if it is a string) is unibyte, the value
8860 is nil.  */)
8861   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8862 {
8863   Lisp_Object list;
8864   EMACS_INT start_byte, end_byte;
8865   int pos;
8866   const unsigned char *p, *pbeg, *pend;
8867   int c;
8868   Lisp_Object tail, elt, attrs;
8869
8870   if (STRINGP (start))
8871     {
8872       if (!STRING_MULTIBYTE (start)
8873           || SCHARS (start) == SBYTES (start))
8874         return Qnil;
8875       start_byte = 0;
8876       end_byte = SBYTES (start);
8877       pos = 0;
8878     }
8879   else
8880     {
8881       CHECK_NUMBER_COERCE_MARKER (start);
8882       CHECK_NUMBER_COERCE_MARKER (end);
8883       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8884         args_out_of_range (start, end);
8885       if (NILP (current_buffer->enable_multibyte_characters))
8886         return Qnil;
8887       start_byte = CHAR_TO_BYTE (XINT (start));
8888       end_byte = CHAR_TO_BYTE (XINT (end));
8889       if (XINT (end) - XINT (start) == end_byte - start_byte)
8890         return Qnil;
8891
8892       if (XINT (start) < GPT && XINT (end) > GPT)
8893         {
8894           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8895             move_gap_both (XINT (start), start_byte);
8896           else
8897             move_gap_both (XINT (end), end_byte);
8898         }
8899       pos = XINT (start);
8900     }
8901
8902   list = Qnil;
8903   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8904     {
8905       elt = XCAR (tail);
8906       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8907       ASET (attrs, coding_attr_trans_tbl,
8908             get_translation_table (attrs, 1, NULL));
8909       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8910     }
8911
8912   if (STRINGP (start))
8913     p = pbeg = SDATA (start);
8914   else
8915     p = pbeg = BYTE_POS_ADDR (start_byte);
8916   pend = p + (end_byte - start_byte);
8917
8918   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8919   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8920
8921   while (p < pend)
8922     {
8923       if (ASCII_BYTE_P (*p))
8924         p++;
8925       else
8926         {
8927           c = STRING_CHAR_ADVANCE (p);
8928
8929           charset_map_loaded = 0;
8930           for (tail = list; CONSP (tail); tail = XCDR (tail))
8931             {
8932               elt = XCDR (XCAR (tail));
8933               if (! char_encodable_p (c, XCAR (elt)))
8934                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8935             }
8936           if (charset_map_loaded)
8937             {
8938               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8939
8940               if (STRINGP (start))
8941                 pbeg = SDATA (start);
8942               else
8943                 pbeg = BYTE_POS_ADDR (start_byte);
8944               p = pbeg + p_offset;
8945               pend = pbeg + pend_offset;
8946             }
8947         }
8948       pos++;
8949     }
8950
8951   tail = list;
8952   list = Qnil;
8953   for (; CONSP (tail); tail = XCDR (tail))
8954     {
8955       elt = XCAR (tail);
8956       if (CONSP (XCDR (XCDR (elt))))
8957         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8958                       list);
8959     }
8960
8961   return list;
8962 }
8963
8964
8965 Lisp_Object
8966 code_convert_region (Lisp_Object start, Lisp_Object end,
8967                      Lisp_Object coding_system, Lisp_Object dst_object,
8968                      int encodep, int norecord)
8969 {
8970   struct coding_system coding;
8971   EMACS_INT from, from_byte, to, to_byte;
8972   Lisp_Object src_object;
8973
8974   CHECK_NUMBER_COERCE_MARKER (start);
8975   CHECK_NUMBER_COERCE_MARKER (end);
8976   if (NILP (coding_system))
8977     coding_system = Qno_conversion;
8978   else
8979     CHECK_CODING_SYSTEM (coding_system);
8980   src_object = Fcurrent_buffer ();
8981   if (NILP (dst_object))
8982     dst_object = src_object;
8983   else if (! EQ (dst_object, Qt))
8984     CHECK_BUFFER (dst_object);
8985
8986   validate_region (&start, &end);
8987   from = XFASTINT (start);
8988   from_byte = CHAR_TO_BYTE (from);
8989   to = XFASTINT (end);
8990   to_byte = CHAR_TO_BYTE (to);
8991
8992   setup_coding_system (coding_system, &coding);
8993   coding.mode |= CODING_MODE_LAST_BLOCK;
8994
8995   if (encodep)
8996     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8997                           dst_object);
8998   else
8999     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9000                           dst_object);
9001   if (! norecord)
9002     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9003
9004   return (BUFFERP (dst_object)
9005           ? make_number (coding.produced_char)
9006           : coding.dst_object);
9007 }
9008
9009
9010 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9011        3, 4, "r\nzCoding system: ",
9012        doc: /* Decode the current region from the specified coding system.
9013 When called from a program, takes four arguments:
9014         START, END, CODING-SYSTEM, and DESTINATION.
9015 START and END are buffer positions.
9016
9017 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9018 If nil, the region between START and END is replaced by the decoded text.
9019 If buffer, the decoded text is inserted in that buffer after point (point
9020 does not move).
9021 In those cases, the length of the decoded text is returned.
9022 If DESTINATION is t, the decoded text is returned.
9023
9024 This function sets `last-coding-system-used' to the precise coding system
9025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9026 not fully specified.)  */)
9027   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9028 {
9029   return code_convert_region (start, end, coding_system, destination, 0, 0);
9030 }
9031
9032 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9033        3, 4, "r\nzCoding system: ",
9034        doc: /* Encode the current region by specified coding system.
9035 When called from a program, takes four arguments:
9036         START, END, CODING-SYSTEM and DESTINATION.
9037 START and END are buffer positions.
9038
9039 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9040 If nil, the region between START and END is replace by the encoded text.
9041 If buffer, the encoded text is inserted in that buffer after point (point
9042 does not move).
9043 In those cases, the length of the encoded text is returned.
9044 If DESTINATION is t, the encoded text is returned.
9045
9046 This function sets `last-coding-system-used' to the precise coding system
9047 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9048 not fully specified.)  */)
9049   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9050 {
9051   return code_convert_region (start, end, coding_system, destination, 1, 0);
9052 }
9053
9054 Lisp_Object
9055 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9056                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
9057 {
9058   struct coding_system coding;
9059   EMACS_INT chars, bytes;
9060
9061   CHECK_STRING (string);
9062   if (NILP (coding_system))
9063     {
9064       if (! norecord)
9065         Vlast_coding_system_used = Qno_conversion;
9066       if (NILP (dst_object))
9067         return (nocopy ? Fcopy_sequence (string) : string);
9068     }
9069
9070   if (NILP (coding_system))
9071     coding_system = Qno_conversion;
9072   else
9073     CHECK_CODING_SYSTEM (coding_system);
9074   if (NILP (dst_object))
9075     dst_object = Qt;
9076   else if (! EQ (dst_object, Qt))
9077     CHECK_BUFFER (dst_object);
9078
9079   setup_coding_system (coding_system, &coding);
9080   coding.mode |= CODING_MODE_LAST_BLOCK;
9081   chars = SCHARS (string);
9082   bytes = SBYTES (string);
9083   if (encodep)
9084     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9085   else
9086     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9087   if (! norecord)
9088     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9089
9090   return (BUFFERP (dst_object)
9091           ? make_number (coding.produced_char)
9092           : coding.dst_object);
9093 }
9094
9095
9096 /* Encode or decode STRING according to CODING_SYSTEM.
9097    Do not set Vlast_coding_system_used.
9098
9099    This function is called only from macros DECODE_FILE and
9100    ENCODE_FILE, thus we ignore character composition.  */
9101
9102 Lisp_Object
9103 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9104                               int encodep)
9105 {
9106   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9107 }
9108
9109
9110 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9111        2, 4, 0,
9112        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9113
9114 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9115 if the decoding operation is trivial.
9116
9117 Optional fourth arg BUFFER non-nil means that the decoded text is
9118 inserted in that buffer after point (point does not move).  In this
9119 case, the return value is the length of the decoded text.
9120
9121 This function sets `last-coding-system-used' to the precise coding system
9122 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9123 not fully specified.)  */)
9124   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9125 {
9126   return code_convert_string (string, coding_system, buffer,
9127                               0, ! NILP (nocopy), 0);
9128 }
9129
9130 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9131        2, 4, 0,
9132        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9133
9134 Optional third arg NOCOPY non-nil means it is OK to return STRING
9135 itself if the encoding operation is trivial.
9136
9137 Optional fourth arg BUFFER non-nil means that the encoded text is
9138 inserted in that buffer after point (point does not move).  In this
9139 case, the return value is the length of the encoded text.
9140
9141 This function sets `last-coding-system-used' to the precise coding system
9142 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9143 not fully specified.)  */)
9144   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9145 {
9146   return code_convert_string (string, coding_system, buffer,
9147                               1, ! NILP (nocopy), 1);
9148 }
9149
9150 \f
9151 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9152        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9153 Return the corresponding character.  */)
9154   (Lisp_Object code)
9155 {
9156   Lisp_Object spec, attrs, val;
9157   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9158   int c;
9159
9160   CHECK_NATNUM (code);
9161   c = XFASTINT (code);
9162   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9163   attrs = AREF (spec, 0);
9164
9165   if (ASCII_BYTE_P (c)
9166       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9167     return code;
9168
9169   val = CODING_ATTR_CHARSET_LIST (attrs);
9170   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9171   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9172   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9173
9174   if (c <= 0x7F)
9175     charset = charset_roman;
9176   else if (c >= 0xA0 && c < 0xDF)
9177     {
9178       charset = charset_kana;
9179       c -= 0x80;
9180     }
9181   else
9182     {
9183       int s1 = c >> 8, s2 = c & 0xFF;
9184
9185       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9186           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9187         error ("Invalid code: %d", code);
9188       SJIS_TO_JIS (c);
9189       charset = charset_kanji;
9190     }
9191   c = DECODE_CHAR (charset, c);
9192   if (c < 0)
9193     error ("Invalid code: %d", code);
9194   return make_number (c);
9195 }
9196
9197
9198 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9199        doc: /* Encode a Japanese character CH to shift_jis encoding.
9200 Return the corresponding code in SJIS.  */)
9201   (Lisp_Object ch)
9202 {
9203   Lisp_Object spec, attrs, charset_list;
9204   int c;
9205   struct charset *charset;
9206   unsigned code;
9207
9208   CHECK_CHARACTER (ch);
9209   c = XFASTINT (ch);
9210   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9211   attrs = AREF (spec, 0);
9212
9213   if (ASCII_CHAR_P (c)
9214       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9215     return ch;
9216
9217   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9218   charset = char_charset (c, charset_list, &code);
9219   if (code == CHARSET_INVALID_CODE (charset))
9220     error ("Can't encode by shift_jis encoding: %d", c);
9221   JIS_TO_SJIS (code);
9222
9223   return make_number (code);
9224 }
9225
9226 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9227        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9228 Return the corresponding character.  */)
9229   (Lisp_Object code)
9230 {
9231   Lisp_Object spec, attrs, val;
9232   struct charset *charset_roman, *charset_big5, *charset;
9233   int c;
9234
9235   CHECK_NATNUM (code);
9236   c = XFASTINT (code);
9237   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9238   attrs = AREF (spec, 0);
9239
9240   if (ASCII_BYTE_P (c)
9241       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9242     return code;
9243
9244   val = CODING_ATTR_CHARSET_LIST (attrs);
9245   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9246   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9247
9248   if (c <= 0x7F)
9249     charset = charset_roman;
9250   else
9251     {
9252       int b1 = c >> 8, b2 = c & 0x7F;
9253       if (b1 < 0xA1 || b1 > 0xFE
9254           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9255         error ("Invalid code: %d", code);
9256       charset = charset_big5;
9257     }
9258   c = DECODE_CHAR (charset, (unsigned )c);
9259   if (c < 0)
9260     error ("Invalid code: %d", code);
9261   return make_number (c);
9262 }
9263
9264 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9265        doc: /* Encode the Big5 character CH to BIG5 coding system.
9266 Return the corresponding character code in Big5.  */)
9267   (Lisp_Object ch)
9268 {
9269   Lisp_Object spec, attrs, charset_list;
9270   struct charset *charset;
9271   int c;
9272   unsigned code;
9273
9274   CHECK_CHARACTER (ch);
9275   c = XFASTINT (ch);
9276   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9277   attrs = AREF (spec, 0);
9278   if (ASCII_CHAR_P (c)
9279       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9280     return ch;
9281
9282   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9283   charset = char_charset (c, charset_list, &code);
9284   if (code == CHARSET_INVALID_CODE (charset))
9285     error ("Can't encode by Big5 encoding: %d", c);
9286
9287   return make_number (code);
9288 }
9289
9290 \f
9291 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9292        Sset_terminal_coding_system_internal, 1, 2, 0,
9293        doc: /* Internal use only.  */)
9294   (Lisp_Object coding_system, Lisp_Object terminal)
9295 {
9296   struct terminal *term = get_terminal (terminal, 1);
9297   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9298   CHECK_SYMBOL (coding_system);
9299   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9300   /* We had better not send unsafe characters to terminal.  */
9301   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9302   /* Character composition should be disabled.  */
9303   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9304   terminal_coding->src_multibyte = 1;
9305   terminal_coding->dst_multibyte = 0;
9306   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9307     term->charset_list = coding_charset_list (terminal_coding);
9308   else
9309     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9310   return Qnil;
9311 }
9312
9313 DEFUN ("set-safe-terminal-coding-system-internal",
9314        Fset_safe_terminal_coding_system_internal,
9315        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9316        doc: /* Internal use only.  */)
9317   (Lisp_Object coding_system)
9318 {
9319   CHECK_SYMBOL (coding_system);
9320   setup_coding_system (Fcheck_coding_system (coding_system),
9321                        &safe_terminal_coding);
9322   /* Character composition should be disabled.  */
9323   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9324   safe_terminal_coding.src_multibyte = 1;
9325   safe_terminal_coding.dst_multibyte = 0;
9326   return Qnil;
9327 }
9328
9329 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9330        Sterminal_coding_system, 0, 1, 0,
9331        doc: /* Return coding system specified for terminal output on the given terminal.
9332 TERMINAL may be a terminal object, a frame, or nil for the selected
9333 frame's terminal device.  */)
9334   (Lisp_Object terminal)
9335 {
9336   struct coding_system *terminal_coding
9337     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9338   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9339
9340   /* For backward compatibility, return nil if it is `undecided'. */
9341   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9342 }
9343
9344 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9345        Sset_keyboard_coding_system_internal, 1, 2, 0,
9346        doc: /* Internal use only.  */)
9347   (Lisp_Object coding_system, Lisp_Object terminal)
9348 {
9349   struct terminal *t = get_terminal (terminal, 1);
9350   CHECK_SYMBOL (coding_system);
9351   if (NILP (coding_system))
9352     coding_system = Qno_conversion;
9353   else
9354     Fcheck_coding_system (coding_system);
9355   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9356   /* Character composition should be disabled.  */
9357   TERMINAL_KEYBOARD_CODING (t)->common_flags
9358     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9359   return Qnil;
9360 }
9361
9362 DEFUN ("keyboard-coding-system",
9363        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9364        doc: /* Return coding system specified for decoding keyboard input.  */)
9365   (Lisp_Object terminal)
9366 {
9367   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9368                          (get_terminal (terminal, 1))->id);
9369 }
9370
9371 \f
9372 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9373        Sfind_operation_coding_system,  1, MANY, 0,
9374        doc: /* Choose a coding system for an operation based on the target name.
9375 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9376 DECODING-SYSTEM is the coding system to use for decoding
9377 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9378 for encoding (in case OPERATION does encoding).
9379
9380 The first argument OPERATION specifies an I/O primitive:
9381   For file I/O, `insert-file-contents' or `write-region'.
9382   For process I/O, `call-process', `call-process-region', or `start-process'.
9383   For network I/O, `open-network-stream'.
9384
9385 The remaining arguments should be the same arguments that were passed
9386 to the primitive.  Depending on which primitive, one of those arguments
9387 is selected as the TARGET.  For example, if OPERATION does file I/O,
9388 whichever argument specifies the file name is TARGET.
9389
9390 TARGET has a meaning which depends on OPERATION:
9391   For file I/O, TARGET is a file name (except for the special case below).
9392   For process I/O, TARGET is a process name.
9393   For network I/O, TARGET is a service name or a port number.
9394
9395 This function looks up what is specified for TARGET in
9396 `file-coding-system-alist', `process-coding-system-alist',
9397 or `network-coding-system-alist' depending on OPERATION.
9398 They may specify a coding system, a cons of coding systems,
9399 or a function symbol to call.
9400 In the last case, we call the function with one argument,
9401 which is a list of all the arguments given to this function.
9402 If the function can't decide a coding system, it can return
9403 `undecided' so that the normal code-detection is performed.
9404
9405 If OPERATION is `insert-file-contents', the argument corresponding to
9406 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9407 file name to look up, and BUFFER is a buffer that contains the file's
9408 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9409 function to call for FILENAME, that function should examine the
9410 contents of BUFFER instead of reading the file.
9411
9412 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9413   (int nargs, Lisp_Object *args)
9414 {
9415   Lisp_Object operation, target_idx, target, val;
9416   register Lisp_Object chain;
9417
9418   if (nargs < 2)
9419     error ("Too few arguments");
9420   operation = args[0];
9421   if (!SYMBOLP (operation)
9422       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9423     error ("Invalid first argument");
9424   if (nargs < 1 + XINT (target_idx))
9425     error ("Too few arguments for operation: %s",
9426            SDATA (SYMBOL_NAME (operation)));
9427   target = args[XINT (target_idx) + 1];
9428   if (!(STRINGP (target)
9429         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9430             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9431         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9432     error ("Invalid %dth argument", XINT (target_idx) + 1);
9433   if (CONSP (target))
9434     target = XCAR (target);
9435
9436   chain = ((EQ (operation, Qinsert_file_contents)
9437             || EQ (operation, Qwrite_region))
9438            ? Vfile_coding_system_alist
9439            : (EQ (operation, Qopen_network_stream)
9440               ? Vnetwork_coding_system_alist
9441               : Vprocess_coding_system_alist));
9442   if (NILP (chain))
9443     return Qnil;
9444
9445   for (; CONSP (chain); chain = XCDR (chain))
9446     {
9447       Lisp_Object elt;
9448
9449       elt = XCAR (chain);
9450       if (CONSP (elt)
9451           && ((STRINGP (target)
9452                && STRINGP (XCAR (elt))
9453                && fast_string_match (XCAR (elt), target) >= 0)
9454               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9455         {
9456           val = XCDR (elt);
9457           /* Here, if VAL is both a valid coding system and a valid
9458              function symbol, we return VAL as a coding system.  */
9459           if (CONSP (val))
9460             return val;
9461           if (! SYMBOLP (val))
9462             return Qnil;
9463           if (! NILP (Fcoding_system_p (val)))
9464             return Fcons (val, val);
9465           if (! NILP (Ffboundp (val)))
9466             {
9467               /* We use call1 rather than safe_call1
9468                  so as to get bug reports about functions called here
9469                  which don't handle the current interface.  */
9470               val = call1 (val, Flist (nargs, args));
9471               if (CONSP (val))
9472                 return val;
9473               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9474                 return Fcons (val, val);
9475             }
9476           return Qnil;
9477         }
9478     }
9479   return Qnil;
9480 }
9481
9482 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9483        Sset_coding_system_priority, 0, MANY, 0,
9484        doc: /* Assign higher priority to the coding systems given as arguments.
9485 If multiple coding systems belong to the same category,
9486 all but the first one are ignored.
9487
9488 usage: (set-coding-system-priority &rest coding-systems)  */)
9489   (int nargs, Lisp_Object *args)
9490 {
9491   int i, j;
9492   int changed[coding_category_max];
9493   enum coding_category priorities[coding_category_max];
9494
9495   memset (changed, 0, sizeof changed);
9496
9497   for (i = j = 0; i < nargs; i++)
9498     {
9499       enum coding_category category;
9500       Lisp_Object spec, attrs;
9501
9502       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9503       attrs = AREF (spec, 0);
9504       category = XINT (CODING_ATTR_CATEGORY (attrs));
9505       if (changed[category])
9506         /* Ignore this coding system because a coding system of the
9507            same category already had a higher priority.  */
9508         continue;
9509       changed[category] = 1;
9510       priorities[j++] = category;
9511       if (coding_categories[category].id >= 0
9512           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9513         setup_coding_system (args[i], &coding_categories[category]);
9514       Fset (AREF (Vcoding_category_table, category), args[i]);
9515     }
9516
9517   /* Now we have decided top J priorities.  Reflect the order of the
9518      original priorities to the remaining priorities.  */
9519
9520   for (i = j, j = 0; i < coding_category_max; i++, j++)
9521     {
9522       while (j < coding_category_max
9523              && changed[coding_priorities[j]])
9524         j++;
9525       if (j == coding_category_max)
9526         abort ();
9527       priorities[i] = coding_priorities[j];
9528     }
9529
9530   memcpy (coding_priorities, priorities, sizeof priorities);
9531
9532   /* Update `coding-category-list'.  */
9533   Vcoding_category_list = Qnil;
9534   for (i = coding_category_max - 1; i >= 0; i--)
9535     Vcoding_category_list
9536       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9537                Vcoding_category_list);
9538
9539   return Qnil;
9540 }
9541
9542 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9543        Scoding_system_priority_list, 0, 1, 0,
9544        doc: /* Return a list of coding systems ordered by their priorities.
9545 The list contains a subset of coding systems; i.e. coding systems
9546 assigned to each coding category (see `coding-category-list').
9547
9548 HIGHESTP non-nil means just return the highest priority one.  */)
9549   (Lisp_Object highestp)
9550 {
9551   int i;
9552   Lisp_Object val;
9553
9554   for (i = 0, val = Qnil; i < coding_category_max; i++)
9555     {
9556       enum coding_category category = coding_priorities[i];
9557       int id = coding_categories[category].id;
9558       Lisp_Object attrs;
9559
9560       if (id < 0)
9561         continue;
9562       attrs = CODING_ID_ATTRS (id);
9563       if (! NILP (highestp))
9564         return CODING_ATTR_BASE_NAME (attrs);
9565       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9566     }
9567   return Fnreverse (val);
9568 }
9569
9570 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9571
9572 static Lisp_Object
9573 make_subsidiaries (Lisp_Object base)
9574 {
9575   Lisp_Object subsidiaries;
9576   int base_name_len = SBYTES (SYMBOL_NAME (base));
9577   char *buf = (char *) alloca (base_name_len + 6);
9578   int i;
9579
9580   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9581   subsidiaries = Fmake_vector (make_number (3), Qnil);
9582   for (i = 0; i < 3; i++)
9583     {
9584       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9585       ASET (subsidiaries, i, intern (buf));
9586     }
9587   return subsidiaries;
9588 }
9589
9590
9591 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9592        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9593        doc: /* For internal use only.
9594 usage: (define-coding-system-internal ...)  */)
9595   (int nargs, Lisp_Object *args)
9596 {
9597   Lisp_Object name;
9598   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9599   Lisp_Object attrs;            /* Vector of attributes.  */
9600   Lisp_Object eol_type;
9601   Lisp_Object aliases;
9602   Lisp_Object coding_type, charset_list, safe_charsets;
9603   enum coding_category category;
9604   Lisp_Object tail, val;
9605   int max_charset_id = 0;
9606   int i;
9607
9608   if (nargs < coding_arg_max)
9609     goto short_args;
9610
9611   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9612
9613   name = args[coding_arg_name];
9614   CHECK_SYMBOL (name);
9615   CODING_ATTR_BASE_NAME (attrs) = name;
9616
9617   val = args[coding_arg_mnemonic];
9618   if (! STRINGP (val))
9619     CHECK_CHARACTER (val);
9620   CODING_ATTR_MNEMONIC (attrs) = val;
9621
9622   coding_type = args[coding_arg_coding_type];
9623   CHECK_SYMBOL (coding_type);
9624   CODING_ATTR_TYPE (attrs) = coding_type;
9625
9626   charset_list = args[coding_arg_charset_list];
9627   if (SYMBOLP (charset_list))
9628     {
9629       if (EQ (charset_list, Qiso_2022))
9630         {
9631           if (! EQ (coding_type, Qiso_2022))
9632             error ("Invalid charset-list");
9633           charset_list = Viso_2022_charset_list;
9634         }
9635       else if (EQ (charset_list, Qemacs_mule))
9636         {
9637           if (! EQ (coding_type, Qemacs_mule))
9638             error ("Invalid charset-list");
9639           charset_list = Vemacs_mule_charset_list;
9640         }
9641       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9642         if (max_charset_id < XFASTINT (XCAR (tail)))
9643           max_charset_id = XFASTINT (XCAR (tail));
9644     }
9645   else
9646     {
9647       charset_list = Fcopy_sequence (charset_list);
9648       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9649         {
9650           struct charset *charset;
9651
9652           val = XCAR (tail);
9653           CHECK_CHARSET_GET_CHARSET (val, charset);
9654           if (EQ (coding_type, Qiso_2022)
9655               ? CHARSET_ISO_FINAL (charset) < 0
9656               : EQ (coding_type, Qemacs_mule)
9657               ? CHARSET_EMACS_MULE_ID (charset) < 0
9658               : 0)
9659             error ("Can't handle charset `%s'",
9660                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9661
9662           XSETCAR (tail, make_number (charset->id));
9663           if (max_charset_id < charset->id)
9664             max_charset_id = charset->id;
9665         }
9666     }
9667   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9668
9669   safe_charsets = make_uninit_string (max_charset_id + 1);
9670   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9671   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9672     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9673   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9674
9675   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9676
9677   val = args[coding_arg_decode_translation_table];
9678   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9679     CHECK_SYMBOL (val);
9680   CODING_ATTR_DECODE_TBL (attrs) = val;
9681
9682   val = args[coding_arg_encode_translation_table];
9683   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9684     CHECK_SYMBOL (val);
9685   CODING_ATTR_ENCODE_TBL (attrs) = val;
9686
9687   val = args[coding_arg_post_read_conversion];
9688   CHECK_SYMBOL (val);
9689   CODING_ATTR_POST_READ (attrs) = val;
9690
9691   val = args[coding_arg_pre_write_conversion];
9692   CHECK_SYMBOL (val);
9693   CODING_ATTR_PRE_WRITE (attrs) = val;
9694
9695   val = args[coding_arg_default_char];
9696   if (NILP (val))
9697     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9698   else
9699     {
9700       CHECK_CHARACTER (val);
9701       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9702     }
9703
9704   val = args[coding_arg_for_unibyte];
9705   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9706
9707   val = args[coding_arg_plist];
9708   CHECK_LIST (val);
9709   CODING_ATTR_PLIST (attrs) = val;
9710
9711   if (EQ (coding_type, Qcharset))
9712     {
9713       /* Generate a lisp vector of 256 elements.  Each element is nil,
9714          integer, or a list of charset IDs.
9715
9716          If Nth element is nil, the byte code N is invalid in this
9717          coding system.
9718
9719          If Nth element is a number NUM, N is the first byte of a
9720          charset whose ID is NUM.
9721
9722          If Nth element is a list of charset IDs, N is the first byte
9723          of one of them.  The list is sorted by dimensions of the
9724          charsets.  A charset of smaller dimension comes first. */
9725       val = Fmake_vector (make_number (256), Qnil);
9726
9727       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9728         {
9729           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9730           int dim = CHARSET_DIMENSION (charset);
9731           int idx = (dim - 1) * 4;
9732
9733           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9734             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9735
9736           for (i = charset->code_space[idx];
9737                i <= charset->code_space[idx + 1]; i++)
9738             {
9739               Lisp_Object tmp, tmp2;
9740               int dim2;
9741
9742               tmp = AREF (val, i);
9743               if (NILP (tmp))
9744                 tmp = XCAR (tail);
9745               else if (NUMBERP (tmp))
9746                 {
9747                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9748                   if (dim < dim2)
9749                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9750                   else
9751                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9752                 }
9753               else
9754                 {
9755                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9756                     {
9757                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9758                       if (dim < dim2)
9759                         break;
9760                     }
9761                   if (NILP (tmp2))
9762                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9763                   else
9764                     {
9765                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9766                       XSETCAR (tmp2, XCAR (tail));
9767                     }
9768                 }
9769               ASET (val, i, tmp);
9770             }
9771         }
9772       ASET (attrs, coding_attr_charset_valids, val);
9773       category = coding_category_charset;
9774     }
9775   else if (EQ (coding_type, Qccl))
9776     {
9777       Lisp_Object valids;
9778
9779       if (nargs < coding_arg_ccl_max)
9780         goto short_args;
9781
9782       val = args[coding_arg_ccl_decoder];
9783       CHECK_CCL_PROGRAM (val);
9784       if (VECTORP (val))
9785         val = Fcopy_sequence (val);
9786       ASET (attrs, coding_attr_ccl_decoder, val);
9787
9788       val = args[coding_arg_ccl_encoder];
9789       CHECK_CCL_PROGRAM (val);
9790       if (VECTORP (val))
9791         val = Fcopy_sequence (val);
9792       ASET (attrs, coding_attr_ccl_encoder, val);
9793
9794       val = args[coding_arg_ccl_valids];
9795       valids = Fmake_string (make_number (256), make_number (0));
9796       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9797         {
9798           int from, to;
9799
9800           val = Fcar (tail);
9801           if (INTEGERP (val))
9802             {
9803               from = to = XINT (val);
9804               if (from < 0 || from > 255)
9805                 args_out_of_range_3 (val, make_number (0), make_number (255));
9806             }
9807           else
9808             {
9809               CHECK_CONS (val);
9810               CHECK_NATNUM_CAR (val);
9811               CHECK_NATNUM_CDR (val);
9812               from = XINT (XCAR (val));
9813               if (from > 255)
9814                 args_out_of_range_3 (XCAR (val),
9815                                      make_number (0), make_number (255));
9816               to = XINT (XCDR (val));
9817               if (to < from || to > 255)
9818                 args_out_of_range_3 (XCDR (val),
9819                                      XCAR (val), make_number (255));
9820             }
9821           for (i = from; i <= to; i++)
9822             SSET (valids, i, 1);
9823         }
9824       ASET (attrs, coding_attr_ccl_valids, valids);
9825
9826       category = coding_category_ccl;
9827     }
9828   else if (EQ (coding_type, Qutf_16))
9829     {
9830       Lisp_Object bom, endian;
9831
9832       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9833
9834       if (nargs < coding_arg_utf16_max)
9835         goto short_args;
9836
9837       bom = args[coding_arg_utf16_bom];
9838       if (! NILP (bom) && ! EQ (bom, Qt))
9839         {
9840           CHECK_CONS (bom);
9841           val = XCAR (bom);
9842           CHECK_CODING_SYSTEM (val);
9843           val = XCDR (bom);
9844           CHECK_CODING_SYSTEM (val);
9845         }
9846       ASET (attrs, coding_attr_utf_bom, bom);
9847
9848       endian = args[coding_arg_utf16_endian];
9849       CHECK_SYMBOL (endian);
9850       if (NILP (endian))
9851         endian = Qbig;
9852       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9853         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9854       ASET (attrs, coding_attr_utf_16_endian, endian);
9855
9856       category = (CONSP (bom)
9857                   ? coding_category_utf_16_auto
9858                   : NILP (bom)
9859                   ? (EQ (endian, Qbig)
9860                      ? coding_category_utf_16_be_nosig
9861                      : coding_category_utf_16_le_nosig)
9862                   : (EQ (endian, Qbig)
9863                      ? coding_category_utf_16_be
9864                      : coding_category_utf_16_le));
9865     }
9866   else if (EQ (coding_type, Qiso_2022))
9867     {
9868       Lisp_Object initial, reg_usage, request, flags;
9869       int i;
9870
9871       if (nargs < coding_arg_iso2022_max)
9872         goto short_args;
9873
9874       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9875       CHECK_VECTOR (initial);
9876       for (i = 0; i < 4; i++)
9877         {
9878           val = Faref (initial, make_number (i));
9879           if (! NILP (val))
9880             {
9881               struct charset *charset;
9882
9883               CHECK_CHARSET_GET_CHARSET (val, charset);
9884               ASET (initial, i, make_number (CHARSET_ID (charset)));
9885               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9886                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9887             }
9888           else
9889             ASET (initial, i, make_number (-1));
9890         }
9891
9892       reg_usage = args[coding_arg_iso2022_reg_usage];
9893       CHECK_CONS (reg_usage);
9894       CHECK_NUMBER_CAR (reg_usage);
9895       CHECK_NUMBER_CDR (reg_usage);
9896
9897       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9898       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9899         {
9900           int id;
9901           Lisp_Object tmp;
9902
9903           val = Fcar (tail);
9904           CHECK_CONS (val);
9905           tmp = XCAR (val);
9906           CHECK_CHARSET_GET_ID (tmp, id);
9907           CHECK_NATNUM_CDR (val);
9908           if (XINT (XCDR (val)) >= 4)
9909             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9910           XSETCAR (val, make_number (id));
9911         }
9912
9913       flags = args[coding_arg_iso2022_flags];
9914       CHECK_NATNUM (flags);
9915       i = XINT (flags);
9916       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9917         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9918
9919       ASET (attrs, coding_attr_iso_initial, initial);
9920       ASET (attrs, coding_attr_iso_usage, reg_usage);
9921       ASET (attrs, coding_attr_iso_request, request);
9922       ASET (attrs, coding_attr_iso_flags, flags);
9923       setup_iso_safe_charsets (attrs);
9924
9925       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9926         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9927                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9928                     ? coding_category_iso_7_else
9929                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9930                     ? coding_category_iso_7
9931                     : coding_category_iso_7_tight);
9932       else
9933         {
9934           int id = XINT (AREF (initial, 1));
9935
9936           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9937                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9938                        || id < 0)
9939                       ? coding_category_iso_8_else
9940                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9941                       ? coding_category_iso_8_1
9942                       : coding_category_iso_8_2);
9943         }
9944       if (category != coding_category_iso_8_1
9945           && category != coding_category_iso_8_2)
9946         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9947     }
9948   else if (EQ (coding_type, Qemacs_mule))
9949     {
9950       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9951         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9952       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9953       category = coding_category_emacs_mule;
9954     }
9955   else if (EQ (coding_type, Qshift_jis))
9956     {
9957
9958       struct charset *charset;
9959
9960       if (XINT (Flength (charset_list)) != 3
9961           && XINT (Flength (charset_list)) != 4)
9962         error ("There should be three or four charsets");
9963
9964       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9965       if (CHARSET_DIMENSION (charset) != 1)
9966         error ("Dimension of charset %s is not one",
9967                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9968       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9969         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9970
9971       charset_list = XCDR (charset_list);
9972       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9973       if (CHARSET_DIMENSION (charset) != 1)
9974         error ("Dimension of charset %s is not one",
9975                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9976
9977       charset_list = XCDR (charset_list);
9978       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9979       if (CHARSET_DIMENSION (charset) != 2)
9980         error ("Dimension of charset %s is not two",
9981                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9982
9983       charset_list = XCDR (charset_list);
9984       if (! NILP (charset_list))
9985         {
9986           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9987           if (CHARSET_DIMENSION (charset) != 2)
9988             error ("Dimension of charset %s is not two",
9989                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9990         }
9991
9992       category = coding_category_sjis;
9993       Vsjis_coding_system = name;
9994     }
9995   else if (EQ (coding_type, Qbig5))
9996     {
9997       struct charset *charset;
9998
9999       if (XINT (Flength (charset_list)) != 2)
10000         error ("There should be just two charsets");
10001
10002       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10003       if (CHARSET_DIMENSION (charset) != 1)
10004         error ("Dimension of charset %s is not one",
10005                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10006       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10007         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10008
10009       charset_list = XCDR (charset_list);
10010       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10011       if (CHARSET_DIMENSION (charset) != 2)
10012         error ("Dimension of charset %s is not two",
10013                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10014
10015       category = coding_category_big5;
10016       Vbig5_coding_system = name;
10017     }
10018   else if (EQ (coding_type, Qraw_text))
10019     {
10020       category = coding_category_raw_text;
10021       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10022     }
10023   else if (EQ (coding_type, Qutf_8))
10024     {
10025       Lisp_Object bom;
10026
10027       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10028
10029       if (nargs < coding_arg_utf8_max)
10030         goto short_args;
10031
10032       bom = args[coding_arg_utf8_bom];
10033       if (! NILP (bom) && ! EQ (bom, Qt))
10034         {
10035           CHECK_CONS (bom);
10036           val = XCAR (bom);
10037           CHECK_CODING_SYSTEM (val);
10038           val = XCDR (bom);
10039           CHECK_CODING_SYSTEM (val);
10040         }
10041       ASET (attrs, coding_attr_utf_bom, bom);
10042
10043       category = (CONSP (bom) ? coding_category_utf_8_auto
10044                   : NILP (bom) ? coding_category_utf_8_nosig
10045                   : coding_category_utf_8_sig);
10046     }
10047   else if (EQ (coding_type, Qundecided))
10048     category = coding_category_undecided;
10049   else
10050     error ("Invalid coding system type: %s",
10051            SDATA (SYMBOL_NAME (coding_type)));
10052
10053   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10054   CODING_ATTR_PLIST (attrs)
10055     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10056                                 CODING_ATTR_PLIST (attrs)));
10057   CODING_ATTR_PLIST (attrs)
10058     = Fcons (QCascii_compatible_p,
10059              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10060                     CODING_ATTR_PLIST (attrs)));
10061
10062   eol_type = args[coding_arg_eol_type];
10063   if (! NILP (eol_type)
10064       && ! EQ (eol_type, Qunix)
10065       && ! EQ (eol_type, Qdos)
10066       && ! EQ (eol_type, Qmac))
10067     error ("Invalid eol-type");
10068
10069   aliases = Fcons (name, Qnil);
10070
10071   if (NILP (eol_type))
10072     {
10073       eol_type = make_subsidiaries (name);
10074       for (i = 0; i < 3; i++)
10075         {
10076           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10077
10078           this_name = AREF (eol_type, i);
10079           this_aliases = Fcons (this_name, Qnil);
10080           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10081           this_spec = Fmake_vector (make_number (3), attrs);
10082           ASET (this_spec, 1, this_aliases);
10083           ASET (this_spec, 2, this_eol_type);
10084           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10085           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10086           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10087           if (NILP (val))
10088             Vcoding_system_alist
10089               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10090                        Vcoding_system_alist);
10091         }
10092     }
10093
10094   spec_vec = Fmake_vector (make_number (3), attrs);
10095   ASET (spec_vec, 1, aliases);
10096   ASET (spec_vec, 2, eol_type);
10097
10098   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10099   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10100   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10101   if (NILP (val))
10102     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10103                                   Vcoding_system_alist);
10104
10105   {
10106     int id = coding_categories[category].id;
10107
10108     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10109       setup_coding_system (name, &coding_categories[category]);
10110   }
10111
10112   return Qnil;
10113
10114  short_args:
10115   return Fsignal (Qwrong_number_of_arguments,
10116                   Fcons (intern ("define-coding-system-internal"),
10117                          make_number (nargs)));
10118 }
10119
10120
10121 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10122        3, 3, 0,
10123        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10124   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10125 {
10126   Lisp_Object spec, attrs;
10127
10128   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10129   attrs = AREF (spec, 0);
10130   if (EQ (prop, QCmnemonic))
10131     {
10132       if (! STRINGP (val))
10133         CHECK_CHARACTER (val);
10134       CODING_ATTR_MNEMONIC (attrs) = val;
10135     }
10136   else if (EQ (prop, QCdefault_char))
10137     {
10138       if (NILP (val))
10139         val = make_number (' ');
10140       else
10141         CHECK_CHARACTER (val);
10142       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10143     }
10144   else if (EQ (prop, QCdecode_translation_table))
10145     {
10146       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10147         CHECK_SYMBOL (val);
10148       CODING_ATTR_DECODE_TBL (attrs) = val;
10149     }
10150   else if (EQ (prop, QCencode_translation_table))
10151     {
10152       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10153         CHECK_SYMBOL (val);
10154       CODING_ATTR_ENCODE_TBL (attrs) = val;
10155     }
10156   else if (EQ (prop, QCpost_read_conversion))
10157     {
10158       CHECK_SYMBOL (val);
10159       CODING_ATTR_POST_READ (attrs) = val;
10160     }
10161   else if (EQ (prop, QCpre_write_conversion))
10162     {
10163       CHECK_SYMBOL (val);
10164       CODING_ATTR_PRE_WRITE (attrs) = val;
10165     }
10166   else if (EQ (prop, QCascii_compatible_p))
10167     {
10168       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10169     }
10170
10171   CODING_ATTR_PLIST (attrs)
10172     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10173   return val;
10174 }
10175
10176
10177 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10178        Sdefine_coding_system_alias, 2, 2, 0,
10179        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10180   (Lisp_Object alias, Lisp_Object coding_system)
10181 {
10182   Lisp_Object spec, aliases, eol_type, val;
10183
10184   CHECK_SYMBOL (alias);
10185   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10186   aliases = AREF (spec, 1);
10187   /* ALIASES should be a list of length more than zero, and the first
10188      element is a base coding system.  Append ALIAS at the tail of the
10189      list.  */
10190   while (!NILP (XCDR (aliases)))
10191     aliases = XCDR (aliases);
10192   XSETCDR (aliases, Fcons (alias, Qnil));
10193
10194   eol_type = AREF (spec, 2);
10195   if (VECTORP (eol_type))
10196     {
10197       Lisp_Object subsidiaries;
10198       int i;
10199
10200       subsidiaries = make_subsidiaries (alias);
10201       for (i = 0; i < 3; i++)
10202         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10203                                      AREF (eol_type, i));
10204     }
10205
10206   Fputhash (alias, spec, Vcoding_system_hash_table);
10207   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10208   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10209   if (NILP (val))
10210     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10211                                   Vcoding_system_alist);
10212
10213   return Qnil;
10214 }
10215
10216 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10217        1, 1, 0,
10218        doc: /* Return the base of CODING-SYSTEM.
10219 Any alias or subsidiary coding system is not a base coding system.  */)
10220   (Lisp_Object coding_system)
10221 {
10222   Lisp_Object spec, attrs;
10223
10224   if (NILP (coding_system))
10225     return (Qno_conversion);
10226   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10227   attrs = AREF (spec, 0);
10228   return CODING_ATTR_BASE_NAME (attrs);
10229 }
10230
10231 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10232        1, 1, 0,
10233        doc: "Return the property list of CODING-SYSTEM.")
10234   (Lisp_Object coding_system)
10235 {
10236   Lisp_Object spec, attrs;
10237
10238   if (NILP (coding_system))
10239     coding_system = Qno_conversion;
10240   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10241   attrs = AREF (spec, 0);
10242   return CODING_ATTR_PLIST (attrs);
10243 }
10244
10245
10246 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10247        1, 1, 0,
10248        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10249   (Lisp_Object coding_system)
10250 {
10251   Lisp_Object spec;
10252
10253   if (NILP (coding_system))
10254     coding_system = Qno_conversion;
10255   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10256   return AREF (spec, 1);
10257 }
10258
10259 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10260        Scoding_system_eol_type, 1, 1, 0,
10261        doc: /* Return eol-type of CODING-SYSTEM.
10262 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10263
10264 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10265 and CR respectively.
10266
10267 A vector value indicates that a format of end-of-line should be
10268 detected automatically.  Nth element of the vector is the subsidiary
10269 coding system whose eol-type is N.  */)
10270   (Lisp_Object coding_system)
10271 {
10272   Lisp_Object spec, eol_type;
10273   int n;
10274
10275   if (NILP (coding_system))
10276     coding_system = Qno_conversion;
10277   if (! CODING_SYSTEM_P (coding_system))
10278     return Qnil;
10279   spec = CODING_SYSTEM_SPEC (coding_system);
10280   eol_type = AREF (spec, 2);
10281   if (VECTORP (eol_type))
10282     return Fcopy_sequence (eol_type);
10283   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10284   return make_number (n);
10285 }
10286
10287 #endif /* emacs */
10288
10289 \f
10290 /*** 12. Postamble ***/
10291
10292 void
10293 init_coding_once (void)
10294 {
10295   int i;
10296
10297   for (i = 0; i < coding_category_max; i++)
10298     {
10299       coding_categories[i].id = -1;
10300       coding_priorities[i] = i;
10301     }
10302
10303   /* ISO2022 specific initialize routine.  */
10304   for (i = 0; i < 0x20; i++)
10305     iso_code_class[i] = ISO_control_0;
10306   for (i = 0x21; i < 0x7F; i++)
10307     iso_code_class[i] = ISO_graphic_plane_0;
10308   for (i = 0x80; i < 0xA0; i++)
10309     iso_code_class[i] = ISO_control_1;
10310   for (i = 0xA1; i < 0xFF; i++)
10311     iso_code_class[i] = ISO_graphic_plane_1;
10312   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10313   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10314   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10315   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10316   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10317   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10318   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10319   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10320   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10321
10322   for (i = 0; i < 256; i++)
10323     {
10324       emacs_mule_bytes[i] = 1;
10325     }
10326   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10327   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10328   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10329   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10330 }
10331
10332 #ifdef emacs
10333
10334 void
10335 syms_of_coding (void)
10336 {
10337   staticpro (&Vcoding_system_hash_table);
10338   {
10339     Lisp_Object args[2];
10340     args[0] = QCtest;
10341     args[1] = Qeq;
10342     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10343   }
10344
10345   staticpro (&Vsjis_coding_system);
10346   Vsjis_coding_system = Qnil;
10347
10348   staticpro (&Vbig5_coding_system);
10349   Vbig5_coding_system = Qnil;
10350
10351   staticpro (&Vcode_conversion_reused_workbuf);
10352   Vcode_conversion_reused_workbuf = Qnil;
10353
10354   staticpro (&Vcode_conversion_workbuf_name);
10355   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10356
10357   reused_workbuf_in_use = 0;
10358
10359   DEFSYM (Qcharset, "charset");
10360   DEFSYM (Qtarget_idx, "target-idx");
10361   DEFSYM (Qcoding_system_history, "coding-system-history");
10362   Fset (Qcoding_system_history, Qnil);
10363
10364   /* Target FILENAME is the first argument.  */
10365   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10366   /* Target FILENAME is the third argument.  */
10367   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10368
10369   DEFSYM (Qcall_process, "call-process");
10370   /* Target PROGRAM is the first argument.  */
10371   Fput (Qcall_process, Qtarget_idx, make_number (0));
10372
10373   DEFSYM (Qcall_process_region, "call-process-region");
10374   /* Target PROGRAM is the third argument.  */
10375   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10376
10377   DEFSYM (Qstart_process, "start-process");
10378   /* Target PROGRAM is the third argument.  */
10379   Fput (Qstart_process, Qtarget_idx, make_number (2));
10380
10381   DEFSYM (Qopen_network_stream, "open-network-stream");
10382   /* Target SERVICE is the fourth argument.  */
10383   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10384
10385   DEFSYM (Qcoding_system, "coding-system");
10386   DEFSYM (Qcoding_aliases, "coding-aliases");
10387
10388   DEFSYM (Qeol_type, "eol-type");
10389   DEFSYM (Qunix, "unix");
10390   DEFSYM (Qdos, "dos");
10391
10392   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10393   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10394   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10395   DEFSYM (Qdefault_char, "default-char");
10396   DEFSYM (Qundecided, "undecided");
10397   DEFSYM (Qno_conversion, "no-conversion");
10398   DEFSYM (Qraw_text, "raw-text");
10399
10400   DEFSYM (Qiso_2022, "iso-2022");
10401
10402   DEFSYM (Qutf_8, "utf-8");
10403   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10404
10405   DEFSYM (Qutf_16, "utf-16");
10406   DEFSYM (Qbig, "big");
10407   DEFSYM (Qlittle, "little");
10408
10409   DEFSYM (Qshift_jis, "shift-jis");
10410   DEFSYM (Qbig5, "big5");
10411
10412   DEFSYM (Qcoding_system_p, "coding-system-p");
10413
10414   DEFSYM (Qcoding_system_error, "coding-system-error");
10415   Fput (Qcoding_system_error, Qerror_conditions,
10416         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10417   Fput (Qcoding_system_error, Qerror_message,
10418         make_pure_c_string ("Invalid coding system"));
10419
10420   /* Intern this now in case it isn't already done.
10421      Setting this variable twice is harmless.
10422      But don't staticpro it here--that is done in alloc.c.  */
10423   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10424
10425   DEFSYM (Qtranslation_table, "translation-table");
10426   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10427   DEFSYM (Qtranslation_table_id, "translation-table-id");
10428   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10429   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10430
10431   DEFSYM (Qvalid_codes, "valid-codes");
10432
10433   DEFSYM (Qemacs_mule, "emacs-mule");
10434
10435   DEFSYM (QCcategory, ":category");
10436   DEFSYM (QCmnemonic, ":mnemonic");
10437   DEFSYM (QCdefault_char, ":default-char");
10438   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10439   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10440   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10441   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10442   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10443
10444   Vcoding_category_table
10445     = Fmake_vector (make_number (coding_category_max), Qnil);
10446   staticpro (&Vcoding_category_table);
10447   /* Followings are target of code detection.  */
10448   ASET (Vcoding_category_table, coding_category_iso_7,
10449         intern_c_string ("coding-category-iso-7"));
10450   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10451         intern_c_string ("coding-category-iso-7-tight"));
10452   ASET (Vcoding_category_table, coding_category_iso_8_1,
10453         intern_c_string ("coding-category-iso-8-1"));
10454   ASET (Vcoding_category_table, coding_category_iso_8_2,
10455         intern_c_string ("coding-category-iso-8-2"));
10456   ASET (Vcoding_category_table, coding_category_iso_7_else,
10457         intern_c_string ("coding-category-iso-7-else"));
10458   ASET (Vcoding_category_table, coding_category_iso_8_else,
10459         intern_c_string ("coding-category-iso-8-else"));
10460   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10461         intern_c_string ("coding-category-utf-8-auto"));
10462   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10463         intern_c_string ("coding-category-utf-8"));
10464   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10465         intern_c_string ("coding-category-utf-8-sig"));
10466   ASET (Vcoding_category_table, coding_category_utf_16_be,
10467         intern_c_string ("coding-category-utf-16-be"));
10468   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10469         intern_c_string ("coding-category-utf-16-auto"));
10470   ASET (Vcoding_category_table, coding_category_utf_16_le,
10471         intern_c_string ("coding-category-utf-16-le"));
10472   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10473         intern_c_string ("coding-category-utf-16-be-nosig"));
10474   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10475         intern_c_string ("coding-category-utf-16-le-nosig"));
10476   ASET (Vcoding_category_table, coding_category_charset,
10477         intern_c_string ("coding-category-charset"));
10478   ASET (Vcoding_category_table, coding_category_sjis,
10479         intern_c_string ("coding-category-sjis"));
10480   ASET (Vcoding_category_table, coding_category_big5,
10481         intern_c_string ("coding-category-big5"));
10482   ASET (Vcoding_category_table, coding_category_ccl,
10483         intern_c_string ("coding-category-ccl"));
10484   ASET (Vcoding_category_table, coding_category_emacs_mule,
10485         intern_c_string ("coding-category-emacs-mule"));
10486   /* The following are NOT targets of code detection.  */
10487   ASET (Vcoding_category_table, coding_category_raw_text,
10488         intern_c_string ("coding-category-raw-text"));
10489   ASET (Vcoding_category_table, coding_category_undecided,
10490         intern_c_string ("coding-category-undecided"));
10491
10492   DEFSYM (Qinsufficient_source, "insufficient-source");
10493   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10494   DEFSYM (Qinvalid_source, "invalid-source");
10495   DEFSYM (Qinterrupted, "interrupted");
10496   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10497   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10498
10499   defsubr (&Scoding_system_p);
10500   defsubr (&Sread_coding_system);
10501   defsubr (&Sread_non_nil_coding_system);
10502   defsubr (&Scheck_coding_system);
10503   defsubr (&Sdetect_coding_region);
10504   defsubr (&Sdetect_coding_string);
10505   defsubr (&Sfind_coding_systems_region_internal);
10506   defsubr (&Sunencodable_char_position);
10507   defsubr (&Scheck_coding_systems_region);
10508   defsubr (&Sdecode_coding_region);
10509   defsubr (&Sencode_coding_region);
10510   defsubr (&Sdecode_coding_string);
10511   defsubr (&Sencode_coding_string);
10512   defsubr (&Sdecode_sjis_char);
10513   defsubr (&Sencode_sjis_char);
10514   defsubr (&Sdecode_big5_char);
10515   defsubr (&Sencode_big5_char);
10516   defsubr (&Sset_terminal_coding_system_internal);
10517   defsubr (&Sset_safe_terminal_coding_system_internal);
10518   defsubr (&Sterminal_coding_system);
10519   defsubr (&Sset_keyboard_coding_system_internal);
10520   defsubr (&Skeyboard_coding_system);
10521   defsubr (&Sfind_operation_coding_system);
10522   defsubr (&Sset_coding_system_priority);
10523   defsubr (&Sdefine_coding_system_internal);
10524   defsubr (&Sdefine_coding_system_alias);
10525   defsubr (&Scoding_system_put);
10526   defsubr (&Scoding_system_base);
10527   defsubr (&Scoding_system_plist);
10528   defsubr (&Scoding_system_aliases);
10529   defsubr (&Scoding_system_eol_type);
10530   defsubr (&Scoding_system_priority_list);
10531
10532   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10533                doc: /* List of coding systems.
10534
10535 Do not alter the value of this variable manually.  This variable should be
10536 updated by the functions `define-coding-system' and
10537 `define-coding-system-alias'.  */);
10538   Vcoding_system_list = Qnil;
10539
10540   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10541                doc: /* Alist of coding system names.
10542 Each element is one element list of coding system name.
10543 This variable is given to `completing-read' as COLLECTION argument.
10544
10545 Do not alter the value of this variable manually.  This variable should be
10546 updated by the functions `make-coding-system' and
10547 `define-coding-system-alias'.  */);
10548   Vcoding_system_alist = Qnil;
10549
10550   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10551                doc: /* List of coding-categories (symbols) ordered by priority.
10552
10553 On detecting a coding system, Emacs tries code detection algorithms
10554 associated with each coding-category one by one in this order.  When
10555 one algorithm agrees with a byte sequence of source text, the coding
10556 system bound to the corresponding coding-category is selected.
10557
10558 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10559   {
10560     int i;
10561
10562     Vcoding_category_list = Qnil;
10563     for (i = coding_category_max - 1; i >= 0; i--)
10564       Vcoding_category_list
10565         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10566                  Vcoding_category_list);
10567   }
10568
10569   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10570                doc: /* Specify the coding system for read operations.
10571 It is useful to bind this variable with `let', but do not set it globally.
10572 If the value is a coding system, it is used for decoding on read operation.
10573 If not, an appropriate element is used from one of the coding system alists.
10574 There are three such tables: `file-coding-system-alist',
10575 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10576   Vcoding_system_for_read = Qnil;
10577
10578   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10579                doc: /* Specify the coding system for write operations.
10580 Programs bind this variable with `let', but you should not set it globally.
10581 If the value is a coding system, it is used for encoding of output,
10582 when writing it to a file and when sending it to a file or subprocess.
10583
10584 If this does not specify a coding system, an appropriate element
10585 is used from one of the coding system alists.
10586 There are three such tables: `file-coding-system-alist',
10587 `process-coding-system-alist', and `network-coding-system-alist'.
10588 For output to files, if the above procedure does not specify a coding system,
10589 the value of `buffer-file-coding-system' is used.  */);
10590   Vcoding_system_for_write = Qnil;
10591
10592   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10593                doc: /*
10594 Coding system used in the latest file or process I/O.  */);
10595   Vlast_coding_system_used = Qnil;
10596
10597   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10598                doc: /*
10599 Error status of the last code conversion.
10600
10601 When an error was detected in the last code conversion, this variable
10602 is set to one of the following symbols.
10603   `insufficient-source'
10604   `inconsistent-eol'
10605   `invalid-source'
10606   `interrupted'
10607   `insufficient-memory'
10608 When no error was detected, the value doesn't change.  So, to check
10609 the error status of a code conversion by this variable, you must
10610 explicitly set this variable to nil before performing code
10611 conversion.  */);
10612   Vlast_code_conversion_error = Qnil;
10613
10614   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10615                doc: /*
10616 *Non-nil means always inhibit code conversion of end-of-line format.
10617 See info node `Coding Systems' and info node `Text and Binary' concerning
10618 such conversion.  */);
10619   inhibit_eol_conversion = 0;
10620
10621   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10622                doc: /*
10623 Non-nil means process buffer inherits coding system of process output.
10624 Bind it to t if the process output is to be treated as if it were a file
10625 read from some filesystem.  */);
10626   inherit_process_coding_system = 0;
10627
10628   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10629                doc: /*
10630 Alist to decide a coding system to use for a file I/O operation.
10631 The format is ((PATTERN . VAL) ...),
10632 where PATTERN is a regular expression matching a file name,
10633 VAL is a coding system, a cons of coding systems, or a function symbol.
10634 If VAL is a coding system, it is used for both decoding and encoding
10635 the file contents.
10636 If VAL is a cons of coding systems, the car part is used for decoding,
10637 and the cdr part is used for encoding.
10638 If VAL is a function symbol, the function must return a coding system
10639 or a cons of coding systems which are used as above.  The function is
10640 called with an argument that is a list of the arguments with which
10641 `find-operation-coding-system' was called.  If the function can't decide
10642 a coding system, it can return `undecided' so that the normal
10643 code-detection is performed.
10644
10645 See also the function `find-operation-coding-system'
10646 and the variable `auto-coding-alist'.  */);
10647   Vfile_coding_system_alist = Qnil;
10648
10649   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10650                doc: /*
10651 Alist to decide a coding system to use for a process I/O operation.
10652 The format is ((PATTERN . VAL) ...),
10653 where PATTERN is a regular expression matching a program name,
10654 VAL is a coding system, a cons of coding systems, or a function symbol.
10655 If VAL is a coding system, it is used for both decoding what is received
10656 from the program and encoding what is sent to the program.
10657 If VAL is a cons of coding systems, the car part is used for decoding,
10658 and the cdr part is used for encoding.
10659 If VAL is a function symbol, the function must return a coding system
10660 or a cons of coding systems which are used as above.
10661
10662 See also the function `find-operation-coding-system'.  */);
10663   Vprocess_coding_system_alist = Qnil;
10664
10665   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10666                doc: /*
10667 Alist to decide a coding system to use for a network I/O operation.
10668 The format is ((PATTERN . VAL) ...),
10669 where PATTERN is a regular expression matching a network service name
10670 or is a port number to connect to,
10671 VAL is a coding system, a cons of coding systems, or a function symbol.
10672 If VAL is a coding system, it is used for both decoding what is received
10673 from the network stream and encoding what is sent to the network stream.
10674 If VAL is a cons of coding systems, the car part is used for decoding,
10675 and the cdr part is used for encoding.
10676 If VAL is a function symbol, the function must return a coding system
10677 or a cons of coding systems which are used as above.
10678
10679 See also the function `find-operation-coding-system'.  */);
10680   Vnetwork_coding_system_alist = Qnil;
10681
10682   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10683                doc: /* Coding system to use with system messages.
10684 Also used for decoding keyboard input on X Window system.  */);
10685   Vlocale_coding_system = Qnil;
10686
10687   /* The eol mnemonics are reset in startup.el system-dependently.  */
10688   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10689                doc: /*
10690 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10691   eol_mnemonic_unix = make_pure_c_string (":");
10692
10693   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10694                doc: /*
10695 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10696   eol_mnemonic_dos = make_pure_c_string ("\\");
10697
10698   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10699                doc: /*
10700 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10701   eol_mnemonic_mac = make_pure_c_string ("/");
10702
10703   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10704                doc: /*
10705 *String displayed in mode line when end-of-line format is not yet determined.  */);
10706   eol_mnemonic_undecided = make_pure_c_string (":");
10707
10708   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10709                doc: /*
10710 *Non-nil enables character translation while encoding and decoding.  */);
10711   Venable_character_translation = Qt;
10712
10713   DEFVAR_LISP ("standard-translation-table-for-decode",
10714                &Vstandard_translation_table_for_decode,
10715                doc: /* Table for translating characters while decoding.  */);
10716   Vstandard_translation_table_for_decode = Qnil;
10717
10718   DEFVAR_LISP ("standard-translation-table-for-encode",
10719                &Vstandard_translation_table_for_encode,
10720                doc: /* Table for translating characters while encoding.  */);
10721   Vstandard_translation_table_for_encode = Qnil;
10722
10723   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10724                doc: /* Alist of charsets vs revision numbers.
10725 While encoding, if a charset (car part of an element) is found,
10726 designate it with the escape sequence identifying revision (cdr part
10727 of the element).  */);
10728   Vcharset_revision_table = Qnil;
10729
10730   DEFVAR_LISP ("default-process-coding-system",
10731                &Vdefault_process_coding_system,
10732                doc: /* Cons of coding systems used for process I/O by default.
10733 The car part is used for decoding a process output,
10734 the cdr part is used for encoding a text to be sent to a process.  */);
10735   Vdefault_process_coding_system = Qnil;
10736
10737   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10738                doc: /*
10739 Table of extra Latin codes in the range 128..159 (inclusive).
10740 This is a vector of length 256.
10741 If Nth element is non-nil, the existence of code N in a file
10742 \(or output of subprocess) doesn't prevent it from being detected as
10743 a coding system of ISO 2022 variant which has a flag
10744 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10745 or reading output of a subprocess.
10746 Only 128th through 159th elements have a meaning.  */);
10747   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10748
10749   DEFVAR_LISP ("select-safe-coding-system-function",
10750                &Vselect_safe_coding_system_function,
10751                doc: /*
10752 Function to call to select safe coding system for encoding a text.
10753
10754 If set, this function is called to force a user to select a proper
10755 coding system which can encode the text in the case that a default
10756 coding system used in each operation can't encode the text.  The
10757 function should take care that the buffer is not modified while
10758 the coding system is being selected.
10759
10760 The default value is `select-safe-coding-system' (which see).  */);
10761   Vselect_safe_coding_system_function = Qnil;
10762
10763   DEFVAR_BOOL ("coding-system-require-warning",
10764                &coding_system_require_warning,
10765                doc: /* Internal use only.
10766 If non-nil, on writing a file, `select-safe-coding-system-function' is
10767 called even if `coding-system-for-write' is non-nil.  The command
10768 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10769   coding_system_require_warning = 0;
10770
10771
10772   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10773                &inhibit_iso_escape_detection,
10774                doc: /*
10775 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10776
10777 When Emacs reads text, it tries to detect how the text is encoded.
10778 This code detection is sensitive to escape sequences.  If Emacs sees
10779 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10780 of the ISO2022 encodings, and decodes text by the corresponding coding
10781 system (e.g. `iso-2022-7bit').
10782
10783 However, there may be a case that you want to read escape sequences in
10784 a file as is.  In such a case, you can set this variable to non-nil.
10785 Then the code detection will ignore any escape sequences, and no text is
10786 detected as encoded in some ISO-2022 encoding.  The result is that all
10787 escape sequences become visible in a buffer.
10788
10789 The default value is nil, and it is strongly recommended not to change
10790 it.  That is because many Emacs Lisp source files that contain
10791 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10792 in Emacs's distribution, and they won't be decoded correctly on
10793 reading if you suppress escape sequence detection.
10794
10795 The other way to read escape sequences in a file without decoding is
10796 to explicitly specify some coding system that doesn't use ISO-2022
10797 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10798   inhibit_iso_escape_detection = 0;
10799
10800   DEFVAR_BOOL ("inhibit-null-byte-detection",
10801                &inhibit_null_byte_detection,
10802                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10803 By default, Emacs treats it as binary data, and does not attempt to
10804 decode it.  The effect is as if you specified `no-conversion' for
10805 reading that text.
10806
10807 Set this to non-nil when a regular text happens to include null bytes.
10808 Examples are Index nodes of Info files and null-byte delimited output
10809 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10810 decode text as usual.  */);
10811   inhibit_null_byte_detection = 0;
10812
10813   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10814                doc: /* Char table for translating self-inserting characters.
10815 This is applied to the result of input methods, not their input.
10816 See also `keyboard-translate-table'.
10817
10818 Use of this variable for character code unification was rendered
10819 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10820 internal character representation.  */);
10821     Vtranslation_table_for_input = Qnil;
10822
10823   {
10824     Lisp_Object args[coding_arg_max];
10825     Lisp_Object plist[16];
10826     int i;
10827
10828     for (i = 0; i < coding_arg_max; i++)
10829       args[i] = Qnil;
10830
10831     plist[0] = intern_c_string (":name");
10832     plist[1] = args[coding_arg_name] = Qno_conversion;
10833     plist[2] = intern_c_string (":mnemonic");
10834     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10835     plist[4] = intern_c_string (":coding-type");
10836     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10837     plist[6] = intern_c_string (":ascii-compatible-p");
10838     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10839     plist[8] = intern_c_string (":default-char");
10840     plist[9] = args[coding_arg_default_char] = make_number (0);
10841     plist[10] = intern_c_string (":for-unibyte");
10842     plist[11] = args[coding_arg_for_unibyte] = Qt;
10843     plist[12] = intern_c_string (":docstring");
10844     plist[13] = make_pure_c_string ("Do no conversion.\n\
10845 \n\
10846 When you visit a file with this coding, the file is read into a\n\
10847 unibyte buffer as is, thus each byte of a file is treated as a\n\
10848 character.");
10849     plist[14] = intern_c_string (":eol-type");
10850     plist[15] = args[coding_arg_eol_type] = Qunix;
10851     args[coding_arg_plist] = Flist (16, plist);
10852     Fdefine_coding_system_internal (coding_arg_max, args);
10853
10854     plist[1] = args[coding_arg_name] = Qundecided;
10855     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10856     plist[5] = args[coding_arg_coding_type] = Qundecided;
10857     /* This is already set.
10858        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10859     plist[8] = intern_c_string (":charset-list");
10860     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10861     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10862     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10863     plist[15] = args[coding_arg_eol_type] = Qnil;
10864     args[coding_arg_plist] = Flist (16, plist);
10865     Fdefine_coding_system_internal (coding_arg_max, args);
10866   }
10867
10868   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10869
10870   {
10871     int i;
10872
10873     for (i = 0; i < coding_category_max; i++)
10874       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10875   }
10876 #if defined (DOS_NT)
10877   system_eol_type = Qdos;
10878 #else
10879   system_eol_type = Qunix;
10880 #endif
10881   staticpro (&system_eol_type);
10882 }
10883
10884 char *
10885 emacs_strerror (int error_number)
10886 {
10887   char *str;
10888
10889   synchronize_system_messages_locale ();
10890   str = strerror (error_number);
10891
10892   if (! NILP (Vlocale_coding_system))
10893     {
10894       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10895                                                       Vlocale_coding_system,
10896                                                       0);
10897       str = (char *) SDATA (dec);
10898     }
10899
10900   return str;
10901 }
10902
10903 #endif /* emacs */
10904