code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2016 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
  122   How text end-of-line is encoded depends on the operating system.
  123   For instance, Unix's format is just one byte of LF (line-feed) code,
  124   whereas DOS's format is a two-byte sequence of `carriage-return' and
  125   `line-feed' codes.  MacOS's format is usually one byte of
  126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce an encoded text until it
  256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
  302 Lisp_Object Vcoding_system_hash_table;
  303
  304 /* End-of-line format chosen by the system.  This is Qunix on
  305    Unix and Mac, Qdos on DOS/Windows.
  306    It affects only external encoding (i.e. output to a file or a
  307    process), not in-buffer or Lisp string encoding.  */
  308 static Lisp_Object system_eol_type;
  309
  310 #ifdef emacs
  311
  312 /* Coding-systems are handed between Emacs Lisp programs and C internal
  313    routines by the variable below (only one is declared here).  */
  314 /* Coding system to be used to encode text for terminal display when
  315    terminal coding system is nil.  */
  316 struct coding_system safe_terminal_coding;
  317
  318 #endif /* emacs */
  319
  320 /* Two special coding systems.  */
  321 static Lisp_Object Vsjis_coding_system;
  322 static Lisp_Object Vbig5_coding_system;
 323
  324 /* ISO2022 section */
  325 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
  326 #define CODING_ISO_INITIAL(coding, reg)                 \
  327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  328                      coding_attr_iso_initial),          \
  329                reg)))
  330
  331 /* CODING->safe_charsets[CHARSET_ID], or -1 if that is 255 or out of range.  */
  332 #define CODING_ISO_REQUEST(coding, charset_id)          \
  333   (((charset_id) <= (coding)->max_charset_id            \
  334     ? ((coding)->safe_charsets[charset_id] != 255       \
  335        ? (coding)->safe_charsets[charset_id]            \
  336        : -1)                                            \
  337     : -1))
  338 /* Accessors for CODING->spec.iso_2022.  CODING_ISO_INVOKED_CHARSET is -1
  339    when PLANE's invocation is negative, else that register's designation.  */
  340 #define CODING_ISO_FLAGS(coding)        \
  341   ((coding)->spec.iso_2022.flags)
  342 #define CODING_ISO_DESIGNATION(coding, reg)     \
  343   ((coding)->spec.iso_2022.current_designation[reg])
  344 #define CODING_ISO_INVOCATION(coding, plane)    \
  345   ((coding)->spec.iso_2022.current_invocation[plane])
  346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  347   ((coding)->spec.iso_2022.single_shifting)
  348 #define CODING_ISO_BOL(coding)  \
  349   ((coding)->spec.iso_2022.bol)
  350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
  352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
  353 #define CODING_ISO_CMP_STATUS(coding)   \
  354   (&(coding)->spec.iso_2022.cmp_status)
  355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
  360 /* Control characters of ISO2022.  */
  361                         /* code */      /* function */
  362 #define ISO_CODE_SO     0x0E            /* shift-out */
  363 #define ISO_CODE_SI     0x0F            /* shift-in */
  364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  365 #define ISO_CODE_ESC    0x1B            /* escape */
  366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  369
  370 /* Every 1-byte code of ISO2022 is classified into one of the
  371    following classes.  */
  372 enum iso_code_class_type
  373   {
  374     ISO_control_0,              /* Control codes in the range
  375                                    0x00..0x1F and 0x7F, except for the
  376                                    following 4 codes.  */
  377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  381     ISO_control_1,              /* Control codes in the range
  382                                    0x80..0x9F, except for the
  383                                    following 3 codes.  */
  384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  391   };
 392
  393 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
  394     `iso-flags' attribute of an iso2022 coding system.  */
  395
  396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  397    instead of the correct short-form sequence (e.g. ESC $ A).  */
  398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  399
  400 /* If set, reset graphic planes and registers at end-of-line to the
  401    initial state.  */
  402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  403
  404 /* If set, reset graphic planes and registers before any control
  405    characters to the initial state.  */
  406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  407
  408 /* If set, encode by 7-bit environment.  */
  409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  410
  411 /* If set, use locking-shift function.  */
  412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  413
  414 /* If set, use single-shift function.  Overwrite
  415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  417
  418 /* If set, use designation escape sequence.  */
  419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  420
  421 /* If set, produce revision number sequence.  */
  422 #define CODING_ISO_FLAG_REVISION        0x0080
  423
  424 /* If set, produce ISO6429's direction specifying sequence.  */
  425 #define CODING_ISO_FLAG_DIRECTION       0x0100
  426
  427 /* If set, assume designation states are reset at beginning of line on
  428    output.  */
  429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  430
  431 /* If set, designation sequence should be placed at beginning of line
  432    on output.  */
  433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  434
  435 /* If set, do not encode unsafe characters on output.  */
  436 #define CODING_ISO_FLAG_SAFE            0x0800
  437
  438 /* If set, extra latin codes (128..159) are accepted as a valid code
  439    on input.  */
  440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  441 /* NOTE(review): the flags from here on have no description here.  */
  442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  443
  444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  445
  446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  447
  448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  449
  450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
  451
  452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  453
  454 /* A character to be produced on output if encoding of the original
  455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
  458 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
  459 #define CODING_UTF_8_BOM(coding)        \
  460   ((coding)->spec.utf_8_bom)
  461
  462 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
  463 #define CODING_UTF_16_BOM(coding)       \
  464   ((coding)->spec.utf_16.bom)
  465
  466 #define CODING_UTF_16_ENDIAN(coding)    \
  467   ((coding)->spec.utf_16.endian)
  468
  469 #define CODING_UTF_16_SURROGATE(coding) \
  470   ((coding)->spec.utf_16.surrogate)
  471
  472
  473 /* CCL section: CCL entries of CODING's attribute vector.  */
  474 #define CODING_CCL_DECODER(coding)      \
  475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  476 #define CODING_CCL_ENCODER(coding)      \
  477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  478 #define CODING_CCL_VALIDS(coding)                                          \
  479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
  481 /* Index for each coding category in `coding_categories'.  The last
  482    enumerator, coding_category_max, is the number of categories.  */
  483 enum coding_category
  484   {
  485     coding_category_iso_7,
  486     coding_category_iso_7_tight,
  487     coding_category_iso_8_1,
  488     coding_category_iso_8_2,
  489     coding_category_iso_7_else,
  490     coding_category_iso_8_else,
  491     coding_category_utf_8_auto,
  492     coding_category_utf_8_nosig,
  493     coding_category_utf_8_sig,
  494     coding_category_utf_16_auto,
  495     coding_category_utf_16_be,
  496     coding_category_utf_16_le,
  497     coding_category_utf_16_be_nosig,
  498     coding_category_utf_16_le_nosig,
  499     coding_category_charset,
  500     coding_category_sjis,
  501     coding_category_big5,
  502     coding_category_ccl,
  503     coding_category_emacs_mule,
  504     /* All above are targets of code detection.  */
  505     coding_category_raw_text,
  506     coding_category_undecided,
  507     coding_category_max
  508   };
 509
  510 /* Definitions of flag bits used in detect_coding_XXXX.  */
  511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  531
  532 /* This value is returned if detect_coding_mask () finds nothing other
  533    than ASCII characters.  It is the union of all detection targets.  */
  534 #define CATEGORY_MASK_ANY               \
  535   (CATEGORY_MASK_ISO_7                  \
  536    | CATEGORY_MASK_ISO_7_TIGHT          \
  537    | CATEGORY_MASK_ISO_8_1              \
  538    | CATEGORY_MASK_ISO_8_2              \
  539    | CATEGORY_MASK_ISO_7_ELSE           \
  540    | CATEGORY_MASK_ISO_8_ELSE           \
  541    | CATEGORY_MASK_UTF_8_AUTO           \
  542    | CATEGORY_MASK_UTF_8_NOSIG          \
  543    | CATEGORY_MASK_UTF_8_SIG            \
  544    | CATEGORY_MASK_UTF_16_AUTO          \
  545    | CATEGORY_MASK_UTF_16_BE            \
  546    | CATEGORY_MASK_UTF_16_LE            \
  547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  549    | CATEGORY_MASK_CHARSET              \
  550    | CATEGORY_MASK_SJIS                 \
  551    | CATEGORY_MASK_BIG5                 \
  552    | CATEGORY_MASK_CCL                  \
  553    | CATEGORY_MASK_EMACS_MULE)
  554
  555 /* Unions of related category masks.  */
  556 #define CATEGORY_MASK_ISO_7BIT \
  557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  558
  559 #define CATEGORY_MASK_ISO_8BIT \
  560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  561
  562 #define CATEGORY_MASK_ISO_ELSE \
  563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  564
  565 #define CATEGORY_MASK_ISO_ESCAPE        \
  566   (CATEGORY_MASK_ISO_7                  \
  567    | CATEGORY_MASK_ISO_7_TIGHT          \
  568    | CATEGORY_MASK_ISO_7_ELSE           \
  569    | CATEGORY_MASK_ISO_8_ELSE)
  570
  571 #define CATEGORY_MASK_ISO       \
  572   (  CATEGORY_MASK_ISO_7BIT     \
  573      | CATEGORY_MASK_ISO_8BIT   \
  574      | CATEGORY_MASK_ISO_ELSE)
  575
  576 #define CATEGORY_MASK_UTF_16            \
  577   (CATEGORY_MASK_UTF_16_AUTO            \
  578    | CATEGORY_MASK_UTF_16_BE            \
  579    | CATEGORY_MASK_UTF_16_LE            \
  580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  582
  583 #define CATEGORY_MASK_UTF_8     \
  584   (CATEGORY_MASK_UTF_8_AUTO     \
  585    | CATEGORY_MASK_UTF_8_NOSIG  \
  586    | CATEGORY_MASK_UTF_8_SIG)
 587
  588 /* Table of coding categories (Lisp symbols).  This variable is for
  589    internal use only.  */
  590 static Lisp_Object Vcoding_category_table;
  591
  592 /* Coding categories in priority order (coding_category_max entries).  */
  593 static enum coding_category coding_priorities[coding_category_max];
  594
  595 /* The Nth element is the coding context for the coding system bound
  596    to the Nth coding category.  */
  597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
  615 /* Set ATTRS to CODING's attributes and CHARSET_LIST to their charset list.  */
  616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  617   do {                                                  \
  618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
  645 /* Safely get one byte from the source text pointed by SRC which ends
  646    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
  647    there are not enough bytes in the source, jump to 'no_more_source',
  648    first recording CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.
  649    If MULTIBYTEP and the byte is 0x80 or above: a 0xC0 or 0xC1 lead byte
  650    is combined with the next byte into one byte value; otherwise C is
  651    set to the negative value of the character code found at SRC and
  652    CODING_RESULT_INVALID_SRC is recorded.  The caller should declare
  653    and set these variables appropriately in advance:
  654         coding, src, src_base, src_end, multibytep, consumed_chars */
  655 #define ONE_MORE_BYTE(c)                                \
  656   do {                                                  \
  657     if (src == src_end)                                 \
  658       {                                                 \
  659         if (src_base < src)                             \
  660           record_conversion_result                      \
  661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  662         goto no_more_source;                            \
  663       }                                                 \
  664     c = *src++;                                         \
  665     if (multibytep && (c & 0x80))                       \
  666       {                                                 \
  667         if ((c & 0xFE) == 0xC0)                         \
  668           c = ((c & 1) << 6) | *src++;                  \
  669         else                                            \
  670           {                                             \
  671             src--;                                      \
  672             c = - string_char (src, &src, NULL);        \
  673             record_conversion_result                    \
  674               (coding, CODING_RESULT_INVALID_SRC);      \
  675           }                                             \
  676       }                                                 \
  677     consumed_chars++;                                   \
  678   } while (0)
 679
  680 /* Safely get two bytes from the source text pointed by SRC which ends
  681    at SRC_END, and set C1 and C2 to those bytes while skipping the
  682    heading multibyte characters.  If there are not enough bytes in the
  683    source, it jumps to 'no_more_source'.  If MULTIBYTEP, a two-byte
  684    sequence led by 0xC0 or 0xC1 is combined into one byte value; any
  685    other multibyte character is skipped for C1 and makes C2 -1.  The
  686    caller should declare and set these variables in advance:
  687         src, src_end, multibytep
  688    It is intended that this macro is used in detect_coding_utf_16.  */
  689
  690 #define TWO_MORE_BYTES(c1, c2)                          \
  691   do {                                                  \
  692     do {                                                \
  693       if (src == src_end)                               \
  694         goto no_more_source;                            \
  695       c1 = *src++;                                      \
  696       if (multibytep && (c1 & 0x80))                    \
  697         {                                               \
  698           if ((c1 & 0xFE) == 0xC0)                      \
  699             c1 = ((c1 & 1) << 6) | *src++;              \
  700           else                                          \
  701             {                                           \
  702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  703               c1 = -1;                                  \
  704             }                                           \
  705         }                                               \
  706     } while (c1 < 0);                                   \
  707     if (src == src_end)                                 \
  708       goto no_more_source;                              \
  709     c2 = *src++;                                        \
  710     if (multibytep && (c2 & 0x80))                      \
  711       {                                                 \
  712         if ((c2 & 0xFE) == 0xC0)                        \
  713           c2 = ((c2 & 1) << 6) | *src++;                \
  714         else                                            \
  715           c2 = -1;                                      \
  716       }                                                 \
  717   } while (0)
 718
 719
  720 /* Store a byte C in the place pointed by DST and increment DST to the
  721    next free point, and increment PRODUCED_CHARS.  The caller should
  722    assure that C is 0..127, and declare and set the variables `dst'
  723    and `produced_chars' appropriately in advance.
  724 */
  725
  726
  727 #define EMIT_ONE_ASCII_BYTE(c)  \
  728   do {                          \
  729     produced_chars++;           \
  730     *dst++ = (c);               \
  731   } while (0)
  732
  733
  734 /* Like EMIT_ONE_ASCII_BYTE, but store the two bytes C1 and C2.  */
  735
  736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  737   do {                                  \
  738     produced_chars += 2;                \
  739     *dst++ = (c1), *dst++ = (c2);       \
  740   } while (0)
 741
 742
  743 /* Store a byte C in the place pointed by DST and increment DST to the
  744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP, a
  745    byte of 0x80 or above is converted by BYTE8_TO_CHAR and the result
  746    is stored with CHAR_STRING_ADVANCE.  The caller should declare and
  747    set the variables `dst', `multibytep' and `produced_chars'.  */
  748
  749 #define EMIT_ONE_BYTE(c)                \
  750   do {                                  \
  751     produced_chars++;                   \
  752     if (multibytep)                     \
  753       {                                 \
  754         unsigned ch = (c);              \
  755         if (ch >= 0x80)                 \
  756           ch = BYTE8_TO_CHAR (ch);      \
  757         CHAR_STRING_ADVANCE (ch, dst);  \
  758       }                                 \
  759     else                                \
  760       *dst++ = (c);                     \
  761   } while (0)
  762
  763
  764 /* Like EMIT_ONE_BYTE, but emit the two bytes C1 and C2.  */
  765
  766 #define EMIT_TWO_BYTES(c1, c2)          \
  767   do {                                  \
  768     produced_chars += 2;                \
  769     if (multibytep)                     \
  770       {                                 \
  771         unsigned ch;                    \
  772                                         \
  773         ch = (c1);                      \
  774         if (ch >= 0x80)                 \
  775           ch = BYTE8_TO_CHAR (ch);      \
  776         CHAR_STRING_ADVANCE (ch, dst);  \
  777         ch = (c2);                      \
  778         if (ch >= 0x80)                 \
  779           ch = BYTE8_TO_CHAR (ch);      \
  780         CHAR_STRING_ADVANCE (ch, dst);  \
  781       }                                 \
  782     else                                \
  783       {                                 \
  784         *dst++ = (c1);                  \
  785         *dst++ = (c2);                  \
  786       }                                 \
  787   } while (0)
 788
 789
/* Emit the three bytes C1, C2 and C3 in that order by chaining
   EMIT_ONE_BYTE and EMIT_TWO_BYTES, so each byte is handled exactly as
   those macros handle it.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 795
 796
/* Emit the four bytes C1, C2, C3 and C4 in that order as two
   consecutive EMIT_TWO_BYTES.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF: the
   negated length LEN, the annotation MASK, and NCHARS.  BUF is
   advanced past the three elements, and coding->annotated is set to 1.
   The expansion uses the caller's variable `coding'.

   The expansion deliberately does not end with a semicolon, so that
   the macro behaves like a single statement and can be used as the
   body of an unbraced `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Store at BUF the head of a composition annotation covering NCHARS
   characters: the common annotation header with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES and METHOD.
   BUF is advanced past everything stored.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1109
1110
/* Store at BUF a charset annotation covering NCHARS characters: the
   common annotation header with length 4 and
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  BUF is
   advanced past everything stored.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1116
1117
/* Bitmasks for coding->eol_seen.  Each end-of-line format has its own
   bit, so several formats seen in the same text can be recorded
   together by OR-ing the masks.  */

#define EOL_SEEN_NONE   0       /* No end-of-line seen.  */
#define EOL_SEEN_LF     1       /* A lone LF was seen.  */
#define EOL_SEEN_CR     2       /* A CR not followed by LF was seen.  */
#define EOL_SEEN_CRLF   4       /* A CR LF pair was seen.  */
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify the octet C by its high bits.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  If C is a
   signed value, any negative C satisfies this test as well.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the bit pattern 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the bit pattern 110xxxxx (leads a 2-octet sequence).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the bit pattern 1110xxxx (leads a 3-octet sequence).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the bit pattern 11110xxx (leads a 4-octet sequence).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the bit pattern 111110xx (leads a 5-octet sequence).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 byte order mark, in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
/* Unlike the other detect_coding_XXX, this function counts the number
   of characters and checks the EOL format.

   Scan the source of CODING and update DETECT_INFO for the UTF-8
   categories.  Return 0, after rejecting CATEGORY_MASK_UTF_8, when a
   malformed sequence is found or when the last block ends in the
   middle of a sequence.  Otherwise return 1 and store the number of
   bytes and characters scanned in coding->detected_utf8_bytes and
   coding->detected_utf8_chars.

   ONE_MORE_BYTE is defined outside this part of the file; it is
   presumably what transfers control to the `no_more_source' label
   when the source is exhausted, and what uses `multibytep' and
   `consumed_chars' -- confirm against its definition.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  /* Start counting from the leading ASCII run already recorded in
     coding->head_ascii; those bytes are skipped below.  */
  ptrdiff_t nchars = coding->head_ascii;
  /* NOTE(review): eol_seen is accumulated below but is not written
     back to coding->eol_seen anywhere in this function -- confirm
     whether that is intended.  */
  int eol_seen = coding->eol_seen;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += nchars;

  /* NOTE(review): the length test requires strictly more than three
     bytes, so a source made of exactly the BOM is not recognized as
     having one -- confirm this is intended.  */
  if (src == coding->source     /* BOM should be at the head.  */
      && src + 3 < src_end      /* BOM is 3-byte long.  */
      && src[0] == UTF_8_BOM_1
      && src[1] == UTF_8_BOM_2
      && src[2] == UTF_8_BOM_3)
    {
      bom_found = 1;
      src += 3;
      nchars++;
    }

  /* Each iteration consumes one complete sequence of one to five
     octets.  Leaving the loop with `break' means the sequence was
     malformed.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a sequence cut
         short by the end of the source can be recognized below.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        {
          nchars++;
          if (c == '\r')
            {
              /* CR LF counts as two characters and one CRLF.  */
              if (src < src_end && *src == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  src++;
                  nchars++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
          else if (c == '\n')
            eol_seen |= EOL_SEEN_LF;
          continue;
        }
      /* A non-ASCII leading octet: every following octet must be a
         trailing octet, and the sequence is complete as soon as the
         leading octet's pattern matches the number of octets read.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          nchars++;
          continue;
        }
      break;
    }
  /* A malformed sequence was found.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ran out.  If that happened in the middle of a sequence
     and this is the last block, the text is not valid UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (nchars < src_end - coding->source)
        /* The found characters are less than source bytes, which
           means that we found a valid non-ASCII characters.  */
        detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
    }
  /* Only complete sequences are counted: src_base is the start of the
     sequence that was being read when the source ran out.  */
  coding->detected_utf8_bytes = src_base - coding->source;
  coding->detected_utf8_chars = nchars;
  return 1;
}
1261
1262
/* Decode the UTF-8 bytes of CODING's source, starting at offset
   coding->consumed, into character codes appended to coding->charbuf.
   On return coding->consumed, coding->consumed_char and
   coding->charbuf_used describe how far decoding got.

   A byte sequence that is malformed, overlong, a surrogate, or beyond
   MAX_CHAR is not decoded as a character: its first byte is stored
   alone (as itself if ASCII, else as BYTE8_TO_CHAR of it) and decoding
   resumes at the following byte.

   ONE_MORE_BYTE is defined outside this part of the file; it is
   presumably what transfers control to the `no_more_source' label
   when the source is exhausted, and what uses `multibytep' and
   updates `consumed_chars' -- confirm against its definition.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when eol_dos is set, or -1 if no
     such byte is pending.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to have no BOM, look at the
     first three bytes: they are skipped if they are exactly the BOM,
     and otherwise SRC is rewound so they are decoded normally.
     NOTE(review): only SRC is rewound here; if ONE_MORE_BYTE also
     advances consumed_chars, that count is not rewound -- confirm.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, the BOM question is settled from now on, so
     the check above is not repeated when CODING is used again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember the state at the start of this character, so that an
         invalid or incomplete sequence can be undone.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The output is full.  If a byte was already read ahead
             after a CR, step back so that it is read again later.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          /* Copy bytes without the high bit straight into charbuf,
             four per iteration, stopping at the first byte with the
             high bit set.  The margin of 6 in both buffers keeps the
             unrolled body from running off either end.  */
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Take the byte read ahead after a CR, if any, before reading a
         new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is stored negated.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-line conversion, read the byte after a CR
             now and keep it pending for the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Reject codes beyond MAX_CHAR as well as
                             overlong five-octet forms.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and store only
         its first byte; the bytes after it are decoded afresh.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

 no_more_source:
  /* Report progress only up to the start of the last character that
     was being read, so that an incomplete sequence is left
     unconsumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the upper six bits of the low 16 bits of VAL are 110110,
   i.e. VAL masked with 0xFFFF is in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the upper six bits of the low 16 bits of VAL are 110111,
   i.e. VAL masked with 0xFFFF is in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1523
1524
/* Examine the source of CODING and update DETECT_INFO for the UTF-16
   categories.  A leading BOM decides the byte order at once.  Without
   a BOM, only the two no-signature categories stay candidates, and
   each is rejected once the bytes look too varied to be text.

   TWO_MORE_BYTES is defined outside this part of the file; it is
   presumably what transfers control to the `no_more_source' label
   when the source is exhausted, and what uses `multibytep' -- confirm
   against its definition.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A final block with an odd coding->src_chars cannot be UTF-16.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* The bytes FF FE: little endian with signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* The bytes FE FF: big endian with signature.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* e[V] and o[V] are nonzero once byte value V has been seen at
         an even or odd position respectively; e_num and o_num count
         the distinct values seen so far.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* Without a BOM, none of the categories that rely on one can
         apply.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Keep scanning until every UTF-16 category is rejected or the
         source runs out.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct values at even positions rule
                 out big endian without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* Likewise for odd positions and little endian without
                 signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  /* Reached after a BOM was found, or when the source ran out.  */
  return 1;
}
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xBF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus one of composition method (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded as two bytes of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form, but in this special format:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Table indexed by the first byte of an emacs-mule sequence.  The
   entry is the total number of bytes in the sequence that starts with
   that byte; the decoder below handles the values 1 through 4 and
   aborts on anything else.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequence ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old form:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus doesn't
2142    increase the size of compiled object).  */
2143
/* Decode a composition rule of the Emacs 20 style format.  C holds
   the rule byte; it is reduced by 0xA0 in place and must then lie in
   0..80, otherwise control jumps to `invalid_code'.  The quotient and
   remainder of that value by 9 give the two reference points, where
   the value 4 is mapped to 10.  RULE is set to the encoded rule.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    c -= 0xA0;                                                  \
    if (! (0 <= c && c < 81))                                   \
      goto invalid_code;                                        \
    ref_g = (c / 9 == 4) ? 10 : c / 9;                          \
    ref_n = (c % 9 == 4) ? 10 : c % 9;                          \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2160
2161
/* Decode a composition rule of the Emacs 21 style format.  The rule
   occupies two bytes: the one already in C and the next one at SRC,
   which is read into C.  Each byte minus 0x20 must lie in 0..80,
   otherwise control jumps to `invalid_code'.  RULE is set to the
   encoded rule.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int ref_g = c - 0x20;                                       \
    int ref_n;                                                  \
                                                                \
    if (! (0 <= ref_g && ref_g < 81))                           \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    ref_n = c - 0x20;                                           \
    if (! (0 <= ref_n && ref_n < 81))                           \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2179
2180
/* Start of Emacs 21 style format.  C already holds the method byte
   (METHOD + 0xF2); the next two bytes at SRC are (BYTES + 0xA0) and
   (CHARS + 0xA0), where BYTES is the byte length of this composition
   information and CHARS is the number of characters composed by this
   composition.  BYTES must be at least 3 (exactly 4 for a relative
   composition) and CHARS must be positive and below
   MAX_COMPOSITION_COMPONENTS; otherwise control jumps to
   `invalid_code'.  On success CMP_STATUS is set up and the annotation
   header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int data_bytes, data_chars;                                         \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    data_bytes = c - 0xA0;                                              \
    if (data_bytes < 3)                                                 \
      goto invalid_code;                                                \
    if (cmp_method == COMPOSITION_RELATIVE && data_bytes != 4)          \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    data_chars = c - 0xA0;                                              \
    if (data_chars <= 0 || data_chars >= MAX_COMPOSITION_COMPONENTS)    \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = data_chars;                                    \
    cmp_status->ncomps = data_bytes - 4;                                \
    ADD_COMPOSITION_DATA (charbuf, data_chars, data_bytes, cmp_method); \
  } while (0)
2212
2213
/* Start of Emacs 20 style format for relative composition.  Reset
   CMP_STATUS for an old-form relative composition with no characters
   collected yet, and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2225
2226
/* Start of Emacs 20 style format for rule-base composition.  Reset
   CMP_STATUS for an old-form composition with rules and no characters
   collected yet, and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2238
2239
/* Read the byte that follows a composition starter into C and begin
   the matching kind of composition:
     a method byte of the Emacs 21 style -> Emacs 21 style format,
     0xFF                                -> Emacs 20 rule-base format,
     0xA0..0xBF                          -> Emacs 20 relative format.
   In the last case the byte belongs to the first component, so SRC
   is moved back to re-read it.  Any other byte jumps to
   `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *current_src = src;                     \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (COMPOSITION_RELATIVE <= c - 0xF2                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else if (0xA0 <= c && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = current_src;                                      \
      }                                                         \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2263
/* Close the composition in progress.  The annotation header starts
   CMP_STATUS->length elements before CHARBUF.  For the old form its
   NCHARS element is set to the number of characters collected; for
   the new form with a method above COMPOSITION_RELATIVE its first
   element is set to NCHARS minus the length.  The state becomes
   COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, finish it and advance CHAR_OFFSET
   by the count that emacs_mule_finish_composition returns.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state == COMPOSING_NO)                                \
      break;                                                              \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);   \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
2373       if (charbuf >= charbuf_end)
2374         {
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           ptrdiff_t offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
2545     invalid_code:
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552     }
2553
2554  no_more_source:
2555   if (cmp_status->state != COMPOSING_NO)
2556     {
2557       if (coding->mode & CODING_MODE_LAST_BLOCK)
2558         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2559       else
2560         {
2561           int i;
2562
2563           charbuf -= cmp_status->length;
2564           for (i = 0; i < cmp_status->length; i++)
2565             cmp_status->carryover[i] = charbuf[i];
2566         }
2567     }
2568   if (last_id != charset_ascii)
2569     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2570   coding->consumed_char += consumed_chars_base;
2571   coding->consumed = src_base - coding->source;
2572   coding->charbuf_used = charbuf - coding->charbuf;
2573 }
2574
2575
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the charset whose emacs-mule ID is ID.

   IDs below 0xA0 are their own single leading code; CODES[1] is then
   set to 0, meaning "no second leading code".  Larger IDs use a
   two-byte form: a prefix byte selected by the range ID falls in
   (0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4,
   0x9D for anything above), followed by ID itself.

   The expansion is a single statement with no trailing semicolon, so
   the macro can be the body of an `if' that has an `else'.  Both
   arguments are evaluated more than once and must not have side
   effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)			\
  do {								\
    if ((id) < 0xA0)						\
      (codes)[0] = (id), (codes)[1] = 0;			\
    else if ((id) < 0xE0)					\
      (codes)[0] = 0x9A, (codes)[1] = (id);			\
    else if ((id) < 0xF0)					\
      (codes)[0] = 0x9B, (codes)[1] = (id);			\
    else if ((id) < 0xF5)					\
      (codes)[0] = 0x9C, (codes)[1] = (id);			\
    else							\
      (codes)[0] = 0x9D, (codes)[1] = (id);			\
  } while (0)
2589
2590
/* Encode the elements of CODING->charbuf (characters interleaved with
   annotations) as emacs-mule bytes, writing them to
   CODING->destination starting at offset CODING->produced.

   On return CODING->produced and CODING->produced_char reflect the
   output written, and CODING_RESULT_SUCCESS has been recorded.  The
   function always returns 0.

   NOTE(review): the locals dst, dst_end, charbuf, charbuf_end,
   multibytep and produced_chars are barely referenced directly below;
   presumably the ASSURE_DESTINATION, EMIT_ONE_* and CODING_CHAR_CHARSET
   macros use them by name -- confirm before renaming any of them.  */
2591 static bool
2592 encode_coding_emacs_mule (struct coding_system *coding)
2593 {
2594   bool multibytep = coding->dst_multibyte;
2595   int *charbuf = coding->charbuf;
2596   int *charbuf_end = charbuf + coding->charbuf_used;
2597   unsigned char *dst = coding->destination + coding->produced;
2598   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount handed to ASSURE_DESTINATION before each element is
          processed.  NOTE(review): presumably an upper bound on the
          bytes emitted per iteration -- confirm.  */
2599   int safe_room = 8;
2600   ptrdiff_t produced_chars = 0;
2601   Lisp_Object attrs, charset_list;
2602   int c;
       /* Charset ID taken from the most recent charset annotation, or
          -1 when there is none or it is not in CHARSET_LIST.  */
2603   int preferred_charset_id = -1;
2604
2605   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list; if the coding
          system carries a different list, replace it in ATTRS too.  */
2606   if (! EQ (charset_list, Vemacs_mule_charset_list))
2607     {
2608       charset_list = Vemacs_mule_charset_list;
2609       ASET (attrs, coding_attr_charset_list, charset_list);
2610     }
2611
2612   while (charbuf < charbuf_end)
2613     {
2614       ASSURE_DESTINATION (safe_room);
2615       c = *charbuf++;
2616
           /* A negative element starts an annotation.  -C is the number
              of charbuf elements the annotation occupies, counting C
              itself (see the skip at the end of this branch).  */
2617       if (c < 0)
2618         {
2619           /* Handle an annotation.  */
               /* CHARBUF now points just past C; that element tells
                  which kind of annotation this is.  */
2620           switch (*charbuf)
2621             {
2622             case CODING_ANNOTATE_COMPOSITION_MASK:
2623               /* Not yet implemented.  */
2624               break;
2625             case CODING_ANNOTATE_CHARSET_MASK:
                   /* The charset ID sits three elements past the mask.
                      Ignore it unless it is a member of CHARSET_LIST.  */
2626               preferred_charset_id = charbuf[3];
2627               if (preferred_charset_id >= 0
2628                   && NILP (Fmemq (make_number (preferred_charset_id),
2629                                   charset_list)))
2630                 preferred_charset_id = -1;
2631               break;
2632             default:
2633               emacs_abort ();
2634             }
               /* Skip the remaining -C - 1 elements of the annotation.  */
2635           charbuf += -c - 1;
2636           continue;
2637         }
2638
           /* ASCII characters are emitted unchanged; an eight-bit
              character is converted back to its raw byte first.  */
2639       if (ASCII_CHAR_P (c))
2640         EMIT_ONE_ASCII_BYTE (c);
2641       else if (CHAR_BYTE8_P (c))
2642         {
2643           c = CHAR_TO_BYTE8 (c);
2644           EMIT_ONE_BYTE (c);
2645         }
2646       else
2647         {
2648           struct charset *charset;
2649           unsigned code;
2650           int dimension;
2651           int emacs_mule_id;
2652           unsigned char leading_codes[2];
2653
               /* Pick the charset and code point for C: try the
                  preferred charset first, and fall back to a search of
                  CHARSET_LIST when there is none or it cannot encode C.  */
2654           if (preferred_charset_id >= 0)
2655             {
2656               bool result;
2657
2658               charset = CHARSET_FROM_ID (preferred_charset_id);
2659               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2660               if (result)
2661                 code = ENCODE_CHAR (charset, c);
2662               else
2663                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2664                                      &code, charset);
2665             }
2666           else
2667             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2668                                  &code, charset);
               /* No charset can represent C: substitute the coding
                  system's default character.  An ASCII default is
                  emitted directly; otherwise look up its charset.
                  NOTE(review): CHARSET is not re-tested after this
                  second lookup, so CHARSET_DIMENSION below would be
                  applied to a null pointer if the default character
                  were unencodable too -- confirm that cannot happen.  */
2669           if (! charset)
2670             {
2671               c = coding->default_char;
2672               if (ASCII_CHAR_P (c))
2673                 {
2674                   EMIT_ONE_ASCII_BYTE (c);
2675                   continue;
2676                 }
2677               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2678                                    &code, charset);
2679             }
2680           dimension = CHARSET_DIMENSION (charset);
2681           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading codes (a zero second code
                  means there is none), then the code point with the
                  high bit of each byte set: one byte for a dimension-1
                  charset, two bytes otherwise.  */
2682           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2683           EMIT_ONE_BYTE (leading_codes[0]);
2684           if (leading_codes[1])
2685             EMIT_ONE_BYTE (leading_codes[1]);
2686           if (dimension == 1)
2687             EMIT_ONE_BYTE (code | 0x80);
2688           else
2689             {
2690               code |= 0x8080;
2691               EMIT_ONE_BYTE (code >> 8);
2692               EMIT_ONE_BYTE (code & 0xFF);
2693             }
2694         }
2695     }
       /* Publish the result and the new output totals.  */
2696   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2697   coding->produced_char += produced_chars;
2698   coding->produced = dst - coding->destination;
2699   return 0;
2700 }
2701
2702 \f
2703 /*** 7. ISO2022 handlers ***/
2704
2705 /* The following note describes the coding system ISO2022 briefly.
2706    Since the intention of this note is to help understand the
2707    functions in this file, some parts are NOT ACCURATE or are OVERLY
2708    SIMPLIFIED.  For thorough understanding, please refer to the
2709    original document of ISO2022.  This is equivalent to the standard
2710    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2711
2712    ISO2022 provides many mechanisms to encode several character sets
2713    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2714    is encoded using bytes less than 128.  This may make the encoded
2715    text a little bit longer, but the text passes more easily through
2716    several types of gateway, some of which strip off the MSB (Most
2717    Significant Bit).
2718
2719    There are two kinds of character sets: control character sets and
2720    graphic character sets.  The former contain control characters such
2721    as `newline' and `escape' to provide control functions (control
2722    functions are also provided by escape sequences).  The latter
2723    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2724    two control character sets and many graphic character sets.
2725
2726    Graphic character sets are classified into one of the following
2727    four classes, according to the number of bytes (DIMENSION) and
2728    number of characters in one dimension (CHARS) of the set:
2729    - DIMENSION1_CHARS94
2730    - DIMENSION1_CHARS96
2731    - DIMENSION2_CHARS94
2732    - DIMENSION2_CHARS96
2733
2734    In addition, each character set is assigned an identification tag,
2735    unique for each set, called the "final character" (denoted as <F>
2736    hereafter).  The <F> of each character set is decided by ECMA(*)
2737    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2738    (0x30..0x3F are for private use only).
2739
2740    Note (*): ECMA = European Computer Manufacturers Association
2741
2742    Here are examples of graphic character sets [NAME(<F>)]:
2743         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2744         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2745         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2746         o DIMENSION2_CHARS96 -- none for the moment
2747
2748    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2749         C0 [0x00..0x1F] -- control character plane 0
2750         GL [0x20..0x7F] -- graphic character plane 0
2751         C1 [0x80..0x9F] -- control character plane 1
2752         GR [0xA0..0xFF] -- graphic character plane 1
2753
2754    A control character set is directly designated and invoked to C0 or
2755    C1 by an escape sequence.  The most common case is that:
2756    - ISO646's  control character set is designated/invoked to C0, and
2757    - ISO6429's control character set is designated/invoked to C1,
2758    and usually these designations/invocations are omitted in encoded
2759    text.  In a 7-bit environment, only C0 can be used, and a control
2760    character for C1 is encoded by an appropriate escape sequence to
2761    fit into the environment.  All control characters for C1 are
2762    defined to have corresponding escape sequences.
2763
2764    A graphic character set is at first designated to one of four
2765    graphic registers (G0 through G3), then these graphic registers are
2766    invoked to GL or GR.  These designations and invocations can be
2767    done independently.  The most common case is that G0 is invoked to
2768    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2769    these invocations and designations are omitted in encoded text.
2770    In a 7-bit environment, only GL can be used.
2771
2772    When a graphic character set of CHARS94 is invoked to GL, codes
2773    0x20 and 0x7F of the GL area work as control characters SPACE and
2774    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2775    be used.
2776
2777    There are two ways of invocation: locking-shift and single-shift.
2778    With locking-shift, the invocation lasts until the next different
2779    invocation, whereas with single-shift, the invocation affects the
2780    following character only and doesn't affect the locking-shift
2781    state.  Invocations are done by the following control characters or
2782    escape sequences:
2783
2784    ----------------------------------------------------------------------
2785    abbrev  function                  cntrl escape seq   description
2786    ----------------------------------------------------------------------
2787    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2788    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2789    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2790    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2791    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2792    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2793    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2794    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2795    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2796    ----------------------------------------------------------------------
2797    (*) These are not used by any known coding system.
2798
2799    Control characters for these functions are defined by macros
2800    ISO_CODE_XXX in `coding.h'.
2801
2802    Designations are done by the following escape sequences:
2803    ----------------------------------------------------------------------
2804    escape sequence      description
2805    ----------------------------------------------------------------------
2806    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2807    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2808    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2809    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2810    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2811    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2812    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2813    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2814    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2815    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2816    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2817    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2818    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2819    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2820    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2821    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2822    ----------------------------------------------------------------------
2823
2824    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2825    of dimension 1, chars 94, and final character <F>, etc...
2826
2827    Note (*): Although these designations are not allowed in ISO2022,
2828    Emacs accepts them on decoding, and produces them on encoding
2829    CHARS96 character sets in a coding system which is characterized as
2830    7-bit environment, non-locking-shift, and non-single-shift.
2831
2832    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2833    '(' must be omitted.  We refer to this as "short-form" hereafter.
2834
2835    Now you may notice that there are a lot of ways of encoding the
2836    same multilingual text in ISO2022.  Actually, there exist many
2837    coding systems such as Compound Text (used in X11's inter client
2838    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2839    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2840    localized platforms), and all of these are variants of ISO2022.
2841
2842    In addition to the above, Emacs handles two more kinds of escape
2843    sequences: ISO6429's direction specification and Emacs' private
2844    sequence for specifying character composition.
2845
2846    ISO6429's direction specification takes the following form:
2847         o CSI ']'      -- end of the current direction
2848         o CSI '0' ']'  -- end of the current direction
2849         o CSI '1' ']'  -- start of left-to-right text
2850         o CSI '2' ']'  -- start of right-to-left text
2851    The control character CSI (0x9B: control sequence introducer) is
2852    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2853
2854    Character composition specification takes the following form:
2855         o ESC '0' -- start relative composition
2856         o ESC '1' -- end composition
2857         o ESC '2' -- start rule-base composition (*)
2858         o ESC '3' -- start relative composition with alternate chars  (**)
2859         o ESC '4' -- start rule-base composition with alternate chars  (**)
2860   Since these are not standard escape sequences of any ISO standard,
2861   the use of them with these meanings is restricted to Emacs only.
2862
2863   (*) This form is used only in Emacs 20.7 and older versions,
2864   but newer versions can safely decode it.
2865   (**) This form is used only in Emacs 21.1 and newer versions,
2866   and older versions can't decode it.
2867
2868   Here's a list of example usages of these composition escape
2869   sequences (categorized by `enum composition_method').
2870
2871   COMPOSITION_RELATIVE:
2872         ESC 0 CHAR [ CHAR ] ESC 1
2873   COMPOSITION_WITH_RULE:
2874         ESC 2 CHAR [ RULE CHAR ] ESC 1
2875   COMPOSITION_WITH_ALTCHARS:
2876         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2877   COMPOSITION_WITH_RULE_ALTCHARS:
2878         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2879
/* Table with one entry for each of the 256 possible byte values.
   NOTE(review): presumably gives the ISO-2022 code class of each byte
   and is filled in by initialization code elsewhere in this file --
   confirm.  */
2880 static enum iso_code_class_type iso_code_class[256];
2881
/* Return true if the charset with ID is safe for CODING, that is, if
   ID lies in the range 0 .. CODING->max_charset_id and the byte
   CODING->safe_charsets[ID] is not 255 (the value every entry starts
   with in setup_iso_safe_charsets before supported charsets are
   filled in).

   A negative ID -- such as the value a charset table lookup yields for
   an unknown charset -- is reported as not safe instead of being used
   to index before the start of the table.  ID is evaluated more than
   once and must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)		\
  (0 <= (id)					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[(id)] != 255)
2885
2886 static void
2887 setup_iso_safe_charsets (Lisp_Object attrs)
2888 {
2889   Lisp_Object charset_list, safe_charsets;
2890   Lisp_Object request;
2891   Lisp_Object reg_usage;
2892   Lisp_Object tail;
2893   EMACS_INT reg94, reg96;
2894   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2895   int max_charset_id;
2896
2897   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2898   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2899       && ! EQ (charset_list, Viso_2022_charset_list))
2900     {
2901       charset_list = Viso_2022_charset_list;
2902       ASET (attrs, coding_attr_charset_list, charset_list);
2903       ASET (attrs, coding_attr_safe_charsets, Qnil);
2904     }
2905
2906   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2907     return;
2908
2909   max_charset_id = 0;
2910   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2911     {
2912       int id = XINT (XCAR (tail));
2913       if (max_charset_id < id)
2914         max_charset_id = id;
2915     }
2916
2917   safe_charsets = make_uninit_string (max_charset_id + 1);
2918   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2919   request = AREF (attrs, coding_attr_iso_request);
2920   reg_usage = AREF (attrs, coding_attr_iso_usage);
2921   reg94 = XINT (XCAR (reg_usage));
2922   reg96 = XINT (XCDR (reg_usage));
2923
2924   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2925     {
2926       Lisp_Object id;
2927       Lisp_Object reg;
2928       struct charset *charset;
2929
2930       id = XCAR (tail);
2931       charset = CHARSET_FROM_ID (XINT (id));
2932       reg = Fcdr (Fassq (id, request));
2933       if (! NILP (reg))
2934         SSET (safe_charsets, XINT (id), XINT (reg));
2935       else if (charset->iso_chars_96)
2936         {
2937           if (reg96 < 4)
2938             SSET (safe_charsets, XINT (id), reg96);
2939         }
2940       else
2941         {
2942           if (reg94 < 4)
2943             SSET (safe_charsets, XINT (id), reg94);
2944         }
2945     }
2946   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2947 }
2948
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951    Return true if a text is encoded in one of ISO-2022 based coding
2952    systems.  */
2953
/* Scan the bytes of CODING->source, skipping the first
   CODING->head_ascii of them, and accumulate two bit masks of
   CATEGORY_MASK_ISO_* values: FOUND for categories some byte sequence
   speaks for, REJECTED for categories some byte sequence rules out.

   If the scan loop ends because REJECTED has become equal to
   CATEGORY_MASK_ISO, add CATEGORY_MASK_ISO to DETECT_INFO->rejected
   and return 0.  The code at the label no_more_source instead merges
   REJECTED, and the part of FOUND not also rejected, into DETECT_INFO
   and returns 1.

   NOTE(review): no goto to no_more_source is visible in this function
   body; presumably ONE_MORE_BYTE jumps there when the source is
   exhausted, and refers to src, src_end, multibytep and consumed_chars
   by name -- confirm.  */
2954 static bool
2955 detect_coding_iso_2022 (struct coding_system *coding,
2956                         struct coding_detection_info *detect_info)
2957 {
2958   const unsigned char *src = coding->source, *src_base = src;
2959   const unsigned char *src_end = coding->source + coding->src_bytes;
2960   bool multibytep = coding->src_multibyte;
       /* True while the byte just seen was a single-shift request.  */
2961   bool single_shifting = 0;
2962   int id;
2963   int c, c1;
2964   ptrdiff_t consumed_chars = 0;
2965   int i;
2966   int rejected = 0;
2967   int found = 0;
       /* -1 outside a composition sequence; otherwise a running count
          of the units seen since the composition started.  */
2968   int composition_count = -1;
2969
2970   detect_info->checked |= CATEGORY_MASK_ISO;
2971
       /* For each ISO category that has a coding system assigned
          (id >= 0), cache its safe-charsets table and the largest
          charset ID that table covers, for use by SAFE_CHARSET_P
          below.  The table is rebuilt first when the category asks for
          full support but does not yet use Viso_2022_charset_list.  */
2972   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2973     {
2974       struct coding_system *this = &(coding_categories[i]);
2975       Lisp_Object attrs, val;
2976
2977       if (this->id < 0)
2978         continue;
2979       attrs = CODING_ID_ATTRS (this->id);
2980       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2981           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2982         setup_iso_safe_charsets (attrs);
2983       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2984       this->max_charset_id = SCHARS (val) - 1;
2985       this->safe_charsets = SDATA (val);
2986     }
2987
2988   /* A coding system of this category is always ASCII compatible.  */
2989   src += coding->head_ascii;
2990
2991   while (rejected != CATEGORY_MASK_ISO)
2992     {
2993       src_base = src;
2994       ONE_MORE_BYTE (c);
2995       switch (c)
2996         {
2997         case ISO_CODE_ESC:
2998           if (inhibit_iso_escape_detection)
2999             break;
3000           single_shifting = 0;
3001           ONE_MORE_BYTE (c);
3002           if (c == 'N' || c == 'O')
3003             {
3004               /* ESC <Fe> for SS2 or SS3.  */
3005               single_shifting = 1;
3006               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3007             }
3008           else if (c == '1')
3009             {
3010               /* End of composition.  */
3011               if (composition_count < 0
3012                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3013                 /* Invalid */
3014                 break;
3015               composition_count = -1;
3016               found |= CATEGORY_MASK_ISO;
3017             }
3018           else if (c >= '0' && c <= '4')
3019             {
3020               /* ESC <Fp> for start/end composition.  */
3021               composition_count = 0;
3022             }
3023           else
3024             {
3025               if (c >= '(' && c <= '/')
3026                 {
3027                   /* Designation sequence for a charset of dimension 1.  */
3028                   ONE_MORE_BYTE (c1);
3029                   if (c1 < ' ' || c1 >= 0x80
3030                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3031                     {
3032                       /* Invalid designation sequence.  Just ignore.  */
3033                       if (c1 >= 0x80)
3034                         rejected |= (CATEGORY_MASK_ISO_7BIT
3035                                      | CATEGORY_MASK_ISO_7_ELSE);
3036                       break;
3037                     }
3038                 }
3039               else if (c == '$')
3040                 {
3041                   /* Designation sequence for a charset of dimension 2.  */
3042                   ONE_MORE_BYTE (c);
3043                   if (c >= '@' && c <= 'B')
3044                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other designation
                            branches, ID is not compared against 0 here
                            before it reaches SAFE_CHARSET_P below.
                            Confirm that the table entries for '@'..'B'
                            are always valid, or that SAFE_CHARSET_P
                            tolerates a negative ID.  */
3045                     id = iso_charset_table[1][0][c];
3046                   else if (c >= '(' && c <= '/')
3047                     {
3048                       ONE_MORE_BYTE (c1);
3049                       if (c1 < ' ' || c1 >= 0x80
3050                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3051                         {
3052                           /* Invalid designation sequence.  Just ignore.  */
3053                           if (c1 >= 0x80)
3054                             rejected |= (CATEGORY_MASK_ISO_7BIT
3055                                          | CATEGORY_MASK_ISO_7_ELSE);
3056                           break;
3057                         }
3058                     }
3059                   else
3060                     {
3061                       /* Invalid designation sequence.  Just ignore it.  */
3062                       if (c >= 0x80)
3063                         rejected |= (CATEGORY_MASK_ISO_7BIT
3064                                      | CATEGORY_MASK_ISO_7_ELSE);
3065                       break;
3066                     }
3067                 }
3068               else
3069                 {
3070                   /* Invalid escape sequence.  Just ignore it.  */
3071                   if (c >= 0x80)
3072                     rejected |= (CATEGORY_MASK_ISO_7BIT
3073                                  | CATEGORY_MASK_ISO_7_ELSE);
3074                   break;
3075                 }
3076
3077               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the designated charset is safe for it and
                      rejected otherwise.  */
3078               rejected |= CATEGORY_MASK_ISO_8BIT;
3079               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3080                                   id))
3081                 found |= CATEGORY_MASK_ISO_7;
3082               else
3083                 rejected |= CATEGORY_MASK_ISO_7;
3084               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3085                                   id))
3086                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3087               else
3088                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3089               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3090                                   id))
3091                 found |= CATEGORY_MASK_ISO_7_ELSE;
3092               else
3093                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3094               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3095                                   id))
3096                 found |= CATEGORY_MASK_ISO_8_ELSE;
3097               else
3098                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3099             }
3100           break;
3101
3102         case ISO_CODE_SO:
3103         case ISO_CODE_SI:
3104           /* Locking shift out/in.  */
3105           if (inhibit_iso_escape_detection)
3106             break;
3107           single_shifting = 0;
3108           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3109           break;
3110
3111         case ISO_CODE_CSI:
3112           /* Control sequence introducer.  */
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3115           found |= CATEGORY_MASK_ISO_8_ELSE;
3116           goto check_extra_latin;
3117
3118         case ISO_CODE_SS2:
3119         case ISO_CODE_SS3:
3120           /* Single shift.   */
3121           if (inhibit_iso_escape_detection)
3122             break;
3123           single_shifting = 0;
3124           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for an 8-bit
                  category whose coding system enables single shift;
                  if neither does, fall through to the extra-Latin
                  check.  */
3125           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3126               & CODING_ISO_FLAG_SINGLE_SHIFT)
3127             {
3128               found |= CATEGORY_MASK_ISO_8_1;
3129               single_shifting = 1;
3130             }
3131           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3132               & CODING_ISO_FLAG_SINGLE_SHIFT)
3133             {
3134               found |= CATEGORY_MASK_ISO_8_2;
3135               single_shifting = 1;
3136             }
3137           if (single_shifting)
3138             break;
3139           goto check_extra_latin;
3140
3141         default:
               /* NOTE(review): a negative C is skipped without
                  affecting either mask; presumably ONE_MORE_BYTE uses
                  negative values for input that is not a plain byte --
                  confirm.  */
3142           if (c < 0)
3143             continue;
3144           if (c < 0x80)
3145             {
3146               if (composition_count >= 0)
3147                 composition_count++;
3148               single_shifting = 0;
3149               break;
3150             }
3151           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3152           if (c >= 0xA0)
3153             {
3154               found |= CATEGORY_MASK_ISO_8_1;
3155               /* Check the length of succeeding codes of the range
3156                  0xA0..0xFF.  If the byte length is even, we include
3157                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3158                  only when we are not single shifting.  */
3159               if (! single_shifting
3160                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3161                 {
3162                   ptrdiff_t len = 1;
                       /* Count the run of bytes >= 0xA0, un-reading the
                          first byte that ends it.  */
3163                   while (src < src_end)
3164                     {
3165                       src_base = src;
3166                       ONE_MORE_BYTE (c);
3167                       if (c < 0xA0)
3168                         {
3169                           src = src_base;
3170                           break;
3171                         }
3172                       len++;
3173                     }
3174
                       /* An odd-length run rejects ISO_8_2 only when it
                          ended before the end of the source; a run that
                          reaches the end of the source is accepted
                          whatever its length.  */
3175                   if (len & 1 && src < src_end)
3176                     {
3177                       rejected |= CATEGORY_MASK_ISO_8_2;
3178                       if (composition_count >= 0)
3179                         composition_count += len;
3180                     }
3181                   else
3182                     {
3183                       found |= CATEGORY_MASK_ISO_8_2;
3184                       if (composition_count >= 0)
3185                         composition_count += len / 2;
3186                     }
3187                 }
3188               break;
3189             }
               /* Reached for bytes 0x80..0x9F by falling through, and
                  by goto from the CSI and SS2/SS3 cases.  Such a byte
                  is acceptable only if Vlatin_extra_code_table is a
                  vector with a non-nil entry for it; otherwise every
                  ISO category is rejected.  */
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (AREF (Vlatin_extra_code_table, c)))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204         }
3205     }
       /* The loop ended because every ISO category was rejected.  */
3206   detect_info->rejected |= CATEGORY_MASK_ISO;
3207   return 0;
3208
3209  no_more_source:
       /* Report what was learned; a category that was both found and
          rejected counts as rejected only.  */
3210   detect_info->rejected |= rejected;
3211   detect_info->found |= (found & ~rejected);
3212   return 1;
3213 }
3214
3215
/* Record in CODING the designation of a charset to graphic register
   REG.  The charset is looked up with ISO_CHARSET_TABLE from DIM,
   CHARS_96 and FINAL.

   If FINAL is outside '0'..127, or the lookup yields a negative id,
   or the charset fails SAFE_CHARSET_P for CODING, the designation of
   REG becomes -2 and CHARS_96 is set to -1.  Otherwise the charset
   id is stored as the designation of REG, except that charset_ascii
   is stored in place of charset_jisx0201_roman when
   CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208 in place of
   charset_jisx0208_1978 when CODING_ISO_FLAG_USE_OLDJIS is set.
   CHARS_96 is also set to -1 when ASCII is designated to a register
   whose previous designation was -2.

   CHARS_96 == -1 on exit means the escape sequence should be kept.
   CHARS_96 must be an lvalue; `coding' comes from the caller's
   scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int designated = -1, previous;                                      \
                                                                        \
    /* Only consult the table for a final byte in the valid range.  */  \
    if (final >= '0' && final < 128)                                    \
      designated = ISO_CHARSET_TABLE (dim, chars_96, final);            \
    if (designated < 0 || !SAFE_CHARSET_P (coding, designated))         \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    previous = CODING_ISO_DESIGNATION (coding, reg);                    \
    if (designated == charset_jisx0201_roman)                           \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          designated = charset_ascii;                                   \
      }                                                                 \
    else if (designated == charset_jisx0208_1978                        \
             && (CODING_ISO_FLAGS (coding)                              \
                 & CODING_ISO_FLAG_USE_OLDJIS))                         \
      designated = charset_jisx0208;                                    \
    CODING_ISO_DESIGNATION (coding, reg) = designated;                  \
    /* ASCII designated over an earlier invalid designation: keep the   \
       designation sequence.  */                                        \
    if (previous == -2 && designated == charset_ascii)                  \
      chars_96 = -1;                                                    \
  } while (0)
3248
3249
3250 /* Handle these composition sequences (ALT: alternate char):
3251
3252    (1) relative composition: ESC 0 CHAR ... ESC 1
3253    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257    When the start sequence (ESC 0/2/3/4) is found, this annotation
3258    header is produced.
3259
3260         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263    produced until the end sequence (ESC 1) is found:
3264
3265    (1) CHAR ... CHAR
3266    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
3270    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3271    annotation header are updated as below:
3272
3273    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3276    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3277
3278    If an error is found while composing, the annotation header is
3279    changed to:
3280
3281         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283    and the sequence [ -2 DECODED-RULE ] is changed to the original
3284    byte sequence as below:
3285         o the original byte sequence is B: [ B -1 ]
3286         o the original byte sequence is B1 B2: [ B1 B2 ]
3287    and the sequence [ -1 -1 ] is changed to the original byte
3288    sequence:
3289         [ ESC '0' ]
3290 */
3291
/* Decode the composition rule whose first byte is `c1' (a variable
   of the caller) and store the encoded rule in RULE, which must be
   an lvalue.

   A first byte below 32 is invalid.  A first byte in 32..112 is a
   complete rule in the old format (before ver.21).  Any larger first
   byte starts a rule in the new format (after ver.21); its second
   byte is fetched from the source with ONE_MORE_BYTE, and 0x100 is
   added to the result to tell it apart from the old format.

   If the rule is invalid, goto the caller's `invalid_code' label.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format: one more byte */         \
      {                                                                 \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, second - 32))   \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32);        \
        rule += 0x100;          /* mark as new format */                \
      }                                                                 \
    else                        /* old format: single byte */           \
      {                                                                 \
        int quot = (rule) / 9;                                          \
        int rem = (rule) % 9;                                           \
                                                                        \
        /* In the old format the value 4 stands for 10.  */             \
        if (quot == 4)                                                  \
          quot = 10;                                                    \
        if (rem == 4)                                                   \
          rem = 10;                                                     \
        rule = COMPOSITION_ENCODE_RULE (quot, rem);                     \
      }                                                                 \
  } while (0)
3320
/* Turn the decoded composition rule RULE (as produced by
   DECODE_COMPOSITION_RULE) back into its original byte form, storing
   it at charbuf[idx] and charbuf[idx + 1].  `charbuf', `idx' and
   `new_chars' are variables of the caller.

   A rule below 0x100 is in the old format: one byte is stored,
   followed by -1, and new_chars grows by 1.  Otherwise the rule is
   in the new format: two bytes are stored and new_chars grows by 2.

   RULE is evaluated exactly once, before either slot is written, so
   it may be any expression, including charbuf[idx + 1] itself.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    const int rule_code = (rule);                               \
    int gref = (rule_code % 0x100) / 12;                        \
    int nref = (rule_code % 0x100) % 12;                        \
                                                                \
    if (rule_code < 0x100)      /* old format */                \
      {                                                         \
        /* The old format writes 10 as 4.  */                   \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3340
3341 /* Finish the current composition as invalid.  */
3342
3343 static int
3344 finish_composition (int *charbuf, struct composition_status *cmp_status)
3345 {
3346   int idx = - cmp_status->length;
3347   int new_chars;
3348
3349   /* Recover the original ESC sequence */
3350   charbuf[idx++] = ISO_CODE_ESC;
3351   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3352                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3353                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3354                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3355                     : '4');
3356   charbuf[idx++] = -2;
3357   charbuf[idx++] = 0;
3358   charbuf[idx++] = -1;
3359   new_chars = cmp_status->nchars;
3360   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3361     for (; idx < 0; idx++)
3362       {
3363         int elt = charbuf[idx];
3364
3365         if (elt == -2)
3366           {
3367             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3368             idx++;
3369           }
3370         else if (elt == -1)
3371           {
3372             charbuf[idx++] = ISO_CODE_ESC;
3373             charbuf[idx] = '0';
3374             new_chars += 2;
3375           }
3376       }
3377   cmp_status->state = COMPOSING_NO;
3378   return new_chars;
3379 }
3380
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it with finish_composition and add the value
   it returns to char_offset.  Otherwise do nothing.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3387
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read does not start a new composition:
   it ends the alternate chars, so [ -1 -1 ] is stored and the state
   becomes COMPOSING_CHAR.

   In every other case any composition still in progress is finished
   first, and this annotation header is produced:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the alternate chars; composed chars follow.  */          \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE           \
                              : (c1) == '2' ? COMPOSITION_WITH_RULE        \
                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS    \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        /* ESC 0 and ESC 2 go straight to composed chars; ESC 3 and        \
           ESC 4 begin with alternate (component) chars.  */               \
        cmp_status->state                                                  \
          = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);     \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3428
3429
/* Handle composition end sequence ESC 1.

   The sequence is rejected when no composed char has been stored
   (cmp_status->nchars == 0), when the state is COMPOSING_CHAR under
   COMPOSITION_WITH_RULE, or when the state is not COMPOSING_CHAR
   under any other method.  In that case the composition is finished
   via MAYBE_FINISH_COMPOSITION and control goes to the caller's
   `invalid_code' label.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its (negative) length slot
   is decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by
   ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, its NCHARS slot
   receives cmp_status->nchars, char_offset advances by the same
   amount, and the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->method == COMPOSITION_WITH_RULE)               \
            == (cmp_status->state == COMPOSING_CHAR)))                  \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    {                                                                   \
      int *annotation = charbuf - cmp_status->length;                   \
      int extra_length = 0;                                             \
                                                                        \
      if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)              \
        extra_length = cmp_status->ncomps + 2;                          \
      else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)    \
        extra_length = cmp_status->ncomps * 3;                          \
      annotation[0] -= extra_length;                                    \
      annotation[2] = cmp_status->nchars;                               \
    }                                                                   \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3449
/* Append the pair [ -2 RULE ] to charbuf, advance charbuf past it,
   add 2 to cmp_status->length, and step cmp_status->state back by
   one.  `charbuf' and `cmp_status' are variables of the caller.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3459
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: length grows by one; nchars is counted when the
   state is COMPOSING_CHAR and ncomps otherwise.  The state then
   advances by one when the method is COMPOSITION_WITH_RULE, or when
   it is COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3476
3477
3478 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3479
3480 static void
3481 decode_coding_iso_2022 (struct coding_system *coding)
3482 {
3483   const unsigned char *src = coding->source + coding->consumed;
3484   const unsigned char *src_end = coding->source + coding->src_bytes;
3485   const unsigned char *src_base;
3486   int *charbuf = coding->charbuf + coding->charbuf_used;
3487   /* We may produce two annotations (charset and composition) in one
3488      loop and one more charset annotation at the end.  */
3489   int *charbuf_end
3490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3492   bool multibytep = coding->src_multibyte;
3493   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3494   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3495   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3496   int charset_id_2, charset_id_3;
3497   struct charset *charset;
3498   int c;
3499   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3500   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3501   ptrdiff_t char_offset = coding->produced_char;
3502   ptrdiff_t last_offset = char_offset;
3503   int last_id = charset_ascii;
3504   bool eol_dos
3505     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3506   int byte_after_cr = -1;
3507   int i;
3508
3509   setup_iso_safe_charsets (attrs);
3510   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3511
3512   if (cmp_status->state != COMPOSING_NO)
3513     {
3514       if (charbuf_end - charbuf < cmp_status->length)
3515         emacs_abort ();
3516       for (i = 0; i < cmp_status->length; i++)
3517         *charbuf++ = cmp_status->carryover[i];
3518       coding->annotated = 1;
3519     }
3520
3521   while (1)
3522     {
3523       int c1, c2, c3;
3524
3525       src_base = src;
3526       consumed_chars_base = consumed_chars;
3527
3528       if (charbuf >= charbuf_end)
3529         {
3530           if (byte_after_cr >= 0)
3531             src_base--;
3532           break;
3533         }
3534
3535       if (byte_after_cr >= 0)
3536         c1 = byte_after_cr, byte_after_cr = -1;
3537       else
3538         ONE_MORE_BYTE (c1);
3539       if (c1 < 0)
3540         goto invalid_code;
3541
3542       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3543         {
3544           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545           char_offset++;
3546           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3547           continue;
3548         }
3549
3550       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3551         {
3552           if (c1 == ISO_CODE_ESC)
3553             {
3554               if (src + 1 >= src_end)
3555                 goto no_more_source;
3556               *charbuf++ = ISO_CODE_ESC;
3557               char_offset++;
3558               if (src[0] == '%' && src[1] == '@')
3559                 {
3560                   src += 2;
3561                   consumed_chars += 2;
3562                   char_offset += 2;
3563                   /* We are sure charbuf can contain two more chars. */
3564                   *charbuf++ = '%';
3565                   *charbuf++ = '@';
3566                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3567                 }
3568             }
3569           else
3570             {
3571               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3572               char_offset++;
3573             }
3574           continue;
3575         }
3576
3577       if ((cmp_status->state == COMPOSING_RULE
3578            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3579           && c1 != ISO_CODE_ESC)
3580         {
3581           int rule;
3582
3583           DECODE_COMPOSITION_RULE (rule);
3584           STORE_COMPOSITION_RULE (rule);
3585           continue;
3586         }
3587
3588       /* We produce at most one character.  */
3589       switch (iso_code_class [c1])
3590         {
3591         case ISO_0x20_or_0x7F:
3592           if (charset_id_0 < 0
3593               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3594             /* This is SPACE or DEL.  */
3595             charset = CHARSET_FROM_ID (charset_ascii);
3596           else
3597             charset = CHARSET_FROM_ID (charset_id_0);
3598           break;
3599
3600         case ISO_graphic_plane_0:
3601           if (charset_id_0 < 0)
3602             charset = CHARSET_FROM_ID (charset_ascii);
3603           else
3604             charset = CHARSET_FROM_ID (charset_id_0);
3605           break;
3606
3607         case ISO_0xA0_or_0xFF:
3608           if (charset_id_1 < 0
3609               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3610               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3611             goto invalid_code;
3612           /* This is a graphic character, we fall down ... */
3613
3614         case ISO_graphic_plane_1:
3615           if (charset_id_1 < 0)
3616             goto invalid_code;
3617           charset = CHARSET_FROM_ID (charset_id_1);
3618           break;
3619
3620         case ISO_control_0:
3621           if (eol_dos && c1 == '\r')
3622             ONE_MORE_BYTE (byte_after_cr);
3623           MAYBE_FINISH_COMPOSITION ();
3624           charset = CHARSET_FROM_ID (charset_ascii);
3625           break;
3626
3627         case ISO_control_1:
3628           goto invalid_code;
3629
3630         case ISO_shift_out:
3631           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3632               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3633             goto invalid_code;
3634           CODING_ISO_INVOCATION (coding, 0) = 1;
3635           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3636           continue;
3637
3638         case ISO_shift_in:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3640             goto invalid_code;
3641           CODING_ISO_INVOCATION (coding, 0) = 0;
3642           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3643           continue;
3644
3645         case ISO_single_shift_2_7:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3647             goto invalid_code;
3648         case ISO_single_shift_2:
3649           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3650             goto invalid_code;
3651           /* SS2 is handled as an escape sequence of ESC 'N' */
3652           c1 = 'N';
3653           goto label_escape_sequence;
3654
3655         case ISO_single_shift_3:
3656           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657             goto invalid_code;
3658           /* SS2 is handled as an escape sequence of ESC 'O' */
3659           c1 = 'O';
3660           goto label_escape_sequence;
3661
3662         case ISO_control_sequence_introducer:
3663           /* CSI is handled as an escape sequence of ESC '[' ...  */
3664           c1 = '[';
3665           goto label_escape_sequence;
3666
3667         case ISO_escape:
3668           ONE_MORE_BYTE (c1);
3669         label_escape_sequence:
3670           /* Escape sequences handled here are invocation,
3671              designation, direction specification, and character
3672              composition specification.  */
3673           switch (c1)
3674             {
3675             case '&':           /* revision of following character set */
3676               ONE_MORE_BYTE (c1);
3677               if (!(c1 >= '@' && c1 <= '~'))
3678                 goto invalid_code;
3679               ONE_MORE_BYTE (c1);
3680               if (c1 != ISO_CODE_ESC)
3681                 goto invalid_code;
3682               ONE_MORE_BYTE (c1);
3683               goto label_escape_sequence;
3684
3685             case '$':           /* designation of 2-byte character set */
3686               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3687                 goto invalid_code;
3688               {
3689                 int reg, chars96;
3690
3691                 ONE_MORE_BYTE (c1);
3692                 if (c1 >= '@' && c1 <= 'B')
3693                   {     /* designation of JISX0208.1978, GB2312.1980,
3694                            or JISX0208.1980 */
3695                     reg = 0, chars96 = 0;
3696                   }
3697                 else if (c1 >= 0x28 && c1 <= 0x2B)
3698                   { /* designation of DIMENSION2_CHARS94 character set */
3699                     reg = c1 - 0x28, chars96 = 0;
3700                     ONE_MORE_BYTE (c1);
3701                   }
3702                 else if (c1 >= 0x2C && c1 <= 0x2F)
3703                   { /* designation of DIMENSION2_CHARS96 character set */
3704                     reg = c1 - 0x2C, chars96 = 1;
3705                     ONE_MORE_BYTE (c1);
3706                   }
3707                 else
3708                   goto invalid_code;
3709                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3710                 /* We must update these variables now.  */
3711                 if (reg == 0)
3712                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713                 else if (reg == 1)
3714                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3715                 if (chars96 < 0)
3716                   goto invalid_code;
3717               }
3718               continue;
3719
3720             case 'n':           /* invocation of locking-shift-2 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3723                 goto invalid_code;
3724               CODING_ISO_INVOCATION (coding, 0) = 2;
3725               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3726               continue;
3727
3728             case 'o':           /* invocation of locking-shift-3 */
3729               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3731                 goto invalid_code;
3732               CODING_ISO_INVOCATION (coding, 0) = 3;
3733               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734               continue;
3735
3736             case 'N':           /* invocation of single-shift-2 */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3738                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3739                 goto invalid_code;
3740               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3741               if (charset_id_2 < 0)
3742                 charset = CHARSET_FROM_ID (charset_ascii);
3743               else
3744                 charset = CHARSET_FROM_ID (charset_id_2);
3745               ONE_MORE_BYTE (c1);
3746               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3747                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3748                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3749                           ? c1 >= 0x80 : c1 < 0x80)))
3750                 goto invalid_code;
3751               break;
3752
3753             case 'O':           /* invocation of single-shift-3 */
3754               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3755                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3756                 goto invalid_code;
3757               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3758               if (charset_id_3 < 0)
3759                 charset = CHARSET_FROM_ID (charset_ascii);
3760               else
3761                 charset = CHARSET_FROM_ID (charset_id_3);
3762               ONE_MORE_BYTE (c1);
3763               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3764                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3765                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3766                           ? c1 >= 0x80 : c1 < 0x80)))
3767                 goto invalid_code;
3768               break;
3769
3770             case '0': case '2': case '3': case '4': /* start composition */
3771               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3772                 goto invalid_code;
3773               if (last_id != charset_ascii)
3774                 {
3775                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3776                   last_id = charset_ascii;
3777                   last_offset = char_offset;
3778                 }
3779               DECODE_COMPOSITION_START (c1);
3780               continue;
3781
3782             case '1':           /* end composition */
3783               if (cmp_status->state == COMPOSING_NO)
3784                 goto invalid_code;
3785               DECODE_COMPOSITION_END ();
3786               continue;
3787
3788             case '[':           /* specification of direction */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3790                 goto invalid_code;
3791               /* For the moment, nested direction is not supported.
3792                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3793                  left-to-right, and nonzero means right-to-left.  */
3794               ONE_MORE_BYTE (c1);
3795               switch (c1)
3796                 {
3797                 case ']':       /* end of the current direction */
3798                   coding->mode &= ~CODING_MODE_DIRECTION;
3799
3800                 case '0':       /* end of the current direction */
3801                 case '1':       /* start of left-to-right direction */
3802                   ONE_MORE_BYTE (c1);
3803                   if (c1 == ']')
3804                     coding->mode &= ~CODING_MODE_DIRECTION;
3805                   else
3806                     goto invalid_code;
3807                   break;
3808
3809                 case '2':       /* start of right-to-left direction */
3810                   ONE_MORE_BYTE (c1);
3811                   if (c1 == ']')
3812                     coding->mode |= CODING_MODE_DIRECTION;
3813                   else
3814                     goto invalid_code;
3815                   break;
3816
3817                 default:
3818                   goto invalid_code;
3819                 }
3820               continue;
3821
3822             case '%':
3823               ONE_MORE_BYTE (c1);
3824               if (c1 == '/')
3825                 {
3826                   /* CTEXT extended segment:
3827                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3828                      We keep these bytes as is for the moment.
3829                      They may be decoded by post-read-conversion.  */
3830                   int dim, M, L;
3831                   int size;
3832
3833                   ONE_MORE_BYTE (dim);
3834                   if (dim < '0' || dim > '4')
3835                     goto invalid_code;
3836                   ONE_MORE_BYTE (M);
3837                   if (M < 128)
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (L);
3840                   if (L < 128)
3841                     goto invalid_code;
3842                   size = ((M - 128) * 128) + (L - 128);
3843                   if (charbuf + 6 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = '/';
3848                   *charbuf++ = dim;
3849                   *charbuf++ = BYTE8_TO_CHAR (M);
3850                   *charbuf++ = BYTE8_TO_CHAR (L);
3851                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3852                 }
3853               else if (c1 == 'G')
3854                 {
3855                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3856                      ESC % G --UTF-8-BYTES-- ESC % @
3857                      We keep these bytes as is for the moment.
3858                      They may be decoded by post-read-conversion.  */
3859                   if (charbuf + 3 > charbuf_end)
3860                     goto break_loop;
3861                   *charbuf++ = ISO_CODE_ESC;
3862                   *charbuf++ = '%';
3863                   *charbuf++ = 'G';
3864                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3865                 }
3866               else
3867                 goto invalid_code;
3868               continue;
3869               break;
3870
3871             default:
3872               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3873                 goto invalid_code;
3874               {
3875                 int reg, chars96;
3876
3877                 if (c1 >= 0x28 && c1 <= 0x2B)
3878                   { /* designation of DIMENSION1_CHARS94 character set */
3879                     reg = c1 - 0x28, chars96 = 0;
3880                     ONE_MORE_BYTE (c1);
3881                   }
3882                 else if (c1 >= 0x2C && c1 <= 0x2F)
3883                   { /* designation of DIMENSION1_CHARS96 character set */
3884                     reg = c1 - 0x2C, chars96 = 1;
3885                     ONE_MORE_BYTE (c1);
3886                   }
3887                 else
3888                   goto invalid_code;
3889                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3890                 /* We must update these variables now.  */
3891                 if (reg == 0)
3892                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3893                 else if (reg == 1)
3894                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3895                 if (chars96 < 0)
3896                   goto invalid_code;
3897               }
3898               continue;
3899             }
3900           break;
3901
3902         default:
3903           emacs_abort ();
3904         }
3905
3906       if (cmp_status->state == COMPOSING_NO
3907           && charset->id != charset_ascii
3908           && last_id != charset->id)
3909         {
3910           if (last_id != charset_ascii)
3911             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912           last_id = charset->id;
3913           last_offset = char_offset;
3914         }
3915
3916       /* Now we know CHARSET and 1st position code C1 of a character.
3917          Produce a decoded character while getting 2nd and 3rd
3918          position codes C2, C3 if necessary.  */
3919       if (CHARSET_DIMENSION (charset) > 1)
3920         {
3921           ONE_MORE_BYTE (c2);
3922           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3923               || ((c1 & 0x80) != (c2 & 0x80)))
3924             /* C2 is not in a valid range.  */
3925             goto invalid_code;
3926           if (CHARSET_DIMENSION (charset) == 2)
3927             c1 = (c1 << 8) | c2;
3928           else
3929             {
3930               ONE_MORE_BYTE (c3);
3931               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3932                   || ((c1 & 0x80) != (c3 & 0x80)))
3933                 /* C3 is not in a valid range.  */
3934                 goto invalid_code;
3935               c1 = (c1 << 16) | (c2 << 8) | c2;
3936             }
3937         }
3938       c1 &= 0x7F7F7F;
3939       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3940       if (c < 0)
3941         {
3942           MAYBE_FINISH_COMPOSITION ();
3943           for (; src_base < src; src_base++, char_offset++)
3944             {
3945               if (ASCII_CHAR_P (*src_base))
3946                 *charbuf++ = *src_base;
3947               else
3948                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3949             }
3950         }
3951       else if (cmp_status->state == COMPOSING_NO)
3952         {
3953           *charbuf++ = c;
3954           char_offset++;
3955         }
3956       else if ((cmp_status->state == COMPOSING_CHAR
3957                 ? cmp_status->nchars
3958                 : cmp_status->ncomps)
3959                >= MAX_COMPOSITION_COMPONENTS)
3960         {
3961           /* Too long composition.  */
3962           MAYBE_FINISH_COMPOSITION ();
3963           *charbuf++ = c;
3964           char_offset++;
3965         }
3966       else
3967         STORE_COMPOSITION_CHAR (c);
3968       continue;
3969
3970     invalid_code:
3971       MAYBE_FINISH_COMPOSITION ();
3972       src = src_base;
3973       consumed_chars = consumed_chars_base;
3974       ONE_MORE_BYTE (c);
3975       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3976       char_offset++;
3977       /* Reset the invocation and designation status to the safest
3978          one; i.e. designate ASCII to the graphic register 0, and
3979          invoke that register to the graphic plane 0.  This typically
3980          helps the case that an designation sequence for ASCII "ESC (
3981          B" is somehow broken (e.g. broken by a newline).  */
3982       CODING_ISO_INVOCATION (coding, 0) = 0;
3983       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3984       charset_id_0 = charset_ascii;
3985       continue;
3986
3987     break_loop:
3988       break;
3989     }
3990
3991  no_more_source:
3992   if (cmp_status->state != COMPOSING_NO)
3993     {
3994       if (coding->mode & CODING_MODE_LAST_BLOCK)
3995         MAYBE_FINISH_COMPOSITION ();
3996       else
3997         {
3998           charbuf -= cmp_status->length;
3999           for (i = 0; i < cmp_status->length; i++)
4000             cmp_status->carryover[i] = charbuf[i];
4001         }
4002     }
4003   else if (last_id != charset_ascii)
4004     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4005   coding->consumed_char += consumed_chars_base;
4006   coding->consumed = src_base - coding->source;
4007   coding->charbuf_used = charbuf - coding->charbuf;
4008 }
4009
4010
4011 /* ISO2022 encoding stuff.  */
4012
4013 /*
4014    For encoding, it is not enough to say just "ISO2022"; we have to
4015    specify more details.  In Emacs, each coding system of ISO2022
4016    variant has the following specifications:
4017         1. Initial designation to G0 thru G3.
4018         2. Allows short-form designation?
4019         3. ASCII should be designated to G0 before control characters?
4020         4. ASCII should be designated to G0 at end of line?
4021         5. 7-bit environment or 8-bit environment?
4022         6. Use locking-shift?
4023         7. Use Single-shift?
4024    And the following two are only for Japanese:
4025         8. Use ASCII in place of JIS0201-1976-Roman?
4026         9. Use JISX0208-1983 in place of JISX0208-1978?
4027    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4028    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4029    details.
4030 */
4031
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has
   a non-negative revision number, the sequence is preceded by
   "ESC & <'@' + revision>".  For a multi-dimensional 94-character set
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   the intermediate character is omitted (short form) unless CODING
   has CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int revision = -1;                                                  \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-character set may use the short form.  */          \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          EMIT_ONE_ASCII_BYTE (intermediates[reg]);                     \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4079
4080
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) and mark CODING as being in
   the single-shifting state.  When CODING_ISO_FLAG_SEVEN_BITS is set
   the function is written as an escape sequence; otherwise it is
   written as a single control byte.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4093
4094
/* Emit single-shift-3: "ESC O" when CODING_ISO_FLAG_SEVEN_BITS is set,
   otherwise the single byte ISO_CODE_SS3.  Marks CODING as being in
   the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4103
4104
/* The following four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3), either
   as a control character or as an escape sequence, and record in
   CODING which graphic register is now invoked to graphic plane 0.  */

/* Shift-in: graphic register 0 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4114
4115
/* Shift-out: graphic register 1 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4121
4122
/* Locking-shift-2 ("ESC n"): graphic register 2 is invoked to graphic
   plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4128
4129
/* Locking-shift-3 ("ESC o"): graphic register 3 is invoked to graphic
   plane 0.  The final byte must be 'o'; "ESC n" is locking-shift-2,
   and emitting it here would make the byte stream disagree with the
   invocation state recorded in CODING.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4135
4136
/* Emit the one-byte character of CHARSET whose position-code is C1.
   If CHARSET is not currently invoked to graphic plane 0 or 1 (and no
   single-shift is pending), the necessary designation and invocation
   sequences are emitted first.  CHARSET must be an lvalue: it is
   replaced by JISX0201-Roman when it is ASCII and CODING has
   CODING_ISO_FLAG_USE_ROMAN set.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_ascii                                             \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_ONE_BYTE (c1 | 0x80);                                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet: invoke     \
           it (designating it to a graphic register first when         \
           needed), then go around the loop again to emit the          \
           character itself.  */                                        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* A single-shift was just emitted; it covers this one         \
           character only.  */                                          \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
        else                                                            \
          EMIT_ONE_BYTE (c1 | 0x80);                                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
  } while (1)
4179
4180
/* Emit the two-byte character of CHARSET whose position-codes are C1
   and C2.  If CHARSET is not currently invoked to graphic plane 0 or
   1 (and no single-shift is pending), the necessary designation and
   invocation sequences are emitted first.  CHARSET must be an lvalue:
   it is replaced by JISX0208-1978 when it is JISX0208 and CODING has
   CODING_ISO_FLAG_USE_OLDJIS set.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet: invoke     \
           it (designating it to a graphic register first when         \
           needed), then go around the loop again to emit the          \
           character itself.  */                                        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* A single-shift was just emitted; it covers this one         \
           character only.  */                                          \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
  } while (1)
4223
4224
/* Encode character C of CHARSET.  The code point is obtained with
   CODING_ENCODE_CHAR.  A CHARSET of dimension 1 is emitted through
   ENCODE_ISO_CHARACTER_DIMENSION1; any other CHARSET is emitted
   through ENCODE_ISO_CHARACTER_DIMENSION2 with CODE >> 8 and
   CODE & 0xFF as the two position-codes.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4235
4236
4237 /* Produce designation and invocation codes at a place pointed by DST
4238    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4239    Return new DST.  */
4240
4241 static unsigned char *
4242 encode_invocation_designation (struct charset *charset,
4243                                struct coding_system *coding,
4244                                unsigned char *dst, ptrdiff_t *p_nchars)
4245 {
4246   bool multibytep = coding->dst_multibyte;
4247   ptrdiff_t produced_chars = *p_nchars;
4248   int reg;                      /* graphic register number */
4249   int id = CHARSET_ID (charset);
4250
4251   /* At first, check designations.  */
4252   for (reg = 0; reg < 4; reg++)
4253     if (id == CODING_ISO_DESIGNATION (coding, reg))
4254       break;
4255
4256   if (reg >= 4)
4257     {
4258       /* CHARSET is not yet designated to any graphic registers.  */
4259       /* At first check the requested designation.  */
4260       reg = CODING_ISO_REQUEST (coding, id);
4261       if (reg < 0)
4262         /* Since CHARSET requests no special designation, designate it
4263            to graphic register 0.  */
4264         reg = 0;
4265
4266       ENCODE_DESIGNATION (charset, reg, coding);
4267     }
4268
4269   if (CODING_ISO_INVOCATION (coding, 0) != reg
4270       && CODING_ISO_INVOCATION (coding, 1) != reg)
4271     {
4272       /* Since the graphic register REG is not invoked to any graphic
4273          planes, invoke it to graphic plane 0.  */
4274       switch (reg)
4275         {
4276         case 0:                 /* graphic register 0 */
4277           ENCODE_SHIFT_IN;
4278           break;
4279
4280         case 1:                 /* graphic register 1 */
4281           ENCODE_SHIFT_OUT;
4282           break;
4283
4284         case 2:                 /* graphic register 2 */
4285           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4286             ENCODE_SINGLE_SHIFT_2;
4287           else
4288             ENCODE_LOCKING_SHIFT_2;
4289           break;
4290
4291         case 3:                 /* graphic register 3 */
4292           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4293             ENCODE_SINGLE_SHIFT_3;
4294           else
4295             ENCODE_LOCKING_SHIFT_3;
4296           break;
4297
4298         default:
4299           break;
4300         }
4301     }
4302
4303   *p_nchars = produced_chars;
4304   return dst;
4305 }
4306
4307
/* Emit the codes that bring the graphic plane and registers back to
   their initial state: a shift-in when graphic plane 0 does not have
   graphic register 0 invoked, then a designation for every graphic
   register that has an initial charset (CODING_ISO_INITIAL >= 0)
   different from the one currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* Skip registers with no initial charset and registers        \
           that already hold their initial charset.  */                 \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4326
4327
4328 /* Produce designation sequences of charsets in the line started from
4329    CHARBUF to a place pointed by DST, and return the number of
4330    produced bytes.  DST should not directly point a buffer text area
4331    which may be relocated by char_charset call.
4332
4333    If the current block ends before any end-of-line, we may fail to
4334    find all the necessary designations.  */
4335
4336 static ptrdiff_t
4337 encode_designation_at_bol (struct coding_system *coding,
4338                            int *charbuf, int *charbuf_end,
4339                            unsigned char *dst)
4340 {
4341   unsigned char *orig = dst;
4342   struct charset *charset;
4343   /* Table of charsets to be designated to each graphic register.  */
4344   int r[4];
4345   int c, found = 0, reg;
4346   ptrdiff_t produced_chars = 0;
4347   bool multibytep = coding->dst_multibyte;
4348   Lisp_Object attrs;
4349   Lisp_Object charset_list;
4350
4351   attrs = CODING_ID_ATTRS (coding->id);
4352   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4353   if (EQ (charset_list, Qiso_2022))
4354     charset_list = Viso_2022_charset_list;
4355
4356   for (reg = 0; reg < 4; reg++)
4357     r[reg] = -1;
4358
4359   while (charbuf < charbuf_end && found < 4)
4360     {
4361       int id;
4362
4363       c = *charbuf++;
4364       if (c == '\n')
4365         break;
4366       charset = char_charset (c, charset_list, NULL);
4367       id = CHARSET_ID (charset);
4368       reg = CODING_ISO_REQUEST (coding, id);
4369       if (reg >= 0 && r[reg] < 0)
4370         {
4371           found++;
4372           r[reg] = id;
4373         }
4374     }
4375
4376   if (found)
4377     {
4378       for (reg = 0; reg < 4; reg++)
4379         if (r[reg] >= 0
4380             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4381           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4382     }
4383
4384   return dst - orig;
4385 }
4386
4387 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4388
4389 static bool
4390 encode_coding_iso_2022 (struct coding_system *coding)
4391 {
4392   bool multibytep = coding->dst_multibyte;
4393   int *charbuf = coding->charbuf;
4394   int *charbuf_end = charbuf + coding->charbuf_used;
4395   unsigned char *dst = coding->destination + coding->produced;
4396   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4397   int safe_room = 16;
4398   bool bol_designation
4399     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4400        && CODING_ISO_BOL (coding));
4401   ptrdiff_t produced_chars = 0;
4402   Lisp_Object attrs, eol_type, charset_list;
4403   bool ascii_compatible;
4404   int c;
4405   int preferred_charset_id = -1;
4406
4407   CODING_GET_INFO (coding, attrs, charset_list);
4408   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4409   if (VECTORP (eol_type))
4410     eol_type = Qunix;
4411
4412   setup_iso_safe_charsets (attrs);
4413   /* Charset list may have been changed.  */
4414   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4415   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4416
4417   ascii_compatible
4418     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4419        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4420                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4421
4422   while (charbuf < charbuf_end)
4423     {
4424       ASSURE_DESTINATION (safe_room);
4425
4426       if (bol_designation)
4427         {
4428           /* We have to produce designation sequences if any now.  */
4429           unsigned char desig_buf[16];
4430           ptrdiff_t nbytes;
4431           ptrdiff_t offset;
4432
4433           charset_map_loaded = 0;
4434           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4435                                               desig_buf);
4436           if (charset_map_loaded
4437               && (offset = coding_change_destination (coding)))
4438             {
4439               dst += offset;
4440               dst_end += offset;
4441             }
4442           memcpy (dst, desig_buf, nbytes);
4443           dst += nbytes;
4444           /* We are sure that designation sequences are all ASCII bytes.  */
4445           produced_chars += nbytes;
4446           bol_designation = 0;
4447           ASSURE_DESTINATION (safe_room);
4448         }
4449
4450       c = *charbuf++;
4451
4452       if (c < 0)
4453         {
4454           /* Handle an annotation.  */
4455           switch (*charbuf)
4456             {
4457             case CODING_ANNOTATE_COMPOSITION_MASK:
4458               /* Not yet implemented.  */
4459               break;
4460             case CODING_ANNOTATE_CHARSET_MASK:
4461               preferred_charset_id = charbuf[2];
4462               if (preferred_charset_id >= 0
4463                   && NILP (Fmemq (make_number (preferred_charset_id),
4464                                   charset_list)))
4465                 preferred_charset_id = -1;
4466               break;
4467             default:
4468               emacs_abort ();
4469             }
4470           charbuf += -c - 1;
4471           continue;
4472         }
4473
4474       /* Now encode the character C.  */
4475       if (c < 0x20 || c == 0x7F)
4476         {
4477           if (c == '\n'
4478               || (c == '\r' && EQ (eol_type, Qmac)))
4479             {
4480               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4481                 ENCODE_RESET_PLANE_AND_REGISTER ();
4482               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4483                 {
4484                   int i;
4485
4486                   for (i = 0; i < 4; i++)
4487                     CODING_ISO_DESIGNATION (coding, i)
4488                       = CODING_ISO_INITIAL (coding, i);
4489                 }
4490               bol_designation = ((CODING_ISO_FLAGS (coding)
4491                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4492                                  != 0);
4493             }
4494           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4495             ENCODE_RESET_PLANE_AND_REGISTER ();
4496           EMIT_ONE_ASCII_BYTE (c);
4497         }
4498       else if (ASCII_CHAR_P (c))
4499         {
4500           if (ascii_compatible)
4501             EMIT_ONE_ASCII_BYTE (c);
4502           else
4503             {
4504               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4505               ENCODE_ISO_CHARACTER (charset, c);
4506             }
4507         }
4508       else if (CHAR_BYTE8_P (c))
4509         {
4510           c = CHAR_TO_BYTE8 (c);
4511           EMIT_ONE_BYTE (c);
4512         }
4513       else
4514         {
4515           struct charset *charset;
4516
4517           if (preferred_charset_id >= 0)
4518             {
4519               bool result;
4520
4521               charset = CHARSET_FROM_ID (preferred_charset_id);
4522               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4523               if (! result)
4524                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4525                                      NULL, charset);
4526             }
4527           else
4528             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4529                                  NULL, charset);
4530           if (!charset)
4531             {
4532               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4533                 {
4534                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4535                   charset = CHARSET_FROM_ID (charset_ascii);
4536                 }
4537               else
4538                 {
4539                   c = coding->default_char;
4540                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4541                                        charset_list, NULL, charset);
4542                 }
4543             }
4544           ENCODE_ISO_CHARACTER (charset, c);
4545         }
4546     }
4547
4548   if (coding->mode & CODING_MODE_LAST_BLOCK
4549       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4550     {
4551       ASSURE_DESTINATION (safe_room);
4552       ENCODE_RESET_PLANE_AND_REGISTER ();
4553     }
4554   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4555   CODING_ISO_BOL (coding) = bol_designation;
4556   coding->produced_char += produced_chars;
4557   coding->produced = dst - coding->destination;
4558   return 0;
4559 }
4560
4561 \f
4562 /*** 8. SJIS and BIG5 handlers ***/
4563
4564 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4565    quite widely.  So, for the moment, Emacs supports them in the bare
4566    C code.  But, in the future, they may be supported only by CCL.  */
4567
4568 /* SJIS is a coding system encoding three character sets: ASCII, right
4569    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4570    as is.  A character of charset katakana-jisx0201 is encoded by
4571    "position-code + 0x80".  A character of charset japanese-jisx0208
4572    is encoded in 2 bytes, but the two position-codes are divided and shifted
4573    so that they fit in the ranges below.
4574
4575    --- CODE RANGE of SJIS ---
4576    (character set)      (range)
4577    ASCII                0x00 .. 0x7F
4578    KATAKANA-JISX0201    0xA0 .. 0xDF
4579    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4580             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4581    -------------------------------
4582
4583 */
4584
4585 /* BIG5 is a coding system encoding two character sets: ASCII and
4586    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4587    character set and is encoded in two bytes.
4588
4589    --- CODE RANGE of BIG5 ---
4590    (character set)      (range)
4591    ASCII                0x00 .. 0x7F
4592    Big5 (1st byte)      0xA1 .. 0xFE
4593         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4594    --------------------------
4595
4596   */
4597
4598 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4599    Return true if a text is encoded in SJIS.  */
4600
4601 static bool
4602 detect_coding_sjis (struct coding_system *coding,
4603                     struct coding_detection_info *detect_info)
4604 {
4605   const unsigned char *src = coding->source, *src_base;
4606   const unsigned char *src_end = coding->source + coding->src_bytes;
4607   bool multibytep = coding->src_multibyte;
4608   ptrdiff_t consumed_chars = 0;
4609   int found = 0;
4610   int c;
4611   Lisp_Object attrs, charset_list;
4612   int max_first_byte_of_2_byte_code;
4613
4614   CODING_GET_INFO (coding, attrs, charset_list);
4615   max_first_byte_of_2_byte_code
4616     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4617
4618   detect_info->checked |= CATEGORY_MASK_SJIS;
4619   /* A coding system of this category is always ASCII compatible.  */
4620   src += coding->head_ascii;
4621
4622   while (1)
4623     {
4624       src_base = src;
4625       ONE_MORE_BYTE (c);
4626       if (c < 0x80)
4627         continue;
4628       if ((c >= 0x81 && c <= 0x9F)
4629           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4630         {
4631           ONE_MORE_BYTE (c);
4632           if (c < 0x40 || c == 0x7F || c > 0xFC)
4633             break;
4634           found = CATEGORY_MASK_SJIS;
4635         }
4636       else if (c >= 0xA0 && c < 0xE0)
4637         found = CATEGORY_MASK_SJIS;
4638       else
4639         break;
4640     }
4641   detect_info->rejected |= CATEGORY_MASK_SJIS;
4642   return 0;
4643
4644  no_more_source:
4645   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4646     {
4647       detect_info->rejected |= CATEGORY_MASK_SJIS;
4648       return 0;
4649     }
4650   detect_info->found |= found;
4651   return 1;
4652 }
4653
4654 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4655    Return true if a text is encoded in BIG5.  */
4656
4657 static bool
4658 detect_coding_big5 (struct coding_system *coding,
4659                     struct coding_detection_info *detect_info)
4660 {
4661   const unsigned char *src = coding->source, *src_base;
4662   const unsigned char *src_end = coding->source + coding->src_bytes;
4663   bool multibytep = coding->src_multibyte;
4664   ptrdiff_t consumed_chars = 0;
4665   int found = 0;
4666   int c;
4667
4668   detect_info->checked |= CATEGORY_MASK_BIG5;
4669   /* A coding system of this category is always ASCII compatible.  */
4670   src += coding->head_ascii;
4671
4672   while (1)
4673     {
4674       src_base = src;
4675       ONE_MORE_BYTE (c);
4676       if (c < 0x80)
4677         continue;
4678       if (c >= 0xA1)
4679         {
4680           ONE_MORE_BYTE (c);
4681           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4682             return 0;
4683           found = CATEGORY_MASK_BIG5;
4684         }
4685       else
4686         break;
4687     }
4688   detect_info->rejected |= CATEGORY_MASK_BIG5;
4689   return 0;
4690
4691  no_more_source:
4692   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4693     {
4694       detect_info->rejected |= CATEGORY_MASK_BIG5;
4695       return 0;
4696     }
4697   detect_info->found |= found;
4698   return 1;
4699 }
4700
4701 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4702
4703 static void
4704 decode_coding_sjis (struct coding_system *coding)
4705 {
4706   const unsigned char *src = coding->source + coding->consumed;
4707   const unsigned char *src_end = coding->source + coding->src_bytes;
4708   const unsigned char *src_base;
4709   int *charbuf = coding->charbuf + coding->charbuf_used;
4710   /* We may produce one charset annotation in one loop and one more at
4711      the end.  */
4712   int *charbuf_end
4713     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4714   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4715   bool multibytep = coding->src_multibyte;
4716   struct charset *charset_roman, *charset_kanji, *charset_kana;
4717   struct charset *charset_kanji2;
4718   Lisp_Object attrs, charset_list, val;
4719   ptrdiff_t char_offset = coding->produced_char;
4720   ptrdiff_t last_offset = char_offset;
4721   int last_id = charset_ascii;
4722   bool eol_dos
4723     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4724   int byte_after_cr = -1;
4725
4726   CODING_GET_INFO (coding, attrs, charset_list);
4727
4728   val = charset_list;
4729   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4732   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4733
4734   while (1)
4735     {
4736       int c, c1;
4737       struct charset *charset;
4738
4739       src_base = src;
4740       consumed_chars_base = consumed_chars;
4741
4742       if (charbuf >= charbuf_end)
4743         {
4744           if (byte_after_cr >= 0)
4745             src_base--;
4746           break;
4747         }
4748
4749       if (byte_after_cr >= 0)
4750         c = byte_after_cr, byte_after_cr = -1;
4751       else
4752         ONE_MORE_BYTE (c);
4753       if (c < 0)
4754         goto invalid_code;
4755       if (c < 0x80)
4756         {
4757           if (eol_dos && c == '\r')
4758             ONE_MORE_BYTE (byte_after_cr);
4759           charset = charset_roman;
4760         }
4761       else if (c == 0x80 || c == 0xA0)
4762         goto invalid_code;
4763       else if (c >= 0xA1 && c <= 0xDF)
4764         {
4765           /* SJIS -> JISX0201-Kana */
4766           c &= 0x7F;
4767           charset = charset_kana;
4768         }
4769       else if (c <= 0xEF)
4770         {
4771           /* SJIS -> JISX0208 */
4772           ONE_MORE_BYTE (c1);
4773           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4774             goto invalid_code;
4775           c = (c << 8) | c1;
4776           SJIS_TO_JIS (c);
4777           charset = charset_kanji;
4778         }
4779       else if (c <= 0xFC && charset_kanji2)
4780         {
4781           /* SJIS -> JISX0213-2 */
4782           ONE_MORE_BYTE (c1);
4783           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4784             goto invalid_code;
4785           c = (c << 8) | c1;
4786           SJIS_TO_JIS2 (c);
4787           charset = charset_kanji2;
4788         }
4789       else
4790         goto invalid_code;
4791       if (charset->id != charset_ascii
4792           && last_id != charset->id)
4793         {
4794           if (last_id != charset_ascii)
4795             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4796           last_id = charset->id;
4797           last_offset = char_offset;
4798         }
4799       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4800       *charbuf++ = c;
4801       char_offset++;
4802       continue;
4803
4804     invalid_code:
4805       src = src_base;
4806       consumed_chars = consumed_chars_base;
4807       ONE_MORE_BYTE (c);
4808       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4809       char_offset++;
4810     }
4811
4812  no_more_source:
4813   if (last_id != charset_ascii)
4814     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4815   coding->consumed_char += consumed_chars_base;
4816   coding->consumed = src_base - coding->source;
4817   coding->charbuf_used = charbuf - coding->charbuf;
4818 }
4819
4820 static void
4821 decode_coding_big5 (struct coding_system *coding)
4822 {
4823   const unsigned char *src = coding->source + coding->consumed;
4824   const unsigned char *src_end = coding->source + coding->src_bytes;
4825   const unsigned char *src_base;
4826   int *charbuf = coding->charbuf + coding->charbuf_used;
4827   /* We may produce one charset annotation in one loop and one more at
4828      the end.  */
4829   int *charbuf_end
4830     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4831   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4832   bool multibytep = coding->src_multibyte;
4833   struct charset *charset_roman, *charset_big5;
4834   Lisp_Object attrs, charset_list, val;
4835   ptrdiff_t char_offset = coding->produced_char;
4836   ptrdiff_t last_offset = char_offset;
4837   int last_id = charset_ascii;
4838   bool eol_dos
4839     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4840   int byte_after_cr = -1;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
4843   val = charset_list;
4844   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4846
4847   while (1)
4848     {
4849       int c, c1;
4850       struct charset *charset;
4851
4852       src_base = src;
4853       consumed_chars_base = consumed_chars;
4854
4855       if (charbuf >= charbuf_end)
4856         {
4857           if (byte_after_cr >= 0)
4858             src_base--;
4859           break;
4860         }
4861
4862       if (byte_after_cr >= 0)
4863         c = byte_after_cr, byte_after_cr = -1;
4864       else
4865         ONE_MORE_BYTE (c);
4866
4867       if (c < 0)
4868         goto invalid_code;
4869       if (c < 0x80)
4870         {
4871           if (eol_dos && c == '\r')
4872             ONE_MORE_BYTE (byte_after_cr);
4873           charset = charset_roman;
4874         }
4875       else
4876         {
4877           /* BIG5 -> Big5 */
4878           if (c < 0xA1 || c > 0xFE)
4879             goto invalid_code;
4880           ONE_MORE_BYTE (c1);
4881           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4882             goto invalid_code;
4883           c = c << 8 | c1;
4884           charset = charset_big5;
4885         }
4886       if (charset->id != charset_ascii
4887           && last_id != charset->id)
4888         {
4889           if (last_id != charset_ascii)
4890             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4891           last_id = charset->id;
4892           last_offset = char_offset;
4893         }
4894       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4895       *charbuf++ = c;
4896       char_offset++;
4897       continue;
4898
4899     invalid_code:
4900       src = src_base;
4901       consumed_chars = consumed_chars_base;
4902       ONE_MORE_BYTE (c);
4903       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4904       char_offset++;
4905     }
4906
4907  no_more_source:
4908   if (last_id != charset_ascii)
4909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4910   coding->consumed_char += consumed_chars_base;
4911   coding->consumed = src_base - coding->source;
4912   coding->charbuf_used = charbuf - coding->charbuf;
4913 }
4914
4915 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4916    This function can encode charsets `ascii', `katakana-jisx0201',
4917    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4918    are sure that all these charsets are registered as official charset
4919    (i.e. do not have extended leading-codes).  Characters of other
4920    charsets are produced without any encoding.  */
4921
4922 static bool
4923 encode_coding_sjis (struct coding_system *coding)
4924 {
4925   bool multibytep = coding->dst_multibyte;
4926   int *charbuf = coding->charbuf;
4927   int *charbuf_end = charbuf + coding->charbuf_used;
4928   unsigned char *dst = coding->destination + coding->produced;
4929   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4930   int safe_room = 4;
4931   ptrdiff_t produced_chars = 0;
4932   Lisp_Object attrs, charset_list, val;
4933   bool ascii_compatible;
4934   struct charset *charset_kanji, *charset_kana;
4935   struct charset *charset_kanji2;
4936   int c;
4937
4938   CODING_GET_INFO (coding, attrs, charset_list);
4939   val = XCDR (charset_list);
4940   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4943
4944   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4945
4946   while (charbuf < charbuf_end)
4947     {
4948       ASSURE_DESTINATION (safe_room);
4949       c = *charbuf++;
4950       /* Now encode the character C.  */
4951       if (ASCII_CHAR_P (c) && ascii_compatible)
4952         EMIT_ONE_ASCII_BYTE (c);
4953       else if (CHAR_BYTE8_P (c))
4954         {
4955           c = CHAR_TO_BYTE8 (c);
4956           EMIT_ONE_BYTE (c);
4957         }
4958       else
4959         {
4960           unsigned code;
4961           struct charset *charset;
4962           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4963                                &code, charset);
4964
4965           if (!charset)
4966             {
4967               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4968                 {
4969                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4970                   charset = CHARSET_FROM_ID (charset_ascii);
4971                 }
4972               else
4973                 {
4974                   c = coding->default_char;
4975                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4976                                        charset_list, &code, charset);
4977                 }
4978             }
4979           if (code == CHARSET_INVALID_CODE (charset))
4980             emacs_abort ();
4981           if (charset == charset_kanji)
4982             {
4983               int c1, c2;
4984               JIS_TO_SJIS (code);
4985               c1 = code >> 8, c2 = code & 0xFF;
4986               EMIT_TWO_BYTES (c1, c2);
4987             }
4988           else if (charset == charset_kana)
4989             EMIT_ONE_BYTE (code | 0x80);
4990           else if (charset_kanji2 && charset == charset_kanji2)
4991             {
4992               int c1, c2;
4993
4994               c1 = code >> 8;
4995               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4996                   || c1 == 0x28
4997                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4998                 {
4999                   JIS_TO_SJIS2 (code);
5000                   c1 = code >> 8, c2 = code & 0xFF;
5001                   EMIT_TWO_BYTES (c1, c2);
5002                 }
5003               else
5004                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5005             }
5006           else
5007             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5008         }
5009     }
5010   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5011   coding->produced_char += produced_chars;
5012   coding->produced = dst - coding->destination;
5013   return 0;
5014 }
5015
5016 static bool
5017 encode_coding_big5 (struct coding_system *coding)
5018 {
5019   bool multibytep = coding->dst_multibyte;
5020   int *charbuf = coding->charbuf;
5021   int *charbuf_end = charbuf + coding->charbuf_used;
5022   unsigned char *dst = coding->destination + coding->produced;
5023   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5024   int safe_room = 4;
5025   ptrdiff_t produced_chars = 0;
5026   Lisp_Object attrs, charset_list, val;
5027   bool ascii_compatible;
5028   struct charset *charset_big5;
5029   int c;
5030
5031   CODING_GET_INFO (coding, attrs, charset_list);
5032   val = XCDR (charset_list);
5033   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5034   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5035
5036   while (charbuf < charbuf_end)
5037     {
5038       ASSURE_DESTINATION (safe_room);
5039       c = *charbuf++;
5040       /* Now encode the character C.  */
5041       if (ASCII_CHAR_P (c) && ascii_compatible)
5042         EMIT_ONE_ASCII_BYTE (c);
5043       else if (CHAR_BYTE8_P (c))
5044         {
5045           c = CHAR_TO_BYTE8 (c);
5046           EMIT_ONE_BYTE (c);
5047         }
5048       else
5049         {
5050           unsigned code;
5051           struct charset *charset;
5052           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5053                                &code, charset);
5054
5055           if (! charset)
5056             {
5057               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5058                 {
5059                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5060                   charset = CHARSET_FROM_ID (charset_ascii);
5061                 }
5062               else
5063                 {
5064                   c = coding->default_char;
5065                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5066                                        charset_list, &code, charset);
5067                 }
5068             }
5069           if (code == CHARSET_INVALID_CODE (charset))
5070             emacs_abort ();
5071           if (charset == charset_big5)
5072             {
5073               int c1, c2;
5074
5075               c1 = code >> 8, c2 = code & 0xFF;
5076               EMIT_TWO_BYTES (c1, c2);
5077             }
5078           else
5079             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5080         }
5081     }
5082   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5083   coding->produced_char += produced_chars;
5084   coding->produced = dst - coding->destination;
5085   return 0;
5086 }
5087
5088 \f
5089 /*** 9. CCL handlers ***/
5090
5091 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5092    Return true if a text is encoded in a coding system of which
5093    encoder/decoder are written in CCL program.  */
5094
5095 static bool
5096 detect_coding_ccl (struct coding_system *coding,
5097                    struct coding_detection_info *detect_info)
5098 {
5099   const unsigned char *src = coding->source, *src_base;
5100   const unsigned char *src_end = coding->source + coding->src_bytes;
5101   bool multibytep = coding->src_multibyte;
5102   ptrdiff_t consumed_chars = 0;
5103   int found = 0;
5104   unsigned char *valids;
5105   ptrdiff_t head_ascii = coding->head_ascii;
5106   Lisp_Object attrs;
5107
5108   detect_info->checked |= CATEGORY_MASK_CCL;
5109
5110   coding = &coding_categories[coding_category_ccl];
5111   valids = CODING_CCL_VALIDS (coding);
5112   attrs = CODING_ID_ATTRS (coding->id);
5113   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5114     src += head_ascii;
5115
5116   while (1)
5117     {
5118       int c;
5119
5120       src_base = src;
5121       ONE_MORE_BYTE (c);
5122       if (c < 0 || ! valids[c])
5123         break;
5124       if ((valids[c] > 1))
5125         found = CATEGORY_MASK_CCL;
5126     }
5127   detect_info->rejected |= CATEGORY_MASK_CCL;
5128   return 0;
5129
5130  no_more_source:
5131   detect_info->found |= found;
5132   return 1;
5133 }
5134
5135 static void
5136 decode_coding_ccl (struct coding_system *coding)
5137 {
5138   const unsigned char *src = coding->source + coding->consumed;
5139   const unsigned char *src_end = coding->source + coding->src_bytes;
5140   int *charbuf = coding->charbuf + coding->charbuf_used;
5141   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5142   ptrdiff_t consumed_chars = 0;
5143   bool multibytep = coding->src_multibyte;
5144   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5145   int source_charbuf[1024];
5146   int source_byteidx[1025];
5147   Lisp_Object attrs, charset_list;
5148
5149   CODING_GET_INFO (coding, attrs, charset_list);
5150
5151   while (1)
5152     {
5153       const unsigned char *p = src;
5154       ptrdiff_t offset;
5155       int i = 0;
5156
5157       if (multibytep)
5158         {
5159           while (i < 1024 && p < src_end)
5160             {
5161               source_byteidx[i] = p - src;
5162               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5163             }
5164           source_byteidx[i] = p - src;
5165         }
5166       else
5167         while (i < 1024 && p < src_end)
5168           source_charbuf[i++] = *p++;
5169
5170       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5171         ccl->last_block = true;
5172       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5173       charset_map_loaded = 0;
5174       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5175                   charset_list);
5176       if (charset_map_loaded
5177           && (offset = coding_change_source (coding)))
5178         {
5179           p += offset;
5180           src += offset;
5181           src_end += offset;
5182         }
5183       charbuf += ccl->produced;
5184       if (multibytep)
5185         src += source_byteidx[ccl->consumed];
5186       else
5187         src += ccl->consumed;
5188       consumed_chars += ccl->consumed;
5189       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5190         break;
5191     }
5192
5193   switch (ccl->status)
5194     {
5195     case CCL_STAT_SUSPEND_BY_SRC:
5196       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5197       break;
5198     case CCL_STAT_SUSPEND_BY_DST:
5199       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5200       break;
5201     case CCL_STAT_QUIT:
5202     case CCL_STAT_INVALID_CMD:
5203       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5204       break;
5205     default:
5206       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5207       break;
5208     }
5209   coding->consumed_char += consumed_chars;
5210   coding->consumed = src - coding->source;
5211   coding->charbuf_used = charbuf - coding->charbuf;
5212 }
5213
5214 static bool
5215 encode_coding_ccl (struct coding_system *coding)
5216 {
5217   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5218   bool multibytep = coding->dst_multibyte;
5219   int *charbuf = coding->charbuf;
5220   int *charbuf_end = charbuf + coding->charbuf_used;
5221   unsigned char *dst = coding->destination + coding->produced;
5222   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5223   int destination_charbuf[1024];
5224   ptrdiff_t produced_chars = 0;
5225   int i;
5226   Lisp_Object attrs, charset_list;
5227
5228   CODING_GET_INFO (coding, attrs, charset_list);
5229   if (coding->consumed_char == coding->src_chars
5230       && coding->mode & CODING_MODE_LAST_BLOCK)
5231     ccl->last_block = true;
5232
5233   do
5234     {
5235       ptrdiff_t offset;
5236
5237       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5238       charset_map_loaded = 0;
5239       ccl_driver (ccl, charbuf, destination_charbuf,
5240                   charbuf_end - charbuf, 1024, charset_list);
5241       if (charset_map_loaded
5242           && (offset = coding_change_destination (coding)))
5243         dst += offset;
5244       if (multibytep)
5245         {
5246           ASSURE_DESTINATION (ccl->produced * 2);
5247           for (i = 0; i < ccl->produced; i++)
5248             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5249         }
5250       else
5251         {
5252           ASSURE_DESTINATION (ccl->produced);
5253           for (i = 0; i < ccl->produced; i++)
5254             *dst++ = destination_charbuf[i] & 0xFF;
5255           produced_chars += ccl->produced;
5256         }
5257       charbuf += ccl->consumed;
5258       if (ccl->status == CCL_STAT_QUIT
5259           || ccl->status == CCL_STAT_INVALID_CMD)
5260         break;
5261     }
5262   while (charbuf < charbuf_end);
5263
5264   switch (ccl->status)
5265     {
5266     case CCL_STAT_SUSPEND_BY_SRC:
5267       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5268       break;
5269     case CCL_STAT_SUSPEND_BY_DST:
5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5271       break;
5272     case CCL_STAT_QUIT:
5273     case CCL_STAT_INVALID_CMD:
5274       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5275       break;
5276     default:
5277       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5278       break;
5279     }
5280
5281   coding->produced_char += produced_chars;
5282   coding->produced = dst - coding->destination;
5283   return 0;
5284 }
5285
5286 \f
5287 /*** 10, 11. no-conversion handlers ***/
5288
5289 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5290
5291 static void
5292 decode_coding_raw_text (struct coding_system *coding)
5293 {
5294   bool eol_dos
5295     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5296
5297   coding->chars_at_source = 1;
5298   coding->consumed_char = coding->src_chars;
5299   coding->consumed = coding->src_bytes;
5300   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5301     {
5302       coding->consumed_char--;
5303       coding->consumed--;
5304       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5305     }
5306   else
5307     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5308 }
5309
5310 static bool
5311 encode_coding_raw_text (struct coding_system *coding)
5312 {
5313   bool multibytep = coding->dst_multibyte;
5314   int *charbuf = coding->charbuf;
5315   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5316   unsigned char *dst = coding->destination + coding->produced;
5317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5318   ptrdiff_t produced_chars = 0;
5319   int c;
5320
5321   if (multibytep)
5322     {
5323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5324
5325       if (coding->src_multibyte)
5326         while (charbuf < charbuf_end)
5327           {
5328             ASSURE_DESTINATION (safe_room);
5329             c = *charbuf++;
5330             if (ASCII_CHAR_P (c))
5331               EMIT_ONE_ASCII_BYTE (c);
5332             else if (CHAR_BYTE8_P (c))
5333               {
5334                 c = CHAR_TO_BYTE8 (c);
5335                 EMIT_ONE_BYTE (c);
5336               }
5337             else
5338               {
5339                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5340
5341                 CHAR_STRING_ADVANCE (c, p1);
5342                 do
5343                   {
5344                     EMIT_ONE_BYTE (*p0);
5345                     p0++;
5346                   }
5347                 while (p0 < p1);
5348               }
5349           }
5350       else
5351         while (charbuf < charbuf_end)
5352           {
5353             ASSURE_DESTINATION (safe_room);
5354             c = *charbuf++;
5355             EMIT_ONE_BYTE (c);
5356           }
5357     }
5358   else
5359     {
5360       if (coding->src_multibyte)
5361         {
5362           int safe_room = MAX_MULTIBYTE_LENGTH;
5363
5364           while (charbuf < charbuf_end)
5365             {
5366               ASSURE_DESTINATION (safe_room);
5367               c = *charbuf++;
5368               if (ASCII_CHAR_P (c))
5369                 *dst++ = c;
5370               else if (CHAR_BYTE8_P (c))
5371                 *dst++ = CHAR_TO_BYTE8 (c);
5372               else
5373                 CHAR_STRING_ADVANCE (c, dst);
5374             }
5375         }
5376       else
5377         {
5378           ASSURE_DESTINATION (charbuf_end - charbuf);
5379           while (charbuf < charbuf_end && dst < dst_end)
5380             *dst++ = *charbuf++;
5381         }
5382       produced_chars = dst - (coding->destination + coding->produced);
5383     }
5384   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5385   coding->produced_char += produced_chars;
5386   coding->produced = dst - coding->destination;
5387   return 0;
5388 }
5389
5390 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5391    Return true if a text is encoded in a charset-based coding system.  */
5392
5393 static bool
5394 detect_coding_charset (struct coding_system *coding,
5395                        struct coding_detection_info *detect_info)
5396 {
5397   const unsigned char *src = coding->source, *src_base;
5398   const unsigned char *src_end = coding->source + coding->src_bytes;
5399   bool multibytep = coding->src_multibyte;
5400   ptrdiff_t consumed_chars = 0;
5401   Lisp_Object attrs, valids, name;
5402   int found = 0;
5403   ptrdiff_t head_ascii = coding->head_ascii;
5404   bool check_latin_extra = 0;
5405
5406   detect_info->checked |= CATEGORY_MASK_CHARSET;
5407
5408   coding = &coding_categories[coding_category_charset];
5409   attrs = CODING_ID_ATTRS (coding->id);
5410   valids = AREF (attrs, coding_attr_charset_valids);
5411   name = CODING_ID_NAME (coding->id);
5412   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5413                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5414       || strncmp (SSDATA (SYMBOL_NAME (name)),
5415                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5416     check_latin_extra = 1;
5417
5418   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5419     src += head_ascii;
5420
5421   while (1)
5422     {
5423       int c;
5424       Lisp_Object val;
5425       struct charset *charset;
5426       int dim, idx;
5427
5428       src_base = src;
5429       ONE_MORE_BYTE (c);
5430       if (c < 0)
5431         continue;
5432       val = AREF (valids, c);
5433       if (NILP (val))
5434         break;
5435       if (c >= 0x80)
5436         {
5437           if (c < 0xA0
5438               && check_latin_extra
5439               && (!VECTORP (Vlatin_extra_code_table)
5440                   || NILP (AREF (Vlatin_extra_code_table, c))))
5441             break;
5442           found = CATEGORY_MASK_CHARSET;
5443         }
5444       if (INTEGERP (val))
5445         {
5446           charset = CHARSET_FROM_ID (XFASTINT (val));
5447           dim = CHARSET_DIMENSION (charset);
5448           for (idx = 1; idx < dim; idx++)
5449             {
5450               if (src == src_end)
5451                 goto too_short;
5452               ONE_MORE_BYTE (c);
5453               if (c < charset->code_space[(dim - 1 - idx) * 4]
5454                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5455                 break;
5456             }
5457           if (idx < dim)
5458             break;
5459         }
5460       else
5461         {
5462           idx = 1;
5463           for (; CONSP (val); val = XCDR (val))
5464             {
5465               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5466               dim = CHARSET_DIMENSION (charset);
5467               while (idx < dim)
5468                 {
5469                   if (src == src_end)
5470                     goto too_short;
5471                   ONE_MORE_BYTE (c);
5472                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5473                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5474                     break;
5475                   idx++;
5476                 }
5477               if (idx == dim)
5478                 {
5479                   val = Qnil;
5480                   break;
5481                 }
5482             }
5483           if (CONSP (val))
5484             break;
5485         }
5486     }
5487  too_short:
5488   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5489   return 0;
5490
5491  no_more_source:
5492   detect_info->found |= found;
5493   return 1;
5494 }
5495
5496 static void
5497 decode_coding_charset (struct coding_system *coding)
5498 {
5499   const unsigned char *src = coding->source + coding->consumed;
5500   const unsigned char *src_end = coding->source + coding->src_bytes;
5501   const unsigned char *src_base;
5502   int *charbuf = coding->charbuf + coding->charbuf_used;
5503   /* We may produce one charset annotation in one loop and one more at
5504      the end.  */
5505   int *charbuf_end
5506     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5507   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5508   bool multibytep = coding->src_multibyte;
5509   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5510   Lisp_Object valids;
5511   ptrdiff_t char_offset = coding->produced_char;
5512   ptrdiff_t last_offset = char_offset;
5513   int last_id = charset_ascii;
5514   bool eol_dos
5515     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5516   int byte_after_cr = -1;
5517
5518   valids = AREF (attrs, coding_attr_charset_valids);
5519
5520   while (1)
5521     {
5522       int c;
5523       Lisp_Object val;
5524       struct charset *charset;
5525       int dim;
5526       int len = 1;
5527       unsigned code;
5528
5529       src_base = src;
5530       consumed_chars_base = consumed_chars;
5531
5532       if (charbuf >= charbuf_end)
5533         {
5534           if (byte_after_cr >= 0)
5535             src_base--;
5536           break;
5537         }
5538
5539       if (byte_after_cr >= 0)
5540         {
5541           c = byte_after_cr;
5542           byte_after_cr = -1;
5543         }
5544       else
5545         {
5546           ONE_MORE_BYTE (c);
5547           if (eol_dos && c == '\r')
5548             ONE_MORE_BYTE (byte_after_cr);
5549         }
5550       if (c < 0)
5551         goto invalid_code;
5552       code = c;
5553
5554       val = AREF (valids, c);
5555       if (! INTEGERP (val) && ! CONSP (val))
5556         goto invalid_code;
5557       if (INTEGERP (val))
5558         {
5559           charset = CHARSET_FROM_ID (XFASTINT (val));
5560           dim = CHARSET_DIMENSION (charset);
5561           while (len < dim)
5562             {
5563               ONE_MORE_BYTE (c);
5564               code = (code << 8) | c;
5565               len++;
5566             }
5567           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5568                               charset, code, c);
5569         }
5570       else
5571         {
5572           /* VAL is a list of charset IDs.  It is assured that the
5573              list is sorted by charset dimensions (smaller one
5574              comes first).  */
5575           while (CONSP (val))
5576             {
5577               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5578               dim = CHARSET_DIMENSION (charset);
5579               while (len < dim)
5580                 {
5581                   ONE_MORE_BYTE (c);
5582                   code = (code << 8) | c;
5583                   len++;
5584                 }
5585               CODING_DECODE_CHAR (coding, src, src_base,
5586                                   src_end, charset, code, c);
5587               if (c >= 0)
5588                 break;
5589               val = XCDR (val);
5590             }
5591         }
5592       if (c < 0)
5593         goto invalid_code;
5594       if (charset->id != charset_ascii
5595           && last_id != charset->id)
5596         {
5597           if (last_id != charset_ascii)
5598             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5599           last_id = charset->id;
5600           last_offset = char_offset;
5601         }
5602
5603       *charbuf++ = c;
5604       char_offset++;
5605       continue;
5606
5607     invalid_code:
5608       src = src_base;
5609       consumed_chars = consumed_chars_base;
5610       ONE_MORE_BYTE (c);
5611       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5612       char_offset++;
5613     }
5614
5615  no_more_source:
5616   if (last_id != charset_ascii)
5617     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5618   coding->consumed_char += consumed_chars_base;
5619   coding->consumed = src_base - coding->source;
5620   coding->charbuf_used = charbuf - coding->charbuf;
5621 }
5622
5623 static bool
5624 encode_coding_charset (struct coding_system *coding)
5625 {
5626   bool multibytep = coding->dst_multibyte;
5627   int *charbuf = coding->charbuf;
5628   int *charbuf_end = charbuf + coding->charbuf_used;
5629   unsigned char *dst = coding->destination + coding->produced;
5630   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5631   int safe_room = MAX_MULTIBYTE_LENGTH;
5632   ptrdiff_t produced_chars = 0;
5633   Lisp_Object attrs, charset_list;
5634   bool ascii_compatible;
5635   int c;
5636
5637   CODING_GET_INFO (coding, attrs, charset_list);
5638   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5639
5640   while (charbuf < charbuf_end)
5641     {
5642       struct charset *charset;
5643       unsigned code;
5644
5645       ASSURE_DESTINATION (safe_room);
5646       c = *charbuf++;
5647       if (ascii_compatible && ASCII_CHAR_P (c))
5648         EMIT_ONE_ASCII_BYTE (c);
5649       else if (CHAR_BYTE8_P (c))
5650         {
5651           c = CHAR_TO_BYTE8 (c);
5652           EMIT_ONE_BYTE (c);
5653         }
5654       else
5655         {
5656           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5657                                &code, charset);
5658
5659           if (charset)
5660             {
5661               if (CHARSET_DIMENSION (charset) == 1)
5662                 EMIT_ONE_BYTE (code);
5663               else if (CHARSET_DIMENSION (charset) == 2)
5664                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5665               else if (CHARSET_DIMENSION (charset) == 3)
5666                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5667               else
5668                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5669                                  (code >> 8) & 0xFF, code & 0xFF);
5670             }
5671           else
5672             {
5673               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5674                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5675               else
5676                 c = coding->default_char;
5677               EMIT_ONE_BYTE (c);
5678             }
5679         }
5680     }
5681
5682   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5683   coding->produced_char += produced_chars;
5684   coding->produced = dst - coding->destination;
5685   return 0;
5686 }
5687
5688 \f
5689 /*** 10. C library functions ***/
5690
5691 /* Setup coding context CODING from information about CODING_SYSTEM.
5692    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5693    CODING_SYSTEM is invalid, signal an error.  */
5694
5695 void
5696 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5697 {
5698   Lisp_Object attrs;
5699   Lisp_Object eol_type;
5700   Lisp_Object coding_type;
5701   Lisp_Object val;
5702
5703   if (NILP (coding_system))
5704     coding_system = Qundecided;
5705
5706   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5707
5708   attrs = CODING_ID_ATTRS (coding->id);
5709   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5710
5711   coding->mode = 0;
5712   if (VECTORP (eol_type))
5713     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5714                             | CODING_REQUIRE_DETECTION_MASK);
5715   else if (! EQ (eol_type, Qunix))
5716     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5717                             | CODING_REQUIRE_ENCODING_MASK);
5718   else
5719     coding->common_flags = 0;
5720   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5721     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5722   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5723     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5724   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5725     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5726
5727   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5728   coding->max_charset_id = SCHARS (val) - 1;
5729   coding->safe_charsets = SDATA (val);
5730   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5731   coding->carryover_bytes = 0;
5732   coding->raw_destination = 0;
5733
5734   coding_type = CODING_ATTR_TYPE (attrs);
5735   if (EQ (coding_type, Qundecided))
5736     {
5737       coding->detector = NULL;
5738       coding->decoder = decode_coding_raw_text;
5739       coding->encoder = encode_coding_raw_text;
5740       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5741       coding->spec.undecided.inhibit_nbd
5742         = (encode_inhibit_flag
5743            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5744       coding->spec.undecided.inhibit_ied
5745         = (encode_inhibit_flag
5746            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5747       coding->spec.undecided.prefer_utf_8
5748         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5749     }
5750   else if (EQ (coding_type, Qiso_2022))
5751     {
5752       int i;
5753       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5754
5755       /* Invoke graphic register 0 to plane 0.  */
5756       CODING_ISO_INVOCATION (coding, 0) = 0;
5757       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5758       CODING_ISO_INVOCATION (coding, 1)
5759         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5760       /* Setup the initial status of designation.  */
5761       for (i = 0; i < 4; i++)
5762         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5763       /* Not single shifting initially.  */
5764       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5765       /* Beginning of buffer should also be regarded as bol. */
5766       CODING_ISO_BOL (coding) = 1;
5767       coding->detector = detect_coding_iso_2022;
5768       coding->decoder = decode_coding_iso_2022;
5769       coding->encoder = encode_coding_iso_2022;
5770       if (flags & CODING_ISO_FLAG_SAFE)
5771         coding->mode |= CODING_MODE_SAFE_ENCODING;
5772       coding->common_flags
5773         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5774             | CODING_REQUIRE_FLUSHING_MASK);
5775       if (flags & CODING_ISO_FLAG_COMPOSITION)
5776         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5777       if (flags & CODING_ISO_FLAG_DESIGNATION)
5778         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5779       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5780         {
5781           setup_iso_safe_charsets (attrs);
5782           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5783           coding->max_charset_id = SCHARS (val) - 1;
5784           coding->safe_charsets = SDATA (val);
5785         }
5786       CODING_ISO_FLAGS (coding) = flags;
5787       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5788       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5789       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5790       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5791     }
5792   else if (EQ (coding_type, Qcharset))
5793     {
5794       coding->detector = detect_coding_charset;
5795       coding->decoder = decode_coding_charset;
5796       coding->encoder = encode_coding_charset;
5797       coding->common_flags
5798         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5799     }
5800   else if (EQ (coding_type, Qutf_8))
5801     {
5802       val = AREF (attrs, coding_attr_utf_bom);
5803       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5804                                    : EQ (val, Qt) ? utf_with_bom
5805                                    : utf_without_bom);
5806       coding->detector = detect_coding_utf_8;
5807       coding->decoder = decode_coding_utf_8;
5808       coding->encoder = encode_coding_utf_8;
5809       coding->common_flags
5810         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5811       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5812         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5813     }
5814   else if (EQ (coding_type, Qutf_16))
5815     {
5816       val = AREF (attrs, coding_attr_utf_bom);
5817       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5818                                     : EQ (val, Qt) ? utf_with_bom
5819                                     : utf_without_bom);
5820       val = AREF (attrs, coding_attr_utf_16_endian);
5821       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5822                                        : utf_16_little_endian);
5823       CODING_UTF_16_SURROGATE (coding) = 0;
5824       coding->detector = detect_coding_utf_16;
5825       coding->decoder = decode_coding_utf_16;
5826       coding->encoder = encode_coding_utf_16;
5827       coding->common_flags
5828         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5829       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5830         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5831     }
5832   else if (EQ (coding_type, Qccl))
5833     {
5834       coding->detector = detect_coding_ccl;
5835       coding->decoder = decode_coding_ccl;
5836       coding->encoder = encode_coding_ccl;
5837       coding->common_flags
5838         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5839             | CODING_REQUIRE_FLUSHING_MASK);
5840     }
5841   else if (EQ (coding_type, Qemacs_mule))
5842     {
5843       coding->detector = detect_coding_emacs_mule;
5844       coding->decoder = decode_coding_emacs_mule;
5845       coding->encoder = encode_coding_emacs_mule;
5846       coding->common_flags
5847         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5848       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5849           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5850         {
5851           Lisp_Object tail, safe_charsets;
5852           int max_charset_id = 0;
5853
5854           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5855                tail = XCDR (tail))
5856             if (max_charset_id < XFASTINT (XCAR (tail)))
5857               max_charset_id = XFASTINT (XCAR (tail));
5858           safe_charsets = make_uninit_string (max_charset_id + 1);
5859           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5860           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5861                tail = XCDR (tail))
5862             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5863           coding->max_charset_id = max_charset_id;
5864           coding->safe_charsets = SDATA (safe_charsets);
5865         }
5866       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5867       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5868     }
5869   else if (EQ (coding_type, Qshift_jis))
5870     {
5871       coding->detector = detect_coding_sjis;
5872       coding->decoder = decode_coding_sjis;
5873       coding->encoder = encode_coding_sjis;
5874       coding->common_flags
5875         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5876     }
5877   else if (EQ (coding_type, Qbig5))
5878     {
5879       coding->detector = detect_coding_big5;
5880       coding->decoder = decode_coding_big5;
5881       coding->encoder = encode_coding_big5;
5882       coding->common_flags
5883         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5884     }
5885   else                          /* EQ (coding_type, Qraw_text) */
5886     {
5887       coding->detector = NULL;
5888       coding->decoder = decode_coding_raw_text;
5889       coding->encoder = encode_coding_raw_text;
5890       if (! EQ (eol_type, Qunix))
5891         {
5892           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5893           if (! VECTORP (eol_type))
5894             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5895         }
5896
5897     }
5898
5899   return;
5900 }
5901
5902 /* Return a list of charsets supported by CODING.  */
5903
5904 Lisp_Object
5905 coding_charset_list (struct coding_system *coding)
5906 {
5907   Lisp_Object attrs, charset_list;
5908
5909   CODING_GET_INFO (coding, attrs, charset_list);
5910   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5911     {
5912       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5913
5914       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5915         charset_list = Viso_2022_charset_list;
5916     }
5917   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5918     {
5919       charset_list = Vemacs_mule_charset_list;
5920     }
5921   return charset_list;
5922 }
5923
5924
5925 /* Return a list of charsets supported by CODING-SYSTEM.  */
5926
5927 Lisp_Object
5928 coding_system_charset_list (Lisp_Object coding_system)
5929 {
5930   ptrdiff_t id;
5931   Lisp_Object attrs, charset_list;
5932
5933   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5934   attrs = CODING_ID_ATTRS (id);
5935
5936   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5937     {
5938       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5939
5940       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5941         charset_list = Viso_2022_charset_list;
5942       else
5943         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5944     }
5945   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5946     {
5947       charset_list = Vemacs_mule_charset_list;
5948     }
5949   else
5950     {
5951       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5952     }
5953   return charset_list;
5954 }
5955
5956
5957 /* Return raw-text or one of its subsidiaries that has the same
5958    eol_type as CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 raw_text_coding_system (Lisp_Object coding_system)
5962 {
5963   Lisp_Object spec, attrs;
5964   Lisp_Object eol_type, raw_text_eol_type;
5965
5966   if (NILP (coding_system))
5967     return Qraw_text;
5968   spec = CODING_SYSTEM_SPEC (coding_system);
5969   attrs = AREF (spec, 0);
5970
5971   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5972     return coding_system;
5973
5974   eol_type = AREF (spec, 2);
5975   if (VECTORP (eol_type))
5976     return Qraw_text;
5977   spec = CODING_SYSTEM_SPEC (Qraw_text);
5978   raw_text_eol_type = AREF (spec, 2);
5979   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5980           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5981           : AREF (raw_text_eol_type, 2));
5982 }
5983
5984 /* Return true if CODING corresponds to raw-text coding-system.  */
5985
5986 bool
5987 raw_text_coding_system_p (struct coding_system *coding)
5988 {
5989   return (coding->decoder == decode_coding_raw_text
5990           && coding->encoder == encode_coding_raw_text) ? true : false;
5991 }
5992
5993
5994 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5995    the subsidiary that has the same eol-spec as PARENT (if it is not
5996    nil and specifies end-of-line format) or the system's setting
5997    (system_eol_type).  */
5998
5999 Lisp_Object
6000 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6001 {
6002   Lisp_Object spec, eol_type;
6003
6004   if (NILP (coding_system))
6005     coding_system = Qraw_text;
6006   else
6007     CHECK_CODING_SYSTEM (coding_system);
6008   spec = CODING_SYSTEM_SPEC (coding_system);
6009   eol_type = AREF (spec, 2);
6010   if (VECTORP (eol_type))
6011     {
6012       Lisp_Object parent_eol_type;
6013
6014       if (! NILP (parent))
6015         {
6016           Lisp_Object parent_spec;
6017
6018           CHECK_CODING_SYSTEM (parent);
6019           parent_spec = CODING_SYSTEM_SPEC (parent);
6020           parent_eol_type = AREF (parent_spec, 2);
6021           if (VECTORP (parent_eol_type))
6022             parent_eol_type = system_eol_type;
6023         }
6024       else
6025         parent_eol_type = system_eol_type;
6026       if (EQ (parent_eol_type, Qunix))
6027         coding_system = AREF (eol_type, 0);
6028       else if (EQ (parent_eol_type, Qdos))
6029         coding_system = AREF (eol_type, 1);
6030       else if (EQ (parent_eol_type, Qmac))
6031         coding_system = AREF (eol_type, 2);
6032     }
6033   return coding_system;
6034 }
6035
6036
6037 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6038    decided for writing to a process.  If not, complement them, and
6039    return a new coding system.  */
6040
6041 Lisp_Object
6042 complement_process_encoding_system (Lisp_Object coding_system)
6043 {
6044   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6045   Lisp_Object spec, attrs;
6046   int i;
6047
6048   for (i = 0; i < 3; i++)
6049     {
6050       if (i == 1)
6051         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6052       else if (i == 2)
6053         coding_system = preferred_coding_system ();
6054       spec = CODING_SYSTEM_SPEC (coding_system);
6055       if (NILP (spec))
6056         continue;
6057       attrs = AREF (spec, 0);
6058       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6059         coding_base = CODING_ATTR_BASE_NAME (attrs);
6060       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6061         eol_base = coding_system;
6062       if (! NILP (coding_base) && ! NILP (eol_base))
6063         break;
6064     }
6065
6066   if (i > 0)
6067     /* The original CODING_SYSTEM didn't specify text-conversion or
6068        eol-conversion.  Be sure that we return a fully complemented
6069        coding system.  */
6070     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6071   return coding_system;
6072 }
6073
6074
6075 /* Emacs has a mechanism to automatically detect a coding system if it
6076    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6077    it's impossible to distinguish some coding systems accurately
6078    because they use the same range of codes.  So, at first, coding
6079    systems are grouped into the following categories:
6080
6081    o coding-category-emacs-mule
6082
6083         The category for a coding system which has the same code range
6084         as Emacs' internal format.  Assigned the coding-system (Lisp
6085         symbol) `emacs-mule' by default.
6086
6087    o coding-category-sjis
6088
6089         The category for a coding system which has the same code range
6090         as SJIS.  Assigned the coding-system (Lisp
6091         symbol) `japanese-shift-jis' by default.
6092
6093    o coding-category-iso-7
6094
6095         The category for a coding system which has the same code range
6096         as ISO2022 of 7-bit environment.  This doesn't use any locking
6097         shift and single shift functions.  This can encode/decode all
6098         charsets.  Assigned the coding-system (Lisp symbol)
6099         `iso-2022-7bit' by default.
6100
6101    o coding-category-iso-7-tight
6102
6103         Same as coding-category-iso-7 except that this can
6104         encode/decode only the specified charsets.
6105
6106    o coding-category-iso-8-1
6107
6108         The category for a coding system which has the same code range
6109         as ISO2022 of 8-bit environment and graphic plane 1 used only
6110         for DIMENSION1 charset.  This doesn't use any locking shift
6111         and single shift functions.  Assigned the coding-system (Lisp
6112         symbol) `iso-latin-1' by default.
6113
6114    o coding-category-iso-8-2
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 8-bit environment and graphic plane 1 used only
6118         for DIMENSION2 charset.  This doesn't use any locking shift
6119         and single shift functions.  Assigned the coding-system (Lisp
6120         symbol) `japanese-iso-8bit' by default.
6121
6122    o coding-category-iso-7-else
6123
6124         The category for a coding system which has the same code range
6125         as ISO2022 of 7-bit environment but uses locking shift or
6126         single shift functions.  Assigned the coding-system (Lisp
6127         symbol) `iso-2022-7bit-lock' by default.
6128
6129    o coding-category-iso-8-else
6130
6131         The category for a coding system which has the same code range
6132         as ISO2022 of 8-bit environment but uses locking shift or
6133         single shift functions.  Assigned the coding-system (Lisp
6134         symbol) `iso-2022-8bit-ss2' by default.
6135
6136    o coding-category-big5
6137
6138         The category for a coding system which has the same code range
6139         as BIG5.  Assigned the coding-system (Lisp symbol)
6140         `cn-big5' by default.
6141
6142    o coding-category-utf-8
6143
6144         The category for a coding system which has the same code range
6145         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6146         symbol) `utf-8' by default.
6147
6148    o coding-category-utf-16-be
6149
6150         The category for a coding system in which a text has an
6151         Unicode signature (cf. Unicode Standard) in the order of BIG
6152         endian at the head.  Assigned the coding-system (Lisp symbol)
6153         `utf-16-be' by default.
6154
6155    o coding-category-utf-16-le
6156
6157         The category for a coding system in which a text has an
6158         Unicode signature (cf. Unicode Standard) in the order of
6159         LITTLE endian at the head.  Assigned the coding-system (Lisp
6160         symbol) `utf-16-le' by default.
6161
6162    o coding-category-ccl
6163
6164         The category for a coding system of which encoder/decoder is
6165         written in CCL programs.  The default value is nil, i.e., no
6166         coding system is assigned.
6167
6168    o coding-category-binary
6169
6170         The category for a coding system not categorized in any of the
6171         above.  Assigned the coding-system (Lisp symbol)
6172         `no-conversion' by default.
6173
6174    Each of them is a Lisp symbol and the value is an actual
6175    `coding-system's (this is also a Lisp symbol) assigned by a user.
6176    What Emacs does actually is to detect a category of coding system.
6177    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6178    decide only one possible category, it selects a category of the
6179    highest priority.  Priorities of categories are also specified by a
6180    user in a Lisp variable `coding-category-list'.
6181
6182 */
6183
6184 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6185                                            int eol_seen);
6186
6187
6188 /* Return the number of ASCII characters at the head of the source.
6189    By side effects, set coding->head_ascii and update
6190    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6191    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6192    reliable only when all the source bytes are ASCII.  */
6193
6194 static ptrdiff_t
6195 check_ascii (struct coding_system *coding)
6196 {
6197   const unsigned char *src, *end;
6198   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6199   int eol_seen = coding->eol_seen;
6200
6201   coding_set_source (coding);
6202   src = coding->source;
6203   end = src + coding->src_bytes;
6204
6205   if (inhibit_eol_conversion
6206       || SYMBOLP (eol_type))
6207     {
6208       /* We don't have to check EOL format.  */
6209       while (src < end && !( *src & 0x80))
6210         {
6211           if (*src++ == '\n')
6212             eol_seen |= EOL_SEEN_LF;
6213         }
6214     }
6215   else
6216     {
6217       end--;                /* We look ahead one byte for "CR LF".  */
6218       while (src < end)
6219         {
6220           int c = *src;
6221
6222           if (c & 0x80)
6223             break;
6224           src++;
6225           if (c == '\r')
6226             {
6227               if (*src == '\n')
6228                 {
6229                   eol_seen |= EOL_SEEN_CRLF;
6230                   src++;
6231                 }
6232               else
6233                 eol_seen |= EOL_SEEN_CR;
6234             }
6235           else if (c == '\n')
6236             eol_seen |= EOL_SEEN_LF;
6237         }
6238       if (src == end)
6239         {
6240           int c = *src;
6241
6242           /* All bytes but the last one C are ASCII.  */
6243           if (! (c & 0x80))
6244             {
6245               if (c == '\r')
6246                 eol_seen |= EOL_SEEN_CR;
6247               else if (c  == '\n')
6248                 eol_seen |= EOL_SEEN_LF;
6249               src++;
6250             }
6251         }
6252     }
6253   coding->head_ascii = src - coding->source;
6254   coding->eol_seen = eol_seen;
6255   return (coding->head_ascii);
6256 }
6257
6258
6259 /* Return the number of characters at the source if all the bytes are
6260    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6261    effects, update coding->eol_seen.  The value of coding->eol_seen is
6262    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6263    the value is reliable only when all the source bytes are valid
6264    UTF-8.  */
6265
6266 static ptrdiff_t
6267 check_utf_8 (struct coding_system *coding)
6268 {
6269   const unsigned char *src, *end;
6270   int eol_seen;
6271   ptrdiff_t nchars = coding->head_ascii;
6272
6273   if (coding->head_ascii < 0)
6274     check_ascii (coding);
6275   else
6276     coding_set_source (coding);
6277   src = coding->source + coding->head_ascii;
6278   /* We look ahead one byte for CR LF.  */
6279   end = coding->source + coding->src_bytes - 1;
6280   eol_seen = coding->eol_seen;
6281   while (src < end)
6282     {
6283       int c = *src;
6284
6285       if (UTF_8_1_OCTET_P (*src))
6286         {
6287           src++;
6288           if (c < 0x20)
6289             {
6290               if (c == '\r')
6291                 {
6292                   if (*src == '\n')
6293                     {
6294                       eol_seen |= EOL_SEEN_CRLF;
6295                       src++;
6296                       nchars++;
6297                     }
6298                   else
6299                     eol_seen |= EOL_SEEN_CR;
6300                 }
6301               else if (c == '\n')
6302                 eol_seen |= EOL_SEEN_LF;
6303             }
6304         }
6305       else if (UTF_8_2_OCTET_LEADING_P (c))
6306         {
6307           if (c < 0xC2          /* overlong sequence */
6308               || src + 1 >= end
6309               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6310             return -1;
6311           src += 2;
6312         }
6313       else if (UTF_8_3_OCTET_LEADING_P (c))
6314         {
6315           if (src + 2 >= end
6316               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6317                     && UTF_8_EXTRA_OCTET_P (src[2])))
6318             return -1;
6319           c = (((c & 0xF) << 12)
6320                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6321           if (c < 0x800                       /* overlong sequence */
6322               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6323             return -1;
6324           src += 3;
6325         }
6326       else if (UTF_8_4_OCTET_LEADING_P (c))
6327         {
6328           if (src + 3 >= end
6329               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6330                     && UTF_8_EXTRA_OCTET_P (src[2])
6331                     && UTF_8_EXTRA_OCTET_P (src[3])))
6332             return -1;
6333           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6334                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6335           if (c < 0x10000       /* overlong sequence */
6336               || c >= 0x110000) /* non-Unicode character  */
6337             return -1;
6338           src += 4;
6339         }
6340       else
6341         return -1;
6342       nchars++;
6343     }
6344
6345   if (src == end)
6346     {
6347       if (! UTF_8_1_OCTET_P (*src))
6348         return -1;
6349       nchars++;
6350       if (*src == '\r')
6351         eol_seen |= EOL_SEEN_CR;
6352       else if (*src  == '\n')
6353         eol_seen |= EOL_SEEN_LF;
6354     }
6355   coding->eol_seen = eol_seen;
6356   return nchars;
6357 }
6358
6359
6360 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6361    SOURCE is encoded.  If CATEGORY is one of
6362    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6363    two-byte, else they are encoded by one-byte.
6364
6365    Return one of EOL_SEEN_XXX.  */
6366
6367 #define MAX_EOL_CHECK_COUNT 3
6368
6369 static int
6370 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6371             enum coding_category category)
6372 {
6373   const unsigned char *src = source, *src_end = src + src_bytes;
6374   unsigned char c;
6375   int total  = 0;
6376   int eol_seen = EOL_SEEN_NONE;
6377
6378   if ((1 << category) & CATEGORY_MASK_UTF_16)
6379     {
6380       bool msb = category == (coding_category_utf_16_le
6381                               | coding_category_utf_16_le_nosig);
6382       bool lsb = !msb;
6383
6384       while (src + 1 < src_end)
6385         {
6386           c = src[lsb];
6387           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6388             {
6389               int this_eol;
6390
6391               if (c == '\n')
6392                 this_eol = EOL_SEEN_LF;
6393               else if (src + 3 >= src_end
6394                        || src[msb + 2] != 0
6395                        || src[lsb + 2] != '\n')
6396                 this_eol = EOL_SEEN_CR;
6397               else
6398                 {
6399                   this_eol = EOL_SEEN_CRLF;
6400                   src += 2;
6401                 }
6402
6403               if (eol_seen == EOL_SEEN_NONE)
6404                 /* This is the first end-of-line.  */
6405                 eol_seen = this_eol;
6406               else if (eol_seen != this_eol)
6407                 {
6408                   /* The found type is different from what found before.
6409                      Allow for stray ^M characters in DOS EOL files.  */
6410                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6411                       || (eol_seen == EOL_SEEN_CRLF
6412                           && this_eol == EOL_SEEN_CR))
6413                     eol_seen = EOL_SEEN_CRLF;
6414                   else
6415                     {
6416                       eol_seen = EOL_SEEN_LF;
6417                       break;
6418                     }
6419                 }
6420               if (++total == MAX_EOL_CHECK_COUNT)
6421                 break;
6422             }
6423           src += 2;
6424         }
6425     }
6426   else
6427     while (src < src_end)
6428       {
6429         c = *src++;
6430         if (c == '\n' || c == '\r')
6431           {
6432             int this_eol;
6433
6434             if (c == '\n')
6435               this_eol = EOL_SEEN_LF;
6436             else if (src >= src_end || *src != '\n')
6437               this_eol = EOL_SEEN_CR;
6438             else
6439               this_eol = EOL_SEEN_CRLF, src++;
6440
6441             if (eol_seen == EOL_SEEN_NONE)
6442               /* This is the first end-of-line.  */
6443               eol_seen = this_eol;
6444             else if (eol_seen != this_eol)
6445               {
6446                 /* The found type is different from what found before.
6447                    Allow for stray ^M characters in DOS EOL files.  */
6448                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6449                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6450                   eol_seen = EOL_SEEN_CRLF;
6451                 else
6452                   {
6453                     eol_seen = EOL_SEEN_LF;
6454                     break;
6455                   }
6456               }
6457             if (++total == MAX_EOL_CHECK_COUNT)
6458               break;
6459           }
6460       }
6461   return eol_seen;
6462 }
6463
6464
6465 static Lisp_Object
6466 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6467 {
6468   Lisp_Object eol_type;
6469
6470   eol_type = CODING_ID_EOL_TYPE (coding->id);
6471   if (! VECTORP (eol_type))
6472     /* Already adjusted.  */
6473     return eol_type;
6474   if (eol_seen & EOL_SEEN_LF)
6475     {
6476       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6477       eol_type = Qunix;
6478     }
6479   else if (eol_seen & EOL_SEEN_CRLF)
6480     {
6481       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6482       eol_type = Qdos;
6483     }
6484   else if (eol_seen & EOL_SEEN_CR)
6485     {
6486       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6487       eol_type = Qmac;
6488     }
6489   return eol_type;
6490 }
6491
6492 /* Detect how a text specified in CODING is encoded.  If a coding
6493    system is detected, update fields of CODING by the detected coding
6494    system.  */
6495
6496 static void
6497 detect_coding (struct coding_system *coding)
6498 {
6499   const unsigned char *src, *src_end;
6500   unsigned int saved_mode = coding->mode;
6501   Lisp_Object found = Qnil;
6502   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6503
6504   coding->consumed = coding->consumed_char = 0;
6505   coding->produced = coding->produced_char = 0;
6506   coding_set_source (coding);
6507
6508   src_end = coding->source + coding->src_bytes;
6509
6510   coding->eol_seen = EOL_SEEN_NONE;
6511   /* If we have not yet decided the text encoding type, detect it
6512      now.  */
6513   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6514     {
6515       int c, i;
6516       struct coding_detection_info detect_info;
6517       bool null_byte_found = 0, eight_bit_found = 0;
6518       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6519                                        inhibit_null_byte_detection);
6520       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6521                                        inhibit_iso_escape_detection);
6522       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6523
6524       coding->head_ascii = 0;
6525       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6526       for (src = coding->source; src < src_end; src++)
6527         {
6528           c = *src;
6529           if (c & 0x80)
6530             {
6531               eight_bit_found = 1;
6532               if (null_byte_found)
6533                 break;
6534             }
6535           else if (c < 0x20)
6536             {
6537               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6538                   && ! inhibit_ied
6539                   && ! detect_info.checked)
6540                 {
6541                   if (detect_coding_iso_2022 (coding, &detect_info))
6542                     {
6543                       /* We have scanned the whole data.  */
6544                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6545                         {
6546                           /* We didn't find an 8-bit code.  We may
6547                              have found a null-byte, but it's very
6548                              rare that a binary file conforms to
6549                              ISO-2022.  */
6550                           src = src_end;
6551                           coding->head_ascii = src - coding->source;
6552                         }
6553                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6554                       break;
6555                     }
6556                 }
6557               else if (! c && !inhibit_nbd)
6558                 {
6559                   null_byte_found = 1;
6560                   if (eight_bit_found)
6561                     break;
6562                 }
6563               else if (! disable_ascii_optimization
6564                        && ! inhibit_eol_conversion)
6565                 {
6566                   if (c == '\r')
6567                     {
6568                       if (src < src_end && src[1] == '\n')
6569                         {
6570                           coding->eol_seen |= EOL_SEEN_CRLF;
6571                           src++;
6572                           if (! eight_bit_found)
6573                             coding->head_ascii++;
6574                         }
6575                       else
6576                         coding->eol_seen |= EOL_SEEN_CR;
6577                     }
6578                   else if (c == '\n')
6579                     {
6580                       coding->eol_seen |= EOL_SEEN_LF;
6581                     }
6582                 }
6583
6584               if (! eight_bit_found)
6585                 coding->head_ascii++;
6586             }
6587           else if (! eight_bit_found)
6588             coding->head_ascii++;
6589         }
6590
6591       if (null_byte_found || eight_bit_found
6592           || coding->head_ascii < coding->src_bytes
6593           || detect_info.found)
6594         {
6595           enum coding_category category;
6596           struct coding_system *this;
6597
6598           if (coding->head_ascii == coding->src_bytes)
6599             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6600             for (i = 0; i < coding_category_raw_text; i++)
6601               {
6602                 category = coding_priorities[i];
6603                 this = coding_categories + category;
6604                 if (detect_info.found & (1 << category))
6605                   break;
6606               }
6607           else
6608             {
6609               if (null_byte_found)
6610                 {
6611                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6612                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6613                 }
6614               else if (prefer_utf_8
6615                        && detect_coding_utf_8 (coding, &detect_info))
6616                 {
6617                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6618                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6619                 }
6620               for (i = 0; i < coding_category_raw_text; i++)
6621                 {
6622                   category = coding_priorities[i];
6623                   this = coding_categories + category;
6624                   /* Some of this->detector (e.g. detect_coding_sjis)
6625                      require this information.  */
6626                   coding->id = this->id;
6627                   if (this->id < 0)
6628                     {
6629                       /* No coding system of this category is defined.  */
6630                       detect_info.rejected |= (1 << category);
6631                     }
6632                   else if (category >= coding_category_raw_text)
6633                     continue;
6634                   else if (detect_info.checked & (1 << category))
6635                     {
6636                       if (detect_info.found & (1 << category))
6637                         break;
6638                     }
6639                   else if ((*(this->detector)) (coding, &detect_info)
6640                            && detect_info.found & (1 << category))
6641                     break;
6642                 }
6643             }
6644
6645           if (i < coding_category_raw_text)
6646             {
6647               if (category == coding_category_utf_8_auto)
6648                 {
6649                   Lisp_Object coding_systems;
6650
6651                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6652                                          coding_attr_utf_bom);
6653                   if (CONSP (coding_systems))
6654                     {
6655                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6656                         found = XCAR (coding_systems);
6657                       else
6658                         found = XCDR (coding_systems);
6659                     }
6660                   else
6661                     found = CODING_ID_NAME (this->id);
6662                 }
6663               else if (category == coding_category_utf_16_auto)
6664                 {
6665                   Lisp_Object coding_systems;
6666
6667                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6668                                          coding_attr_utf_bom);
6669                   if (CONSP (coding_systems))
6670                     {
6671                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6672                         found = XCAR (coding_systems);
6673                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6674                         found = XCDR (coding_systems);
6675                     }
6676                   else
6677                     found = CODING_ID_NAME (this->id);
6678                 }
6679               else
6680                 found = CODING_ID_NAME (this->id);
6681             }
6682           else if (null_byte_found)
6683             found = Qno_conversion;
6684           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6685                    == CATEGORY_MASK_ANY)
6686             found = Qraw_text;
6687           else if (detect_info.rejected)
6688             for (i = 0; i < coding_category_raw_text; i++)
6689               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6690                 {
6691                   this = coding_categories + coding_priorities[i];
6692                   found = CODING_ID_NAME (this->id);
6693                   break;
6694                 }
6695         }
6696     }
6697   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6698            == coding_category_utf_8_auto)
6699     {
6700       Lisp_Object coding_systems;
6701       struct coding_detection_info detect_info;
6702
6703       coding_systems
6704         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6705       detect_info.found = detect_info.rejected = 0;
6706       if (check_ascii (coding) == coding->src_bytes)
6707         {
6708           if (CONSP (coding_systems))
6709             found = XCDR (coding_systems);
6710         }
6711       else
6712         {
6713           if (CONSP (coding_systems)
6714               && detect_coding_utf_8 (coding, &detect_info))
6715             {
6716               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6717                 found = XCAR (coding_systems);
6718               else
6719                 found = XCDR (coding_systems);
6720             }
6721         }
6722     }
6723   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6724            == coding_category_utf_16_auto)
6725     {
6726       Lisp_Object coding_systems;
6727       struct coding_detection_info detect_info;
6728
6729       coding_systems
6730         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6731       detect_info.found = detect_info.rejected = 0;
6732       coding->head_ascii = 0;
6733       if (CONSP (coding_systems)
6734           && detect_coding_utf_16 (coding, &detect_info))
6735         {
6736           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6737             found = XCAR (coding_systems);
6738           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6739             found = XCDR (coding_systems);
6740         }
6741     }
6742
6743   if (! NILP (found))
6744     {
6745       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6746                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6747                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6748                            : EOL_SEEN_LF);
6749
6750       setup_coding_system (found, coding);
6751       if (specified_eol != EOL_SEEN_NONE)
6752         adjust_coding_eol_type (coding, specified_eol);
6753     }
6754
6755   coding->mode = saved_mode;
6756 }
6757
6758
6759 static void
6760 decode_eol (struct coding_system *coding)
6761 {
6762   Lisp_Object eol_type;
6763   unsigned char *p, *pbeg, *pend;
6764
6765   eol_type = CODING_ID_EOL_TYPE (coding->id);
6766   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6767     return;
6768
6769   if (NILP (coding->dst_object))
6770     pbeg = coding->destination;
6771   else
6772     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6773   pend = pbeg + coding->produced;
6774
6775   if (VECTORP (eol_type))
6776     {
6777       int eol_seen = EOL_SEEN_NONE;
6778
6779       for (p = pbeg; p < pend; p++)
6780         {
6781           if (*p == '\n')
6782             eol_seen |= EOL_SEEN_LF;
6783           else if (*p == '\r')
6784             {
6785               if (p + 1 < pend && *(p + 1) == '\n')
6786                 {
6787                   eol_seen |= EOL_SEEN_CRLF;
6788                   p++;
6789                 }
6790               else
6791                 eol_seen |= EOL_SEEN_CR;
6792             }
6793         }
6794       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6795       if ((eol_seen & EOL_SEEN_CRLF) != 0
6796           && (eol_seen & EOL_SEEN_CR) != 0
6797           && (eol_seen & EOL_SEEN_LF) == 0)
6798         eol_seen = EOL_SEEN_CRLF;
6799       else if (eol_seen != EOL_SEEN_NONE
6800           && eol_seen != EOL_SEEN_LF
6801           && eol_seen != EOL_SEEN_CRLF
6802           && eol_seen != EOL_SEEN_CR)
6803         eol_seen = EOL_SEEN_LF;
6804       if (eol_seen != EOL_SEEN_NONE)
6805         eol_type = adjust_coding_eol_type (coding, eol_seen);
6806     }
6807
6808   if (EQ (eol_type, Qmac))
6809     {
6810       for (p = pbeg; p < pend; p++)
6811         if (*p == '\r')
6812           *p = '\n';
6813     }
6814   else if (EQ (eol_type, Qdos))
6815     {
6816       ptrdiff_t n = 0;
6817       ptrdiff_t pos = coding->dst_pos;
6818       ptrdiff_t pos_byte = coding->dst_pos_byte;
6819       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6820
6821       /* This assertion is here instead of code, now deleted, that
6822          handled the NILP case, which no longer happens with the
6823          current codebase.  */
6824       eassert (!NILP (coding->dst_object));
6825
6826       while (pos_byte < pos_end)
6827         {
6828           p = BYTE_POS_ADDR (pos_byte);
6829           if (*p == '\r' && p[1] == '\n')
6830             {
6831               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6832               n++;
6833               pos_end--;
6834             }
6835           pos++;
6836           if (coding->dst_multibyte)
6837             pos_byte += BYTES_BY_CHAR_HEAD (*p);
6838           else
6839             pos_byte++;
6840         }
6841       coding->produced -= n;
6842       coding->produced_char -= n;
6843     }
6844 }
6845
6846
6847 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6848    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6849    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6850 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6851
6852 /* Return a translation table (or list of them) from coding system
6853    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6854    not ENCODEP). */
6855
6856 static Lisp_Object
6857 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6858 {
6859   Lisp_Object standard, translation_table;
6860   Lisp_Object val;
6861
6862   if (NILP (Venable_character_translation))
6863     {
6864       if (max_lookup)
6865         *max_lookup = 0;
6866       return Qnil;
6867     }
6868   if (encodep)
6869     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6870       standard = Vstandard_translation_table_for_encode;
6871   else
6872     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6873       standard = Vstandard_translation_table_for_decode;
6874   if (NILP (translation_table))
6875     translation_table = standard;
6876   else
6877     {
6878       if (SYMBOLP (translation_table))
6879         translation_table = Fget (translation_table, Qtranslation_table);
6880       else if (CONSP (translation_table))
6881         {
6882           translation_table = Fcopy_sequence (translation_table);
6883           for (val = translation_table; CONSP (val); val = XCDR (val))
6884             if (SYMBOLP (XCAR (val)))
6885               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6886         }
6887       if (CHAR_TABLE_P (standard))
6888         {
6889           if (CONSP (translation_table))
6890             translation_table = nconc2 (translation_table, list1 (standard));
6891           else
6892             translation_table = list2 (translation_table, standard);
6893         }
6894     }
6895
6896   if (max_lookup)
6897     {
6898       *max_lookup = 1;
6899       if (CHAR_TABLE_P (translation_table)
6900           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6901         {
6902           val = XCHAR_TABLE (translation_table)->extras[1];
6903           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6904             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6905         }
6906       else if (CONSP (translation_table))
6907         {
6908           Lisp_Object tail;
6909
6910           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6911             if (CHAR_TABLE_P (XCAR (tail))
6912                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6913               {
6914                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6915                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6916                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6917               }
6918         }
6919     }
6920   return translation_table;
6921 }
6922
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  If an entry is itself a character, C is replaced by
   it and TRANS is set to Qnil; with a list, the lookup then goes on
   in the following tables using the new C.  Otherwise TRANS is set
   to the entry found, and with a list the search stops at the first
   non-nil one.  C and TRANS must be lvalues.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail = table;                               \
                                                                \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6947
6948
6949 /* Return a translation of character(s) at BUF according to TRANS.
6950    TRANS is TO-CHAR, [TO-CHAR ...], or ((FROM .  TO) ...) where FROM =
6951    [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].  The return value
6952    is TO-CHAR or [TO-CHAR ...] if a translation is found, Qnil if not
6953    found, or Qt if BUF is too short to lookup characters in FROM.  As
6954    a side effect, if a translation is found, *NCHARS is set to the
6955    number of characters being translated.  */
6956
6957 static Lisp_Object
6958 get_translation (Lisp_Object trans, int *buf, int *buf_end, ptrdiff_t *nchars)
6959 {
6960   if (INTEGERP (trans) || VECTORP (trans))
6961     {
6962       *nchars = 1;
6963       return trans;
6964     }
6965   for (; CONSP (trans); trans = XCDR (trans))
6966     {
6967       Lisp_Object val = XCAR (trans);
6968       Lisp_Object from = XCAR (val);
6969       ptrdiff_t len = ASIZE (from);
6970       ptrdiff_t i;
6971
6972       for (i = 0; i < len; i++)
6973         {
6974           if (buf + i == buf_end)
6975             return Qt;
6976           if (XINT (AREF (from, i)) != buf[i])
6977             break;
6978         }
6979       if (i == len)
6980         {
6981           *nchars = len;
6982           return XCDR (val);
6983         }
6984     }
6985   return Qnil;
6986 }
6987
6988
6989 static int
6990 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6991                bool last_block)
6992 {
6993   unsigned char *dst = coding->destination + coding->produced;
6994   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6995   ptrdiff_t produced;
6996   ptrdiff_t produced_chars = 0;
6997   int carryover = 0;
6998
6999   if (! coding->chars_at_source)
7000     {
7001       /* Source characters are in coding->charbuf.  */
7002       int *buf = coding->charbuf;
7003       int *buf_end = buf + coding->charbuf_used;
7004
7005       if (EQ (coding->src_object, coding->dst_object)
7006           && ! NILP (coding->dst_object))
7007         {
7008           eassert (growable_destination (coding));
7009           coding_set_source (coding);
7010           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7011         }
7012
7013       while (buf < buf_end)
7014         {
7015           int c = *buf;
7016           ptrdiff_t i;
7017
7018           if (c >= 0)
7019             {
7020               ptrdiff_t from_nchars = 1, to_nchars = 1;
7021               Lisp_Object trans = Qnil;
7022
7023               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7024               if (! NILP (trans))
7025                 {
7026                   trans = get_translation (trans, buf, buf_end, &from_nchars);
7027                   if (INTEGERP (trans))
7028                     c = XINT (trans);
7029                   else if (VECTORP (trans))
7030                     {
7031                       to_nchars = ASIZE (trans);
7032                       c = XINT (AREF (trans, 0));
7033                     }
7034                   else if (EQ (trans, Qt) && ! last_block)
7035                     break;
7036                 }
7037
7038               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7039                 {
7040                   eassert (growable_destination (coding));
7041                   ptrdiff_t dst_size;
7042                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7043                                           &dst_size)
7044                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7045                     memory_full (SIZE_MAX);
7046                   dst = alloc_destination (coding, dst_size, dst);
7047                   if (EQ (coding->src_object, coding->dst_object))
7048                     {
7049                       coding_set_source (coding);
7050                       dst_end = (((unsigned char *) coding->source)
7051                                  + coding->consumed);
7052                     }
7053                   else
7054                     dst_end = coding->destination + coding->dst_bytes;
7055                 }
7056
7057               for (i = 0; i < to_nchars; i++)
7058                 {
7059                   if (i > 0)
7060                     c = XINT (AREF (trans, i));
7061                   if (coding->dst_multibyte
7062                       || ! CHAR_BYTE8_P (c))
7063                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7064                   else
7065                     *dst++ = CHAR_TO_BYTE8 (c);
7066                 }
7067               produced_chars += to_nchars;
7068               buf += from_nchars;
7069             }
7070           else
7071             /* This is an annotation datum.  (-C) is the length.  */
7072             buf += -c;
7073         }
7074       carryover = buf_end - buf;
7075     }
7076   else
7077     {
7078       /* Source characters are at coding->source.  */
7079       const unsigned char *src = coding->source;
7080       const unsigned char *src_end = src + coding->consumed;
7081
7082       if (EQ (coding->dst_object, coding->src_object))
7083         {
7084           eassert (growable_destination (coding));
7085           dst_end = (unsigned char *) src;
7086         }
7087       if (coding->src_multibyte != coding->dst_multibyte)
7088         {
7089           if (coding->src_multibyte)
7090             {
7091               bool multibytep = 1;
7092               ptrdiff_t consumed_chars = 0;
7093
7094               while (1)
7095                 {
7096                   const unsigned char *src_base = src;
7097                   int c;
7098
7099                   ONE_MORE_BYTE (c);
7100                   if (dst == dst_end)
7101                     {
7102                       eassert (growable_destination (coding));
7103                       if (EQ (coding->src_object, coding->dst_object))
7104                         dst_end = (unsigned char *) src;
7105                       if (dst == dst_end)
7106                         {
7107                           ptrdiff_t offset = src - coding->source;
7108
7109                           dst = alloc_destination (coding, src_end - src + 1,
7110                                                    dst);
7111                           dst_end = coding->destination + coding->dst_bytes;
7112                           coding_set_source (coding);
7113                           src = coding->source + offset;
7114                           src_end = coding->source + coding->consumed;
7115                           if (EQ (coding->src_object, coding->dst_object))
7116                             dst_end = (unsigned char *) src;
7117                         }
7118                     }
7119                   *dst++ = c;
7120                   produced_chars++;
7121                 }
7122             no_more_source:
7123               ;
7124             }
7125           else
7126             while (src < src_end)
7127               {
7128                 bool multibytep = 1;
7129                 int c = *src++;
7130
7131                 if (dst >= dst_end - 1)
7132                   {
7133                     eassert (growable_destination (coding));
7134                     if (EQ (coding->src_object, coding->dst_object))
7135                       dst_end = (unsigned char *) src;
7136                     if (dst >= dst_end - 1)
7137                       {
7138                         ptrdiff_t offset = src - coding->source;
7139                         ptrdiff_t more_bytes;
7140
7141                         if (EQ (coding->src_object, coding->dst_object))
7142                           more_bytes = ((src_end - src) / 2) + 2;
7143                         else
7144                           more_bytes = src_end - src + 2;
7145                         dst = alloc_destination (coding, more_bytes, dst);
7146                         dst_end = coding->destination + coding->dst_bytes;
7147                         coding_set_source (coding);
7148                         src = coding->source + offset;
7149                         src_end = coding->source + coding->consumed;
7150                         if (EQ (coding->src_object, coding->dst_object))
7151                           dst_end = (unsigned char *) src;
7152                       }
7153                   }
7154                 EMIT_ONE_BYTE (c);
7155               }
7156         }
7157       else
7158         {
7159           if (!EQ (coding->src_object, coding->dst_object))
7160             {
7161               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7162
7163               if (require > 0)
7164                 {
7165                   ptrdiff_t offset = src - coding->source;
7166
7167                   dst = alloc_destination (coding, require, dst);
7168                   coding_set_source (coding);
7169                   src = coding->source + offset;
7170                   src_end = coding->source + coding->consumed;
7171                 }
7172             }
7173           produced_chars = coding->consumed_char;
7174           while (src < src_end)
7175             *dst++ = *src++;
7176         }
7177     }
7178
7179   produced = dst - (coding->destination + coding->produced);
7180   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7181     insert_from_gap (produced_chars, produced, 0);
7182   coding->produced += produced;
7183   coding->produced_char += produced_chars;
7184   return carryover;
7185 }
7186
7187 /* Compose text in CODING->object according to the annotation data at
7188    CHARBUF.  CHARBUF is an array:
7189      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7190  */
7191
7192 static void
7193 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7194 {
7195   int len;
7196   ptrdiff_t to;
7197   enum composition_method method;
7198   Lisp_Object components;
7199
7200   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7201   to = pos + charbuf[2];
7202   method = (enum composition_method) (charbuf[4]);
7203
7204   if (method == COMPOSITION_RELATIVE)
7205     components = Qnil;
7206   else
7207     {
7208       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7209       int i, j;
7210
7211       if (method == COMPOSITION_WITH_RULE)
7212         len = charbuf[2] * 3 - 2;
7213       charbuf += MAX_ANNOTATION_LENGTH;
7214       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7215       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7216         {
7217           if (charbuf[i] >= 0)
7218             args[j] = make_number (charbuf[i]);
7219           else
7220             {
7221               i++;
7222               args[j] = make_number (charbuf[i] % 0x100);
7223             }
7224         }
7225       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7226     }
7227   compose_text (pos, to, components, Qnil, coding->dst_object);
7228 }
7229
7230
7231 /* Put `charset' property on text in CODING->object according to
7232    the annotation data at CHARBUF.  CHARBUF is an array:
7233      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7234  */
7235
7236 static void
7237 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7238 {
7239   ptrdiff_t from = pos - charbuf[2];
7240   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7241
7242   Fput_text_property (make_number (from), make_number (pos),
7243                       Qcharset, CHARSET_NAME (charset),
7244                       coding->dst_object);
7245 }
7246
/* Upper bound, counted in ints, on the size of the conversion work
   area allocated by ALLOC_CONVERSION_WORK_AREA.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area of CODING with SAFE_ALLOCA and record its
   size.  The area holds SIZE + MAX_CHARBUF_EXTRA_SIZE ints, capped at
   MAX_CHARBUF_SIZE.  Because SAFE_ALLOCA is used, this must be
   expanded in a function that has USE_SAFE_ALLOCA in effect.

   CODING is parenthesized so that any pointer-valued expression
   (e.g. `p + 1') may be passed; note that it is evaluated twice.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)		\
  do {								\
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,	\
			   MAX_CHARBUF_SIZE);			\
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));	\
    (coding)->charbuf_size = units;				\
  } while (0)
7260
7261 static void
7262 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7263 {
7264   int *charbuf = coding->charbuf;
7265   int *charbuf_end = charbuf + coding->charbuf_used;
7266
7267   if (NILP (coding->dst_object))
7268     return;
7269
7270   while (charbuf < charbuf_end)
7271     {
7272       if (*charbuf >= 0)
7273         pos++, charbuf++;
7274       else
7275         {
7276           int len = -*charbuf;
7277
7278           if (len > 2)
7279             switch (charbuf[1])
7280               {
7281               case CODING_ANNOTATE_COMPOSITION_MASK:
7282                 produce_composition (coding, charbuf, pos);
7283                 break;
7284               case CODING_ANNOTATE_CHARSET_MASK:
7285                 produce_charset (coding, charbuf, pos);
7286                 break;
7287               default:
7288                 break;
7289               }
7290           charbuf += len;
7291         }
7292     }
7293 }
7294
7295 /* Decode the data at CODING->src_object into CODING->dst_object.
7296    CODING->src_object is a buffer, a string, or nil.
7297    CODING->dst_object is a buffer.
7298
7299    If CODING->src_object is a buffer, it must be the current buffer.
7300    In this case, if CODING->src_pos is positive, it is a position of
7301    the source text in the buffer, otherwise, the source text is in the
7302    gap area of the buffer, and CODING->src_pos specifies the offset of
7303    the text from GPT (which must be the same as PT).  If this is the
7304    same buffer as CODING->dst_object, CODING->src_pos must be
7305    negative.
7306
7307    If CODING->src_object is a string, CODING->src_pos is an index to
7308    that string.
7309
7310    If CODING->src_object is nil, CODING->source must already point to
7311    the non-relocatable memory area.  In this case, CODING->src_pos is
7312    an offset from CODING->source.
7313
7314    The decoded data is inserted at the current point of the buffer
7315    CODING->dst_object.
7316 */
7317
7318 static void
7319 decode_coding (struct coding_system *coding)
7320 {
7321   Lisp_Object attrs;
7322   Lisp_Object undo_list;
7323   Lisp_Object translation_table;
7324   struct ccl_spec cclspec;
7325   int carryover;
7326   int i;
7327
7328   USE_SAFE_ALLOCA;
7329
7330   if (BUFFERP (coding->src_object)
7331       && coding->src_pos > 0
7332       && coding->src_pos < GPT
7333       && coding->src_pos + coding->src_chars > GPT)
7334     move_gap_both (coding->src_pos, coding->src_pos_byte);
7335
7336   undo_list = Qt;
7337   if (BUFFERP (coding->dst_object))
7338     {
7339       set_buffer_internal (XBUFFER (coding->dst_object));
7340       if (GPT != PT)
7341         move_gap_both (PT, PT_BYTE);
7342
7343       /* We must disable undo_list in order to record the whole insert
7344          transaction via record_insert at the end.  But doing so also
7345          disables the recording of the first change to the undo_list.
7346          Therefore we check for first change here and record it via
7347          record_first_change if needed.  */
7348       if (MODIFF <= SAVE_MODIFF)
7349         record_first_change ();
7350
7351       undo_list = BVAR (current_buffer, undo_list);
7352       bset_undo_list (current_buffer, Qt);
7353     }
7354
7355   coding->consumed = coding->consumed_char = 0;
7356   coding->produced = coding->produced_char = 0;
7357   coding->chars_at_source = 0;
7358   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7359
7360   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7361
7362   attrs = CODING_ID_ATTRS (coding->id);
7363   translation_table = get_translation_table (attrs, 0, NULL);
7364
7365   carryover = 0;
7366   if (coding->decoder == decode_coding_ccl)
7367     {
7368       coding->spec.ccl = &cclspec;
7369       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7370     }
7371   do
7372     {
7373       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7374
7375       coding_set_source (coding);
7376       coding->annotated = 0;
7377       coding->charbuf_used = carryover;
7378       (*(coding->decoder)) (coding);
7379       coding_set_destination (coding);
7380       carryover = produce_chars (coding, translation_table, 0);
7381       if (coding->annotated)
7382         produce_annotation (coding, pos);
7383       for (i = 0; i < carryover; i++)
7384         coding->charbuf[i]
7385           = coding->charbuf[coding->charbuf_used - carryover + i];
7386     }
7387   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7388          || (coding->consumed < coding->src_bytes
7389              && (coding->result == CODING_RESULT_SUCCESS
7390                  || coding->result == CODING_RESULT_INVALID_SRC)));
7391
7392   if (carryover > 0)
7393     {
7394       coding_set_destination (coding);
7395       coding->charbuf_used = carryover;
7396       produce_chars (coding, translation_table, 1);
7397     }
7398
7399   coding->carryover_bytes = 0;
7400   if (coding->consumed < coding->src_bytes)
7401     {
7402       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7403       const unsigned char *src;
7404
7405       coding_set_source (coding);
7406       coding_set_destination (coding);
7407       src = coding->source + coding->consumed;
7408
7409       if (coding->mode & CODING_MODE_LAST_BLOCK)
7410         {
7411           /* Flush out unprocessed data as binary chars.  We are sure
7412              that the number of data is less than the size of
7413              coding->charbuf.  */
7414           coding->charbuf_used = 0;
7415           coding->chars_at_source = 0;
7416
7417           while (nbytes-- > 0)
7418             {
7419               int c = *src++;
7420
7421               if (c & 0x80)
7422                 c = BYTE8_TO_CHAR (c);
7423               coding->charbuf[coding->charbuf_used++] = c;
7424             }
7425           produce_chars (coding, Qnil, 1);
7426         }
7427       else
7428         {
7429           /* Record unprocessed bytes in coding->carryover.  We are
7430              sure that the number of data is less than the size of
7431              coding->carryover.  */
7432           unsigned char *p = coding->carryover;
7433
7434           if (nbytes > sizeof coding->carryover)
7435             nbytes = sizeof coding->carryover;
7436           coding->carryover_bytes = nbytes;
7437           while (nbytes-- > 0)
7438             *p++ = *src++;
7439         }
7440       coding->consumed = coding->src_bytes;
7441     }
7442
7443   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7444       && !inhibit_eol_conversion)
7445     decode_eol (coding);
7446   if (BUFFERP (coding->dst_object))
7447     {
7448       bset_undo_list (current_buffer, undo_list);
7449       record_insert (coding->dst_pos, coding->produced_char);
7450     }
7451
7452   SAFE_FREE ();
7453 }
7454
7455
7456 /* Extract an annotation datum from a composition starting at POS and
7457    ending before LIMIT of CODING->src_object (buffer or string), store
7458    the data in BUF, set *STOP to a starting position of the next
7459    composition (if any) or to LIMIT, and return the address of the
7460    next element of BUF.
7461
7462    If such an annotation is not found, set *STOP to a starting
7463    position of a composition after POS (if any) or to LIMIT, and
7464    return BUF.  */
7465
7466 static int *
7467 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7468                                struct coding_system *coding, int *buf,
7469                                ptrdiff_t *stop)
7470 {
7471   ptrdiff_t start, end;
7472   Lisp_Object prop;
7473
7474   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7475       || end > limit)
7476     *stop = limit;
7477   else if (start > pos)
7478     *stop = start;
7479   else
7480     {
7481       if (start == pos)
7482         {
7483           /* We found a composition.  Store the corresponding
7484              annotation data in BUF.  */
7485           int *head = buf;
7486           enum composition_method method = composition_method (prop);
7487           int nchars = COMPOSITION_LENGTH (prop);
7488
7489           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7490           if (method != COMPOSITION_RELATIVE)
7491             {
7492               Lisp_Object components;
7493               ptrdiff_t i, len, i_byte;
7494
7495               components = COMPOSITION_COMPONENTS (prop);
7496               if (VECTORP (components))
7497                 {
7498                   len = ASIZE (components);
7499                   for (i = 0; i < len; i++)
7500                     *buf++ = XINT (AREF (components, i));
7501                 }
7502               else if (STRINGP (components))
7503                 {
7504                   len = SCHARS (components);
7505                   i = i_byte = 0;
7506                   while (i < len)
7507                     {
7508                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7509                       buf++;
7510                     }
7511                 }
7512               else if (INTEGERP (components))
7513                 {
7514                   len = 1;
7515                   *buf++ = XINT (components);
7516                 }
7517               else if (CONSP (components))
7518                 {
7519                   for (len = 0; CONSP (components);
7520                        len++, components = XCDR (components))
7521                     *buf++ = XINT (XCAR (components));
7522                 }
7523               else
7524                 emacs_abort ();
7525               *head -= len;
7526             }
7527         }
7528
7529       if (find_composition (end, limit, &start, &end, &prop,
7530                             coding->src_object)
7531           && end <= limit)
7532         *stop = start;
7533       else
7534         *stop = limit;
7535     }
7536   return buf;
7537 }
7538
7539
7540 /* Extract an annotation datum from a text property `charset' at POS of
7541    CODING->src_object (buffer of string), store the data in BUF, set
7542    *STOP to the position where the value of `charset' property changes
7543    (limiting by LIMIT), and return the address of the next element of
7544    BUF.
7545
7546    If the property value is nil, set *STOP to the position where the
7547    property value is non-nil (limiting by LIMIT), and return BUF.  */
7548
7549 static int *
7550 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7551                            struct coding_system *coding, int *buf,
7552                            ptrdiff_t *stop)
7553 {
7554   Lisp_Object val, next;
7555   int id;
7556
7557   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7558   if (! NILP (val) && CHARSETP (val))
7559     id = XINT (CHARSET_SYMBOL_ID (val));
7560   else
7561     id = -1;
7562   ADD_CHARSET_DATA (buf, 0, id);
7563   next = Fnext_single_property_change (make_number (pos), Qcharset,
7564                                        coding->src_object,
7565                                        make_number (limit));
7566   *stop = XINT (next);
7567   return buf;
7568 }
7569
7570
7571 static void
7572 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7573                int max_lookup)
7574 {
7575   int *buf = coding->charbuf;
7576   int *buf_end = coding->charbuf + coding->charbuf_size;
7577   const unsigned char *src = coding->source + coding->consumed;
7578   const unsigned char *src_end = coding->source + coding->src_bytes;
7579   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7580   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7581   bool multibytep = coding->src_multibyte;
7582   Lisp_Object eol_type;
7583   int c;
7584   ptrdiff_t stop, stop_composition, stop_charset;
7585   int *lookup_buf = NULL;
7586
7587   if (! NILP (translation_table))
7588     lookup_buf = alloca (sizeof (int) * max_lookup);
7589
7590   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7591   if (VECTORP (eol_type))
7592     eol_type = Qunix;
7593
7594   /* Note: composition handling is not yet implemented.  */
7595   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7596
7597   if (NILP (coding->src_object))
7598     stop = stop_composition = stop_charset = end_pos;
7599   else
7600     {
7601       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7602         stop = stop_composition = pos;
7603       else
7604         stop = stop_composition = end_pos;
7605       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7606         stop = stop_charset = pos;
7607       else
7608         stop_charset = end_pos;
7609     }
7610
7611   /* Compensate for CRLF and conversion.  */
7612   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7613   while (buf < buf_end)
7614     {
7615       Lisp_Object trans;
7616
7617       if (pos == stop)
7618         {
7619           if (pos == end_pos)
7620             break;
7621           if (pos == stop_composition)
7622             buf = handle_composition_annotation (pos, end_pos, coding,
7623                                                  buf, &stop_composition);
7624           if (pos == stop_charset)
7625             buf = handle_charset_annotation (pos, end_pos, coding,
7626                                              buf, &stop_charset);
7627           stop = (stop_composition < stop_charset
7628                   ? stop_composition : stop_charset);
7629         }
7630
7631       if (! multibytep)
7632         {
7633           int bytes;
7634
7635           if (coding->encoder == encode_coding_raw_text
7636               || coding->encoder == encode_coding_ccl)
7637             c = *src++, pos++;
7638           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7639             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7640           else
7641             c = BYTE8_TO_CHAR (*src), src++, pos++;
7642         }
7643       else
7644         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7645       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7646         c = '\n';
7647       if (! EQ (eol_type, Qunix))
7648         {
7649           if (c == '\n')
7650             {
7651               if (EQ (eol_type, Qdos))
7652                 *buf++ = '\r';
7653               else
7654                 c = '\r';
7655             }
7656         }
7657
7658       trans = Qnil;
7659       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7660       if (NILP (trans))
7661         *buf++ = c;
7662       else
7663         {
7664           ptrdiff_t from_nchars = 1, to_nchars = 1;
7665           int *lookup_buf_end;
7666           const unsigned char *p = src;
7667           int i;
7668
7669           lookup_buf[0] = c;
7670           for (i = 1; i < max_lookup && p < src_end; i++)
7671             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7672           lookup_buf_end = lookup_buf + i;
7673           trans = get_translation (trans, lookup_buf, lookup_buf_end,
7674                                    &from_nchars);
7675           if (INTEGERP (trans))
7676             c = XINT (trans);
7677           else if (VECTORP (trans))
7678             {
7679               to_nchars = ASIZE (trans);
7680               if (buf_end - buf < to_nchars)
7681                 break;
7682               c = XINT (AREF (trans, 0));
7683             }
7684           else
7685             break;
7686           *buf++ = c;
7687           for (i = 1; i < to_nchars; i++)
7688             *buf++ = XINT (AREF (trans, i));
7689           for (i = 1; i < from_nchars; i++, pos++)
7690             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7691         }
7692     }
7693
7694   coding->consumed = src - coding->source;
7695   coding->consumed_char = pos - coding->src_pos;
7696   coding->charbuf_used = buf - coding->charbuf;
7697   coding->chars_at_source = 0;
7698 }
7699
7700
7701 /* Encode the text at CODING->src_object into CODING->dst_object.
7702    CODING->src_object is a buffer or a string.
7703    CODING->dst_object is a buffer or nil.
7704
7705    If CODING->src_object is a buffer, it must be the current buffer.
7706    In this case, if CODING->src_pos is positive, it is a position of
7707    the source text in the buffer, otherwise. the source text is in the
7708    gap area of the buffer, and coding->src_pos specifies the offset of
7709    the text from GPT (which must be the same as PT).  If this is the
7710    same buffer as CODING->dst_object, CODING->src_pos must be
7711    negative and CODING should not have `pre-write-conversion'.
7712
7713    If CODING->src_object is a string, CODING should not have
7714    `pre-write-conversion'.
7715
7716    If CODING->dst_object is a buffer, the encoded data is inserted at
7717    the current point of that buffer.
7718
7719    If CODING->dst_object is nil, the encoded data is placed at the
7720    memory area specified by CODING->destination.  */
7721
7722 static void
7723 encode_coding (struct coding_system *coding)
7724 {
7725   Lisp_Object attrs;
7726   Lisp_Object translation_table;
7727   int max_lookup;
7728   struct ccl_spec cclspec;
7729
7730   USE_SAFE_ALLOCA;
7731
7732   attrs = CODING_ID_ATTRS (coding->id);
7733   if (coding->encoder == encode_coding_raw_text)
7734     translation_table = Qnil, max_lookup = 0;
7735   else
7736     translation_table = get_translation_table (attrs, 1, &max_lookup);
7737
7738   if (BUFFERP (coding->dst_object))
7739     {
7740       set_buffer_internal (XBUFFER (coding->dst_object));
7741       coding->dst_multibyte
7742         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7743     }
7744
7745   coding->consumed = coding->consumed_char = 0;
7746   coding->produced = coding->produced_char = 0;
7747   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7748
7749   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7750
7751   if (coding->encoder == encode_coding_ccl)
7752     {
7753       coding->spec.ccl = &cclspec;
7754       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7755     }
7756   do {
7757     coding_set_source (coding);
7758     consume_chars (coding, translation_table, max_lookup);
7759     coding_set_destination (coding);
7760     (*(coding->encoder)) (coding);
7761   } while (coding->consumed_char < coding->src_chars);
7762
7763   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7764     insert_from_gap (coding->produced_char, coding->produced, 0);
7765
7766   SAFE_FREE ();
7767 }
7768
7769
7770 /* Name (or base name) of the work buffers used for code conversion.  */
7771 static Lisp_Object Vcode_conversion_workbuf_name;
7772
7773 /* A working buffer used by the top level conversion.  It has the
7774    name Vcode_conversion_workbuf_name; once created it is kept for
7775    reuse, and re-created only if it is found dead.  The other working
7776    buffers are killed once their use is finished, and their names are
7777    modified versions of Vcode_conversion_workbuf_name.  */
7778 static Lisp_Object Vcode_conversion_reused_workbuf;
7779
7780 /* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
        make_conversion_work_buffer, cleared by code_conversion_restore.  */
7781 static bool reused_workbuf_in_use;
7782
7783
7784 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7785    multibyteness of returning buffer.  */
7786
7787 static Lisp_Object
7788 make_conversion_work_buffer (bool multibyte)
7789 {
7790   Lisp_Object name, workbuf;
7791   struct buffer *current;
7792
7793   if (reused_workbuf_in_use)
7794     {
7795       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7796       workbuf = Fget_buffer_create (name);
7797     }
7798   else
7799     {
7800       reused_workbuf_in_use = 1;
7801       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7802         Vcode_conversion_reused_workbuf
7803           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7804       workbuf = Vcode_conversion_reused_workbuf;
7805     }
7806   current = current_buffer;
7807   set_buffer_internal (XBUFFER (workbuf));
7808   /* We can't allow modification hooks to run in the work buffer.  For
7809      instance, directory_files_internal assumes that file decoding
7810      doesn't compile new regexps.  */
7811   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7812   Ferase_buffer ();
7813   bset_undo_list (current_buffer, Qt);
7814   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7815   set_buffer_internal (current);
7816   return workbuf;
7817 }
7818
7819
7820 static void
7821 code_conversion_restore (Lisp_Object arg)
7822 {
7823   Lisp_Object current, workbuf;
7824
7825   current = XCAR (arg);
7826   workbuf = XCDR (arg);
7827   if (! NILP (workbuf))
7828     {
7829       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7830         reused_workbuf_in_use = 0;
7831       else
7832         Fkill_buffer (workbuf);
7833     }
7834   set_buffer_internal (XBUFFER (current));
7835 }
7836
7837 Lisp_Object
7838 code_conversion_save (bool with_work_buf, bool multibyte)
7839 {
7840   Lisp_Object workbuf = Qnil;
7841
7842   if (with_work_buf)
7843     workbuf = make_conversion_work_buffer (multibyte);
7844   record_unwind_protect (code_conversion_restore,
7845                          Fcons (Fcurrent_buffer (), workbuf));
7846   return workbuf;
7847 }
7848
7849 void
7850 decode_coding_gap (struct coding_system *coding,
7851                    ptrdiff_t chars, ptrdiff_t bytes)
7852 {
7853   ptrdiff_t count = SPECPDL_INDEX ();
7854   Lisp_Object attrs;
7855
7856   coding->src_object = Fcurrent_buffer ();
7857   coding->src_chars = chars;
7858   coding->src_bytes = bytes;
7859   coding->src_pos = -chars;
7860   coding->src_pos_byte = -bytes;
7861   coding->src_multibyte = chars < bytes;
7862   coding->dst_object = coding->src_object;
7863   coding->dst_pos = PT;
7864   coding->dst_pos_byte = PT_BYTE;
7865   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7866
7867   coding->head_ascii = -1;
7868   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7869   coding->eol_seen = EOL_SEEN_NONE;
7870   if (CODING_REQUIRE_DETECTION (coding))
7871     detect_coding (coding);
7872   attrs = CODING_ID_ATTRS (coding->id);
7873   if (! disable_ascii_optimization
7874       && ! coding->src_multibyte
7875       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7876       && NILP (CODING_ATTR_POST_READ (attrs))
7877       && NILP (get_translation_table (attrs, 0, NULL)))
7878     {
7879       chars = coding->head_ascii;
7880       if (chars < 0)
7881         chars = check_ascii (coding);
7882       if (chars != bytes)
7883         {
7884           /* There exists a non-ASCII byte.  */
7885           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7886               && coding->detected_utf8_bytes == coding->src_bytes)
7887             {
7888               if (coding->detected_utf8_chars >= 0)
7889                 chars = coding->detected_utf8_chars;
7890               else
7891                 chars = check_utf_8 (coding);
7892               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7893                   && coding->head_ascii == 0
7894                   && coding->source[0] == UTF_8_BOM_1
7895                   && coding->source[1] == UTF_8_BOM_2
7896                   && coding->source[2] == UTF_8_BOM_3)
7897                 {
7898                   chars--;
7899                   bytes -= 3;
7900                   coding->src_bytes -= 3;
7901                 }
7902             }
7903           else
7904             chars = -1;
7905         }
7906       if (chars >= 0)
7907         {
7908           Lisp_Object eol_type;
7909
7910           eol_type = CODING_ID_EOL_TYPE (coding->id);
7911           if (VECTORP (eol_type))
7912             {
7913               if (coding->eol_seen != EOL_SEEN_NONE)
7914                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7915             }
7916           if (EQ (eol_type, Qmac))
7917             {
7918               unsigned char *src_end = GAP_END_ADDR;
7919               unsigned char *src = src_end - coding->src_bytes;
7920
7921               while (src < src_end)
7922                 {
7923                   if (*src++ == '\r')
7924                     src[-1] = '\n';
7925                 }
7926             }
7927           else if (EQ (eol_type, Qdos))
7928             {
7929               unsigned char *src = GAP_END_ADDR;
7930               unsigned char *src_beg = src - coding->src_bytes;
7931               unsigned char *dst = src;
7932               ptrdiff_t diff;
7933
7934               while (src_beg < src)
7935                 {
7936                   *--dst = *--src;
7937                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7938                     src--;
7939                 }
7940               diff = dst - src;
7941               bytes -= diff;
7942               chars -= diff;
7943             }
7944           coding->produced = bytes;
7945           coding->produced_char = chars;
7946           insert_from_gap (chars, bytes, 1);
7947           return;
7948         }
7949     }
7950   code_conversion_save (0, 0);
7951
7952   coding->mode |= CODING_MODE_LAST_BLOCK;
7953   current_buffer->text->inhibit_shrinking = 1;
7954   decode_coding (coding);
7955   current_buffer->text->inhibit_shrinking = 0;
7956
7957   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7958     {
7959       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7960       Lisp_Object val;
7961
7962       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7963       val = call1 (CODING_ATTR_POST_READ (attrs),
7964                    make_number (coding->produced_char));
7965       CHECK_NATNUM (val);
7966       coding->produced_char += Z - prev_Z;
7967       coding->produced += Z_BYTE - prev_Z_BYTE;
7968     }
7969
7970   unbind_to (count, Qnil);
7971 }
7972
7973
7974 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7975    SRC_OBJECT into DST_OBJECT by coding context CODING.
7976
7977    SRC_OBJECT is a buffer, a string, or Qnil.
7978
7979    If it is a buffer, the text is at point of the buffer.  FROM and TO
7980    are positions in the buffer.
7981
7982    If it is a string, the text is at the beginning of the string.
7983    FROM and TO are indices to the string.
7984
7985    If it is nil, the text is at coding->source.  FROM and TO are
7986    indices to coding->source.
7987
7988    DST_OBJECT is a buffer, Qt, or Qnil.
7989
7990    If it is a buffer, the decoded text is inserted at point of the
7991    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7992    is deleted.
7993
7994    If it is Qt, a string is made from the decoded text, and
7995    set in CODING->dst_object.
7996
7997    If it is Qnil, the decoded text is stored at CODING->destination.
7998    The caller must allocate CODING->dst_bytes bytes at
7999    CODING->destination by xmalloc.  If the decoded text is longer than
8000    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8001  */
8002
void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  /* DESTINATION and DST_BYTES are assigned, and later read, only when
     DST_OBJECT is nil.  */
  unsigned char *destination;
  ptrdiff_t dst_bytes;
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the source and the destination are
     the same buffer; SAVED_PT_BYTE is meaningful only in that case.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte;
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  /* Remember Vdeactivate_mark so it can be put back before returning.  */
  old_deactivate_mark = Vdeactivate_mark;

  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* The source is treated as multibyte when it has fewer characters
     than bytes.  */
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          struct Lisp_Marker *tail;

          /* Decoding in place.  Flag every marker that sits exactly on
             the boundary it is tied to (FROM for markers whose
             insertion_type is set, TO for the others) so that it can
             be repositioned once the size of the new text is known.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          /* inhibit_shrinking is cleared again in the SAVED_PT >= 0
             block at the end of this function.  */
          current_buffer->text->inhibit_shrinking = 1;
          del_range_both (from, from_byte, to, to_byte, 1);
          /* NOTE(review): the negative positions presumably tell the
             decoder that the source is the text just deleted; confirm
             against the code that resolves coding->src_pos.  */
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  /* Fetched only after detection, because detect_coding may have
     changed coding->id.  */
  attrs = CODING_ID_ATTRS (coding->id);

  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      /* A string result, or a post-read function with no destination
         buffer: decode into the object handed back by
         code_conversion_save (1, ...), starting at BEG.  */
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      /* Decode straight into DST_OBJECT at its point; the multibyteness
         of the output follows that buffer.  */
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      /* Call the post-read function with point at the start of the
         decoded text and the number of decoded characters as its
         argument.  Its value must be a non-negative integer, but the
         produced counts are corrected from the change in Z and Z_BYTE
         rather than from that value.  */
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_number (coding->produced_char));
      CHECK_NATNUM (val);
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  if (EQ (dst_object, Qt))
    {
      /* The caller asked for a string: take it from the current buffer.  */
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a buffer although the caller wants it
         at CODING->destination.  Grow that memory and copy the bytes
         over, first moving the gap out of the span being copied.
         NOTE(review): nothing is copied when DST_BYTES is already at
         least coding->produced; confirm callers do not read
         CODING->destination in that case.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (dst_bytes < coding->produced)
        {
          eassert (coding->produced > 0);
          destination = xrealloc (destination, coding->produced);
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the region is unchanged; point inside the region
         goes to FROM; point after the region is shifted by the growth
         of the text (counted in bytes for a unibyte buffer).  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged earlier back on their boundary:
             FROM for insertion_type markers, the end of the decoded
             text for the others.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
8181
8182
8183 void
8184 encode_coding_object (struct coding_system *coding,
8185                       Lisp_Object src_object,
8186                       ptrdiff_t from, ptrdiff_t from_byte,
8187                       ptrdiff_t to, ptrdiff_t to_byte,
8188                       Lisp_Object dst_object)
8189 {
8190   ptrdiff_t count = SPECPDL_INDEX ();
8191   ptrdiff_t chars = to - from;
8192   ptrdiff_t bytes = to_byte - from_byte;
8193   Lisp_Object attrs;
8194   ptrdiff_t saved_pt = -1, saved_pt_byte;
8195   bool need_marker_adjustment = 0;
8196   bool kill_src_buffer = 0;
8197   Lisp_Object old_deactivate_mark;
8198
8199   old_deactivate_mark = Vdeactivate_mark;
8200
8201   coding->src_object = src_object;
8202   coding->src_chars = chars;
8203   coding->src_bytes = bytes;
8204   coding->src_multibyte = chars < bytes;
8205
8206   attrs = CODING_ID_ATTRS (coding->id);
8207
8208   if (EQ (src_object, dst_object))
8209     {
8210       struct Lisp_Marker *tail;
8211
8212       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8213         {
8214           tail->need_adjustment
8215             = tail->charpos == (tail->insertion_type ? from : to);
8216           need_marker_adjustment |= tail->need_adjustment;
8217         }
8218     }
8219
8220   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8221     {
8222       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8223       set_buffer_internal (XBUFFER (coding->src_object));
8224       if (STRINGP (src_object))
8225         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8226       else if (BUFFERP (src_object))
8227         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8228       else
8229         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8230
8231       if (EQ (src_object, dst_object))
8232         {
8233           set_buffer_internal (XBUFFER (src_object));
8234           saved_pt = PT, saved_pt_byte = PT_BYTE;
8235           del_range_both (from, from_byte, to, to_byte, 1);
8236           set_buffer_internal (XBUFFER (coding->src_object));
8237         }
8238
8239       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8240                   make_number (BEG), make_number (Z));
8241       if (XBUFFER (coding->src_object) != current_buffer)
8242         kill_src_buffer = 1;
8243       coding->src_object = Fcurrent_buffer ();
8244       if (BEG != GPT)
8245         move_gap_both (BEG, BEG_BYTE);
8246       coding->src_chars = Z - BEG;
8247       coding->src_bytes = Z_BYTE - BEG_BYTE;
8248       coding->src_pos = BEG;
8249       coding->src_pos_byte = BEG_BYTE;
8250       coding->src_multibyte = Z < Z_BYTE;
8251     }
8252   else if (STRINGP (src_object))
8253     {
8254       code_conversion_save (0, 0);
8255       coding->src_pos = from;
8256       coding->src_pos_byte = from_byte;
8257     }
8258   else if (BUFFERP (src_object))
8259     {
8260       code_conversion_save (0, 0);
8261       set_buffer_internal (XBUFFER (src_object));
8262       if (EQ (src_object, dst_object))
8263         {
8264           saved_pt = PT, saved_pt_byte = PT_BYTE;
8265           coding->src_object = del_range_1 (from, to, 1, 1);
8266           coding->src_pos = 0;
8267           coding->src_pos_byte = 0;
8268         }
8269       else
8270         {
8271           if (from < GPT && to >= GPT)
8272             move_gap_both (from, from_byte);
8273           coding->src_pos = from;
8274           coding->src_pos_byte = from_byte;
8275         }
8276     }
8277   else
8278     {
8279       code_conversion_save (0, 0);
8280       coding->src_pos = from;
8281       coding->src_pos_byte = from_byte;
8282     }
8283
8284   if (BUFFERP (dst_object))
8285     {
8286       coding->dst_object = dst_object;
8287       if (EQ (src_object, dst_object))
8288         {
8289           coding->dst_pos = from;
8290           coding->dst_pos_byte = from_byte;
8291         }
8292       else
8293         {
8294           struct buffer *current = current_buffer;
8295
8296           set_buffer_temp (XBUFFER (dst_object));
8297           coding->dst_pos = PT;
8298           coding->dst_pos_byte = PT_BYTE;
8299           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8300           set_buffer_temp (current);
8301         }
8302       coding->dst_multibyte
8303         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8304     }
8305   else if (EQ (dst_object, Qt))
8306     {
8307       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8308       coding->dst_object = Qnil;
8309       coding->destination = xmalloc (dst_bytes);
8310       coding->dst_bytes = dst_bytes;
8311       coding->dst_multibyte = 0;
8312     }
8313   else
8314     {
8315       coding->dst_object = Qnil;
8316       coding->dst_multibyte = 0;
8317     }
8318
8319   encode_coding (coding);
8320
8321   if (EQ (dst_object, Qt))
8322     {
8323       if (BUFFERP (coding->dst_object))
8324         coding->dst_object = Fbuffer_string ();
8325       else if (coding->raw_destination)
8326         /* This is used to avoid creating huge Lisp string.
8327            NOTE: caller who sets `raw_destination' is also
8328            responsible for freeing `destination' buffer.  */
8329         coding->dst_object = Qnil;
8330       else
8331         {
8332           coding->dst_object
8333             = make_unibyte_string ((char *) coding->destination,
8334                                    coding->produced);
8335           xfree (coding->destination);
8336         }
8337     }
8338
8339   if (saved_pt >= 0)
8340     {
8341       /* This is the case of:
8342          (BUFFERP (src_object) && EQ (src_object, dst_object))
8343          As we have moved PT while replacing the original buffer
8344          contents, we must recover it now.  */
8345       set_buffer_internal (XBUFFER (src_object));
8346       if (saved_pt < from)
8347         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8348       else if (saved_pt < from + chars)
8349         TEMP_SET_PT_BOTH (from, from_byte);
8350       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8351         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8352                           saved_pt_byte + (coding->produced - bytes));
8353       else
8354         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8355                           saved_pt_byte + (coding->produced - bytes));
8356
8357       if (need_marker_adjustment)
8358         {
8359           struct Lisp_Marker *tail;
8360
8361           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8362             if (tail->need_adjustment)
8363               {
8364                 tail->need_adjustment = 0;
8365                 if (tail->insertion_type)
8366                   {
8367                     tail->bytepos = from_byte;
8368                     tail->charpos = from;
8369                   }
8370                 else
8371                   {
8372                     tail->bytepos = from_byte + coding->produced;
8373                     tail->charpos
8374                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8375                          ? tail->bytepos : from + coding->produced_char);
8376                   }
8377               }
8378         }
8379     }
8380
8381   if (kill_src_buffer)
8382     Fkill_buffer (coding->src_object);
8383
8384   Vdeactivate_mark = old_deactivate_mark;
8385   unbind_to (count, Qnil);
8386 }
8387
8388
8389 Lisp_Object
8390 preferred_coding_system (void)
8391 {
8392   int id = coding_categories[coding_priorities[0]].id;
8393
8394   return CODING_ID_NAME (id);
8395 }
8396
8397 #if defined (WINDOWSNT) || defined (CYGWIN)
8398
8399 Lisp_Object
8400 from_unicode (Lisp_Object str)
8401 {
8402   CHECK_STRING (str);
8403   if (!STRING_MULTIBYTE (str) &&
8404       SBYTES (str) & 1)
8405     {
8406       str = Fsubstring (str, make_number (0), make_number (-1));
8407     }
8408
8409   return code_convert_string_norecord (str, Qutf_16le, 0);
8410 }
8411
8412 Lisp_Object
8413 from_unicode_buffer (const wchar_t *wstr)
8414 {
8415   /* We get one of the two final null bytes for free.  */
8416   ptrdiff_t len = 1 + sizeof (wchar_t) * wcslen (wstr);
8417   AUTO_STRING_WITH_LEN (str, (char *) wstr, len);
8418   return from_unicode (str);
8419 }
8420
8421 wchar_t *
8422 to_unicode (Lisp_Object str, Lisp_Object *buf)
8423 {
8424   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8425   /* We need to make another copy (in addition to the one made by
8426      code_convert_string_norecord) to ensure that the final string is
8427      _doubly_ zero terminated --- that is, that the string is
8428      terminated by two zero bytes and one utf-16le null character.
8429      Because strings are already terminated with a single zero byte,
8430      we just add one additional zero. */
8431   str = make_uninit_string (SBYTES (*buf) + 1);
8432   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8433   SDATA (str) [SBYTES (*buf)] = '\0';
8434   *buf = str;
8435   return WCSDATA (*buf);
8436 }
8437
8438 #endif /* WINDOWSNT || CYGWIN */
8439
8440 \f
8441 #ifdef emacs
8442 /*** 11. Emacs Lisp library functions ***/
8443
8444 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8445        doc: /* Return t if OBJECT is nil or a coding-system.
8446 See the documentation of `define-coding-system' for information
8447 about coding-system objects.  */)
8448   (Lisp_Object object)
8449 {
8450   if (NILP (object)
8451       || CODING_SYSTEM_ID (object) >= 0)
8452     return Qt;
8453   if (! SYMBOLP (object)
8454       || NILP (Fget (object, Qcoding_system_define_form)))
8455     return Qnil;
8456   return Qt;
8457 }
8458
8459 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8460        Sread_non_nil_coding_system, 1, 1, 0,
8461        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8462   (Lisp_Object prompt)
8463 {
8464   Lisp_Object val;
8465   do
8466     {
8467       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8468                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8469     }
8470   while (SCHARS (val) == 0);
8471   return (Fintern (val, Qnil));
8472 }
8473
8474 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8475        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8476 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8477 Ignores case when completing coding systems (all Emacs coding systems
8478 are lower-case).  */)
8479   (Lisp_Object prompt, Lisp_Object default_coding_system)
8480 {
8481   Lisp_Object val;
8482   ptrdiff_t count = SPECPDL_INDEX ();
8483
8484   if (SYMBOLP (default_coding_system))
8485     default_coding_system = SYMBOL_NAME (default_coding_system);
8486   specbind (Qcompletion_ignore_case, Qt);
8487   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8488                           Qt, Qnil, Qcoding_system_history,
8489                           default_coding_system, Qnil);
8490   unbind_to (count, Qnil);
8491   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8492 }
8493
8494 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8495        1, 1, 0,
8496        doc: /* Check validity of CODING-SYSTEM.
8497 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8498 It is valid if it is nil or a symbol defined as a coding system by the
8499 function `define-coding-system'.  */)
8500   (Lisp_Object coding_system)
8501 {
8502   Lisp_Object define_form;
8503
8504   define_form = Fget (coding_system, Qcoding_system_define_form);
8505   if (! NILP (define_form))
8506     {
8507       Fput (coding_system, Qcoding_system_define_form, Qnil);
8508       safe_eval (define_form);
8509     }
8510   if (!NILP (Fcoding_system_p (coding_system)))
8511     return coding_system;
8512   xsignal1 (Qcoding_system_error, coding_system);
8513 }
8514
8515 \f
8516 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8517    HIGHEST, return the coding system of the highest
8518    priority among the detected coding systems.  Otherwise return a
8519    list of detected coding systems sorted by their priorities.  If
8520    MULTIBYTEP, it is assumed that the bytes are in correct
8521    multibyte form but contains only ASCII and eight-bit chars.
8522    Otherwise, the bytes are raw bytes.
8523
8524    CODING-SYSTEM controls the detection as below:
8525
8526    If it is nil, detect both text-format and eol-format.  If the
8527    text-format part of CODING-SYSTEM is already specified
8528    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8529    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8530    detect only text-format.  */
8531
Lisp_Object
detect_coding_system (const unsigned char *src,
                      ptrdiff_t src_chars, ptrdiff_t src_bytes,
                      bool highest, bool multibytep,
                      Lisp_Object coding_system)
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* VAL first collects coding system ids; the eol pass at the end
     replaces each id by a coding system name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  ptrdiff_t id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  bool null_byte_found = 0, eight_bit_found = 0;

  /* No CODING-SYSTEM given: start from `undecided'.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      enum coding_category category IF_LINT (= 0);
      struct coding_system *this IF_LINT (= NULL);
      int c, i;
      bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
                                       inhibit_null_byte_detection);
      bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
                                       inhibit_iso_escape_detection);
      bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;

      /* Skip all ASCII bytes except for a few ISO2022 controls.
         coding.head_ascii counts the bytes seen before the first
         8-bit byte.  The scan stops early once both a null byte and
         an 8-bit byte have been seen.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is run at most once, on the
                 first ESC, SI or SO, while nothing has been checked
                 yet.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_ied
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conforms to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_nbd)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category other than the UTF-16 ones as
                     both checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              else if (prefer_utf_8
                       && detect_coding_utf_8 (&coding, &detect_info))
                {
                  /* UTF-8 is preferred and was detected: mark every
                     other category as checked and rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
                }
              /* Walk the categories in priority order and run the
                 detector of each one that is defined and not yet
                 checked.  When HIGHEST is set, stop at the first
                 category found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* Narrow utf-16-auto to the endianness reported
                         in the found mask.  Only CATEGORY changes;
                         THIS still points at the utf-16-auto entry.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Every category was rejected, or a null byte was seen:
             report `no-conversion'.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = list1 (make_number (id));
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing was rejected and nothing was found: report the
             coding system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = list1 (make_number (id));
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* CATEGORY and THIS were left by the loops above at the
                 category on which they stopped.  */
              detect_info.found = 1 << category;
              val = list1 (make_number (this->id));
            }
          else
            /* Nothing was positively found: take the first category,
               in priority order, that was not rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = list1 (make_number (id));
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Categories that were neither rejected nor found.  Each
             hit with a defined id overwrites VAL, so only the last
             one visited (the one with the smallest index I) stays in
             the list, although all of them are recorded in FOUND.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = list1 (make_number (id));
                }
            }
          /* Cons the found categories onto the front.  Iterating from
             the last priority entry to the first leaves the first
             entry at the head of the list.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only the presence of a signature has to be decided.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = list1 (make_number (this->id));
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Pick the UTF-16 variant from the found mask.  If neither LE
         nor BE was found, fall back to a no-signature variant
         depending on whether the LE one was rejected.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = list1 (make_number (this->id));
        }
    }
  else
    {
      /* The text format is already fixed by CODING-SYSTEM; there is
         nothing to detect.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = list1 (make_number (coding.id));
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* -1 means "not determined" for the corresponding group.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol format is not fixed by the requested coding system.
           Detect it separately for non-UTF-16 text and for each
           UTF-16 endianness, but only for the groups that were
           found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol format is fixed by the requested coding system; use
           it for every group.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding system name.  When the
       candidate's eol type is a vector, take element 0, 1 or 2 for
       LF, CRLF or CR; otherwise use the name belonging to the id.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  /* With HIGHEST, return only the first candidate, or nil if there
     is none; otherwise return the whole list.  */
  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8847
8848
8849 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8850        2, 3, 0,
8851        doc: /* Detect coding system of the text in the region between START and END.
8852 Return a list of possible coding systems ordered by priority.
8853 The coding systems to try and their priorities follows what
8854 the function `coding-system-priority-list' (which see) returns.
8855
8856 If only ASCII characters are found (except for such ISO-2022 control
8857 characters as ESC), it returns a list of single element `undecided'
8858 or its subsidiary coding system according to a detected end-of-line
8859 format.
8860
8861 If optional argument HIGHEST is non-nil, return the coding system of
8862 highest priority.  */)
8863   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8864 {
8865   ptrdiff_t from, to;
8866   ptrdiff_t from_byte, to_byte;
8867
8868   validate_region (&start, &end);
8869   from = XINT (start), to = XINT (end);
8870   from_byte = CHAR_TO_BYTE (from);
8871   to_byte = CHAR_TO_BYTE (to);
8872
8873   if (from < GPT && to >= GPT)
8874     move_gap_both (to, to_byte);
8875
8876   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8877                                to - from, to_byte - from_byte,
8878                                !NILP (highest),
8879                                !NILP (BVAR (current_buffer
8880                                       , enable_multibyte_characters)),
8881                                Qnil);
8882 }
8883
8884 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8885        1, 2, 0,
8886        doc: /* Detect coding system of the text in STRING.
8887 Return a list of possible coding systems ordered by priority.
8888 The coding systems to try and their priorities follows what
8889 the function `coding-system-priority-list' (which see) returns.
8890
8891 If only ASCII characters are found (except for such ISO-2022 control
8892 characters as ESC), it returns a list of single element `undecided'
8893 or its subsidiary coding system according to a detected end-of-line
8894 format.
8895
8896 If optional argument HIGHEST is non-nil, return the coding system of
8897 highest priority.  */)
8898   (Lisp_Object string, Lisp_Object highest)
8899 {
8900   CHECK_STRING (string);
8901
8902   return detect_coding_system (SDATA (string),
8903                                SCHARS (string), SBYTES (string),
8904                                !NILP (highest), STRING_MULTIBYTE (string),
8905                                Qnil);
8906 }
8907
8908
8909 static bool
8910 char_encodable_p (int c, Lisp_Object attrs)
8911 {
8912   Lisp_Object tail;
8913   struct charset *charset;
8914   Lisp_Object translation_table;
8915
8916   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8917   if (! NILP (translation_table))
8918     c = translate_char (translation_table, c);
8919   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8920        CONSP (tail); tail = XCDR (tail))
8921     {
8922       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8923       if (CHAR_CHARSET_P (c, charset))
8924         break;
8925     }
8926   return (! NILP (tail));
8927 }
8928
8929
8930 /* Return a list of coding systems that safely encode the text between
8931    START and END.  If EXCLUDE is non-nil, it is a list of coding
8932    systems not to check.  The returned list doesn't contain any such
8933    coding systems.  In any case, if the text contains only ASCII or is
8934    unibyte, return t.  */
8935
8936 DEFUN ("find-coding-systems-region-internal",
8937        Ffind_coding_systems_region_internal,
8938        Sfind_coding_systems_region_internal, 2, 3, 0,
8939        doc: /* Internal use only.  */)
8940   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8941 {
8942   Lisp_Object coding_attrs_list, safe_codings;
8943   ptrdiff_t start_byte, end_byte;
8944   const unsigned char *p, *pbeg, *pend;
8945   int c;
8946   Lisp_Object tail, elt, work_table;
8947
8948   if (STRINGP (start))
8949     {
8950       if (!STRING_MULTIBYTE (start)
8951           || SCHARS (start) == SBYTES (start))
8952         return Qt;
8953       start_byte = 0;
8954       end_byte = SBYTES (start);
8955     }
8956   else
8957     {
8958       CHECK_NUMBER_COERCE_MARKER (start);
8959       CHECK_NUMBER_COERCE_MARKER (end);
8960       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8961         args_out_of_range (start, end);
8962       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8963         return Qt;
8964       start_byte = CHAR_TO_BYTE (XINT (start));
8965       end_byte = CHAR_TO_BYTE (XINT (end));
8966       if (XINT (end) - XINT (start) == end_byte - start_byte)
8967         return Qt;
8968
8969       if (XINT (start) < GPT && XINT (end) > GPT)
8970         {
8971           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8972             move_gap_both (XINT (start), start_byte);
8973           else
8974             move_gap_both (XINT (end), end_byte);
8975         }
8976     }
8977
8978   coding_attrs_list = Qnil;
8979   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8980     if (NILP (exclude)
8981         || NILP (Fmemq (XCAR (tail), exclude)))
8982       {
8983         Lisp_Object attrs;
8984
8985         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8986         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8987           {
8988             ASET (attrs, coding_attr_trans_tbl,
8989                   get_translation_table (attrs, 1, NULL));
8990             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8991           }
8992       }
8993
8994   if (STRINGP (start))
8995     p = pbeg = SDATA (start);
8996   else
8997     p = pbeg = BYTE_POS_ADDR (start_byte);
8998   pend = p + (end_byte - start_byte);
8999
9000   while (p < pend && ASCII_CHAR_P (*p)) p++;
9001   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9002
9003   work_table = Fmake_char_table (Qnil, Qnil);
9004   while (p < pend)
9005     {
9006       if (ASCII_CHAR_P (*p))
9007         p++;
9008       else
9009         {
9010           c = STRING_CHAR_ADVANCE (p);
9011           if (!NILP (char_table_ref (work_table, c)))
9012             /* This character was already checked.  Ignore it.  */
9013             continue;
9014
9015           charset_map_loaded = 0;
9016           for (tail = coding_attrs_list; CONSP (tail);)
9017             {
9018               elt = XCAR (tail);
9019               if (NILP (elt))
9020                 tail = XCDR (tail);
9021               else if (char_encodable_p (c, elt))
9022                 tail = XCDR (tail);
9023               else if (CONSP (XCDR (tail)))
9024                 {
9025                   XSETCAR (tail, XCAR (XCDR (tail)));
9026                   XSETCDR (tail, XCDR (XCDR (tail)));
9027                 }
9028               else
9029                 {
9030                   XSETCAR (tail, Qnil);
9031                   tail = XCDR (tail);
9032                 }
9033             }
9034           if (charset_map_loaded)
9035             {
9036               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9037
9038               if (STRINGP (start))
9039                 pbeg = SDATA (start);
9040               else
9041                 pbeg = BYTE_POS_ADDR (start_byte);
9042               p = pbeg + p_offset;
9043               pend = pbeg + pend_offset;
9044             }
9045           char_table_set (work_table, c, Qt);
9046         }
9047     }
9048
9049   safe_codings = list2 (Qraw_text, Qno_conversion);
9050   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9051     if (! NILP (XCAR (tail)))
9052       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9053
9054   return safe_codings;
9055 }
9056
9057
9058 DEFUN ("unencodable-char-position", Funencodable_char_position,
9059        Sunencodable_char_position, 3, 5, 0,
9060        doc: /* Return position of first un-encodable character in a region.
9061 START and END specify the region and CODING-SYSTEM specifies the
9062 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9063
9064 If optional 4th argument COUNT is non-nil, it specifies at most how
9065 many un-encodable characters to search.  In this case, the value is a
9066 list of positions.
9067
9068 If optional 5th argument STRING is non-nil, it is a string to search
9069 for un-encodable characters.  In that case, START and END are indexes
9070 to the string and treated as in `substring'.  */)
9071   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9072    Lisp_Object count, Lisp_Object string)
9073 {
9074   EMACS_INT n;
9075   struct coding_system coding;
9076   Lisp_Object attrs, charset_list, translation_table;
9077   Lisp_Object positions;
9078   ptrdiff_t from, to;
9079   const unsigned char *p, *stop, *pend;
9080   bool ascii_compatible;
9081
9082   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9083   attrs = CODING_ID_ATTRS (coding.id);
9084   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9085     return Qnil;
9086   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9087   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9088   translation_table = get_translation_table (attrs, 1, NULL);
9089
9090   if (NILP (string))
9091     {
9092       validate_region (&start, &end);
9093       from = XINT (start);
9094       to = XINT (end);
9095       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9096           || (ascii_compatible
9097               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9098         return Qnil;
9099       p = CHAR_POS_ADDR (from);
9100       pend = CHAR_POS_ADDR (to);
9101       if (from < GPT && to >= GPT)
9102         stop = GPT_ADDR;
9103       else
9104         stop = pend;
9105     }
9106   else
9107     {
9108       CHECK_STRING (string);
9109       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9110       if (! STRING_MULTIBYTE (string))
9111         return Qnil;
9112       p = SDATA (string) + string_char_to_byte (string, from);
9113       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9114       if (ascii_compatible && (to - from) == (pend - p))
9115         return Qnil;
9116     }
9117
9118   if (NILP (count))
9119     n = 1;
9120   else
9121     {
9122       CHECK_NATNUM (count);
9123       n = XINT (count);
9124     }
9125
9126   positions = Qnil;
9127   charset_map_loaded = 0;
9128   while (1)
9129     {
9130       int c;
9131
9132       if (ascii_compatible)
9133         while (p < stop && ASCII_CHAR_P (*p))
9134           p++, from++;
9135       if (p >= stop)
9136         {
9137           if (p >= pend)
9138             break;
9139           stop = pend;
9140           p = GAP_END_ADDR;
9141         }
9142
9143       c = STRING_CHAR_ADVANCE (p);
9144       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9145           && ! char_charset (translate_char (translation_table, c),
9146                              charset_list, NULL))
9147         {
9148           positions = Fcons (make_number (from), positions);
9149           n--;
9150           if (n == 0)
9151             break;
9152         }
9153
9154       from++;
9155       if (charset_map_loaded && NILP (string))
9156         {
9157           p = CHAR_POS_ADDR (from);
9158           pend = CHAR_POS_ADDR (to);
9159           if (from < GPT && to >= GPT)
9160             stop = GPT_ADDR;
9161           else
9162             stop = pend;
9163           charset_map_loaded = 0;
9164         }
9165     }
9166
9167   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9168 }
9169
9170
9171 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9172        Scheck_coding_systems_region, 3, 3, 0,
9173        doc: /* Check if the region is encodable by coding systems.
9174
9175 START and END are buffer positions specifying the region.
9176 CODING-SYSTEM-LIST is a list of coding systems to check.
9177
9178 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9179 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9180 whole region, POS0, POS1, ... are buffer positions where non-encodable
9181 characters are found.
9182
9183 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9184 value is nil.
9185
9186 START may be a string.  In that case, check if the string is
9187 encodable, and the value contains indices to the string instead of
9188 buffer positions.  END is ignored.
9189
9190 If the current buffer (or START if it is a string) is unibyte, the value
9191 is nil.  */)
9192   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9193 {
9194   Lisp_Object list;
9195   ptrdiff_t start_byte, end_byte;
9196   ptrdiff_t pos;
9197   const unsigned char *p, *pbeg, *pend;
9198   int c;
9199   Lisp_Object tail, elt, attrs;
9200
9201   if (STRINGP (start))
9202     {
9203       if (!STRING_MULTIBYTE (start)
9204           || SCHARS (start) == SBYTES (start))
9205         return Qnil;
9206       start_byte = 0;
9207       end_byte = SBYTES (start);
9208       pos = 0;
9209     }
9210   else
9211     {
9212       CHECK_NUMBER_COERCE_MARKER (start);
9213       CHECK_NUMBER_COERCE_MARKER (end);
9214       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9215         args_out_of_range (start, end);
9216       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9217         return Qnil;
9218       start_byte = CHAR_TO_BYTE (XINT (start));
9219       end_byte = CHAR_TO_BYTE (XINT (end));
9220       if (XINT (end) - XINT (start) == end_byte - start_byte)
9221         return Qnil;
9222
9223       if (XINT (start) < GPT && XINT (end) > GPT)
9224         {
9225           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9226             move_gap_both (XINT (start), start_byte);
9227           else
9228             move_gap_both (XINT (end), end_byte);
9229         }
9230       pos = XINT (start);
9231     }
9232
9233   list = Qnil;
9234   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9235     {
9236       elt = XCAR (tail);
9237       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9238       ASET (attrs, coding_attr_trans_tbl,
9239             get_translation_table (attrs, 1, NULL));
9240       list = Fcons (list2 (elt, attrs), list);
9241     }
9242
9243   if (STRINGP (start))
9244     p = pbeg = SDATA (start);
9245   else
9246     p = pbeg = BYTE_POS_ADDR (start_byte);
9247   pend = p + (end_byte - start_byte);
9248
9249   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9250   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9251
9252   while (p < pend)
9253     {
9254       if (ASCII_CHAR_P (*p))
9255         p++;
9256       else
9257         {
9258           c = STRING_CHAR_ADVANCE (p);
9259
9260           charset_map_loaded = 0;
9261           for (tail = list; CONSP (tail); tail = XCDR (tail))
9262             {
9263               elt = XCDR (XCAR (tail));
9264               if (! char_encodable_p (c, XCAR (elt)))
9265                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9266             }
9267           if (charset_map_loaded)
9268             {
9269               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9270
9271               if (STRINGP (start))
9272                 pbeg = SDATA (start);
9273               else
9274                 pbeg = BYTE_POS_ADDR (start_byte);
9275               p = pbeg + p_offset;
9276               pend = pbeg + pend_offset;
9277             }
9278         }
9279       pos++;
9280     }
9281
9282   tail = list;
9283   list = Qnil;
9284   for (; CONSP (tail); tail = XCDR (tail))
9285     {
9286       elt = XCAR (tail);
9287       if (CONSP (XCDR (XCDR (elt))))
9288         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9289                       list);
9290     }
9291
9292   return list;
9293 }
9294
9295
9296 static Lisp_Object
9297 code_convert_region (Lisp_Object start, Lisp_Object end,
9298                      Lisp_Object coding_system, Lisp_Object dst_object,
9299                      bool encodep, bool norecord)
9300 {
9301   struct coding_system coding;
9302   ptrdiff_t from, from_byte, to, to_byte;
9303   Lisp_Object src_object;
9304
9305   if (NILP (coding_system))
9306     coding_system = Qno_conversion;
9307   else
9308     CHECK_CODING_SYSTEM (coding_system);
9309   src_object = Fcurrent_buffer ();
9310   if (NILP (dst_object))
9311     dst_object = src_object;
9312   else if (! EQ (dst_object, Qt))
9313     CHECK_BUFFER (dst_object);
9314
9315   validate_region (&start, &end);
9316   from = XFASTINT (start);
9317   from_byte = CHAR_TO_BYTE (from);
9318   to = XFASTINT (end);
9319   to_byte = CHAR_TO_BYTE (to);
9320
9321   setup_coding_system (coding_system, &coding);
9322   coding.mode |= CODING_MODE_LAST_BLOCK;
9323
9324   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9325     {
9326       struct buffer *buf = XBUFFER (dst_object);
9327       ptrdiff_t buf_pt = BUF_PT (buf);
9328
9329       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9330     }
9331
9332   if (encodep)
9333     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9334                           dst_object);
9335   else
9336     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9337                           dst_object);
9338   if (! norecord)
9339     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9340
9341   return (BUFFERP (dst_object)
9342           ? make_number (coding.produced_char)
9343           : coding.dst_object);
9344 }
9345
9346
9347 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9348        3, 4, "r\nzCoding system: ",
9349        doc: /* Decode the current region from the specified coding system.
9350 When called from a program, takes four arguments:
9351         START, END, CODING-SYSTEM, and DESTINATION.
9352 START and END are buffer positions.
9353
9354 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9355 If nil, the region between START and END is replaced by the decoded text.
9356 If buffer, the decoded text is inserted in that buffer after point (point
9357 does not move).
9358 In those cases, the length of the decoded text is returned.
9359 If DESTINATION is t, the decoded text is returned.
9360
9361 This function sets `last-coding-system-used' to the precise coding system
9362 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9363 not fully specified.)  */)
9364   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9365 {
9366   return code_convert_region (start, end, coding_system, destination, 0, 0);
9367 }
9368
9369 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9370        3, 4, "r\nzCoding system: ",
9371        doc: /* Encode the current region by specified coding system.
9372 When called from a program, takes four arguments:
9373         START, END, CODING-SYSTEM and DESTINATION.
9374 START and END are buffer positions.
9375
9376 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9377 If nil, the region between START and END is replace by the encoded text.
9378 If buffer, the encoded text is inserted in that buffer after point (point
9379 does not move).
9380 In those cases, the length of the encoded text is returned.
9381 If DESTINATION is t, the encoded text is returned.
9382
9383 This function sets `last-coding-system-used' to the precise coding system
9384 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9385 not fully specified.)  */)
9386   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9387 {
9388   return code_convert_region (start, end, coding_system, destination, 1, 0);
9389 }
9390
9391 Lisp_Object
9392 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9393                      Lisp_Object dst_object, bool encodep, bool nocopy,
9394                      bool norecord)
9395 {
9396   struct coding_system coding;
9397   ptrdiff_t chars, bytes;
9398
9399   CHECK_STRING (string);
9400   if (NILP (coding_system))
9401     {
9402       if (! norecord)
9403         Vlast_coding_system_used = Qno_conversion;
9404       if (NILP (dst_object))
9405         return (nocopy ? Fcopy_sequence (string) : string);
9406     }
9407
9408   if (NILP (coding_system))
9409     coding_system = Qno_conversion;
9410   else
9411     CHECK_CODING_SYSTEM (coding_system);
9412   if (NILP (dst_object))
9413     dst_object = Qt;
9414   else if (! EQ (dst_object, Qt))
9415     CHECK_BUFFER (dst_object);
9416
9417   setup_coding_system (coding_system, &coding);
9418   coding.mode |= CODING_MODE_LAST_BLOCK;
9419   chars = SCHARS (string);
9420   bytes = SBYTES (string);
9421
9422   if (BUFFERP (dst_object))
9423     {
9424       struct buffer *buf = XBUFFER (dst_object);
9425       ptrdiff_t buf_pt = BUF_PT (buf);
9426
9427       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9428     }
9429
9430   if (encodep)
9431     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9432   else
9433     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9434   if (! norecord)
9435     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9436
9437   return (BUFFERP (dst_object)
9438           ? make_number (coding.produced_char)
9439           : coding.dst_object);
9440 }
9441
9442
9443 /* Encode or decode STRING according to CODING_SYSTEM.
9444    Do not set Vlast_coding_system_used.
9445
9446    This function is called only from macros DECODE_FILE and
9447    ENCODE_FILE, thus we ignore character composition.  */
9448
9449 Lisp_Object
9450 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9451                               bool encodep)
9452 {
9453   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9454 }
9455
9456 /* Encode or decode a file name, to or from a unibyte string suitable
9457    for passing to C library functions.  */
9458 Lisp_Object
9459 decode_file_name (Lisp_Object fname)
9460 {
9461 #ifdef WINDOWSNT
9462   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9463      converts the file names either to UTF-16LE or to the system ANSI
9464      codepage internally, depending on the underlying OS; see w32.c.  */
9465   if (! NILP (Fcoding_system_p (Qutf_8)))
9466     return code_convert_string_norecord (fname, Qutf_8, 0);
9467   return fname;
9468 #else  /* !WINDOWSNT */
9469   if (! NILP (Vfile_name_coding_system))
9470     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9471   else if (! NILP (Vdefault_file_name_coding_system))
9472     return code_convert_string_norecord (fname,
9473                                          Vdefault_file_name_coding_system, 0);
9474   else
9475     return fname;
9476 #endif
9477 }
9478
9479 Lisp_Object
9480 encode_file_name (Lisp_Object fname)
9481 {
9482   /* This is especially important during bootstrap and dumping, when
9483      file-name encoding is not yet known, and therefore any non-ASCII
9484      file names are unibyte strings, and could only be thrashed if we
9485      try to encode them.  */
9486   if (!STRING_MULTIBYTE (fname))
9487     return fname;
9488 #ifdef WINDOWSNT
9489   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9490      converts the file names either to UTF-16LE or to the system ANSI
9491      codepage internally, depending on the underlying OS; see w32.c.  */
9492   if (! NILP (Fcoding_system_p (Qutf_8)))
9493     return code_convert_string_norecord (fname, Qutf_8, 1);
9494   return fname;
9495 #else  /* !WINDOWSNT */
9496   if (! NILP (Vfile_name_coding_system))
9497     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9498   else if (! NILP (Vdefault_file_name_coding_system))
9499     return code_convert_string_norecord (fname,
9500                                          Vdefault_file_name_coding_system, 1);
9501   else
9502     return fname;
9503 #endif
9504 }
9505
9506 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9507        2, 4, 0,
9508        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9509
9510 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9511 if the decoding operation is trivial.
9512
9513 Optional fourth arg BUFFER non-nil means that the decoded text is
9514 inserted in that buffer after point (point does not move).  In this
9515 case, the return value is the length of the decoded text.
9516
9517 This function sets `last-coding-system-used' to the precise coding system
9518 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9519 not fully specified.)  */)
9520   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9521 {
9522   return code_convert_string (string, coding_system, buffer,
9523                               0, ! NILP (nocopy), 0);
9524 }
9525
9526 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9527        2, 4, 0,
9528        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9529
9530 Optional third arg NOCOPY non-nil means it is OK to return STRING
9531 itself if the encoding operation is trivial.
9532
9533 Optional fourth arg BUFFER non-nil means that the encoded text is
9534 inserted in that buffer after point (point does not move).  In this
9535 case, the return value is the length of the encoded text.
9536
9537 This function sets `last-coding-system-used' to the precise coding system
9538 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9539 not fully specified.)  */)
9540   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9541 {
9542   return code_convert_string (string, coding_system, buffer,
9543                               1, ! NILP (nocopy), 0);
9544 }
9545
9546 \f
9547 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9548        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9549 Return the corresponding character.  */)
9550   (Lisp_Object code)
9551 {
9552   Lisp_Object spec, attrs, val;
9553   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9554   EMACS_INT ch;
9555   int c;
9556
9557   CHECK_NATNUM (code);
9558   ch = XFASTINT (code);
9559   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9560   attrs = AREF (spec, 0);
9561
9562   if (ASCII_CHAR_P (ch)
9563       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9564     return code;
9565
9566   val = CODING_ATTR_CHARSET_LIST (attrs);
9567   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9568   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9569   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9570
9571   if (ch <= 0x7F)
9572     {
9573       c = ch;
9574       charset = charset_roman;
9575     }
9576   else if (ch >= 0xA0 && ch < 0xDF)
9577     {
9578       c = ch - 0x80;
9579       charset = charset_kana;
9580     }
9581   else
9582     {
9583       EMACS_INT c1 = ch >> 8;
9584       int c2 = ch & 0xFF;
9585
9586       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9587           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9588         error ("Invalid code: %"pI"d", ch);
9589       c = ch;
9590       SJIS_TO_JIS (c);
9591       charset = charset_kanji;
9592     }
9593   c = DECODE_CHAR (charset, c);
9594   if (c < 0)
9595     error ("Invalid code: %"pI"d", ch);
9596   return make_number (c);
9597 }
9598
9599
9600 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9601        doc: /* Encode a Japanese character CH to shift_jis encoding.
9602 Return the corresponding code in SJIS.  */)
9603   (Lisp_Object ch)
9604 {
9605   Lisp_Object spec, attrs, charset_list;
9606   int c;
9607   struct charset *charset;
9608   unsigned code;
9609
9610   CHECK_CHARACTER (ch);
9611   c = XFASTINT (ch);
9612   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9613   attrs = AREF (spec, 0);
9614
9615   if (ASCII_CHAR_P (c)
9616       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9617     return ch;
9618
9619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9620   charset = char_charset (c, charset_list, &code);
9621   if (code == CHARSET_INVALID_CODE (charset))
9622     error ("Can't encode by shift_jis encoding: %c", c);
9623   JIS_TO_SJIS (code);
9624
9625   return make_number (code);
9626 }
9627
9628 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9629        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9630 Return the corresponding character.  */)
9631   (Lisp_Object code)
9632 {
9633   Lisp_Object spec, attrs, val;
9634   struct charset *charset_roman, *charset_big5, *charset;
9635   EMACS_INT ch;
9636   int c;
9637
9638   CHECK_NATNUM (code);
9639   ch = XFASTINT (code);
9640   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9641   attrs = AREF (spec, 0);
9642
9643   if (ASCII_CHAR_P (ch)
9644       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9645     return code;
9646
9647   val = CODING_ATTR_CHARSET_LIST (attrs);
9648   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9649   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9650
9651   if (ch <= 0x7F)
9652     {
9653       c = ch;
9654       charset = charset_roman;
9655     }
9656   else
9657     {
9658       EMACS_INT b1 = ch >> 8;
9659       int b2 = ch & 0x7F;
9660       if (b1 < 0xA1 || b1 > 0xFE
9661           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9662         error ("Invalid code: %"pI"d", ch);
9663       c = ch;
9664       charset = charset_big5;
9665     }
9666   c = DECODE_CHAR (charset, c);
9667   if (c < 0)
9668     error ("Invalid code: %"pI"d", ch);
9669   return make_number (c);
9670 }
9671
9672 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9673        doc: /* Encode the Big5 character CH to BIG5 coding system.
9674 Return the corresponding character code in Big5.  */)
9675   (Lisp_Object ch)
9676 {
9677   Lisp_Object spec, attrs, charset_list;
9678   struct charset *charset;
9679   int c;
9680   unsigned code;
9681
9682   CHECK_CHARACTER (ch);
9683   c = XFASTINT (ch);
9684   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9685   attrs = AREF (spec, 0);
9686   if (ASCII_CHAR_P (c)
9687       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9688     return ch;
9689
9690   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9691   charset = char_charset (c, charset_list, &code);
9692   if (code == CHARSET_INVALID_CODE (charset))
9693     error ("Can't encode by Big5 encoding: %c", c);
9694
9695   return make_number (code);
9696 }
9697
9698 \f
9699 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9700        Sset_terminal_coding_system_internal, 1, 2, 0,
9701        doc: /* Internal use only.  */)
9702   (Lisp_Object coding_system, Lisp_Object terminal)
9703 {
9704   struct terminal *term = decode_live_terminal (terminal);
9705   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9706   CHECK_SYMBOL (coding_system);
9707   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9708   /* We had better not send unsafe characters to terminal.  */
9709   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9710   /* Character composition should be disabled.  */
9711   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9712   terminal_coding->src_multibyte = 1;
9713   terminal_coding->dst_multibyte = 0;
9714   tset_charset_list
9715     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9716             ? coding_charset_list (terminal_coding)
9717             : list1 (make_number (charset_ascii))));
9718   return Qnil;
9719 }
9720
9721 DEFUN ("set-safe-terminal-coding-system-internal",
9722        Fset_safe_terminal_coding_system_internal,
9723        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9724        doc: /* Internal use only.  */)
9725   (Lisp_Object coding_system)
9726 {
9727   CHECK_SYMBOL (coding_system);
9728   setup_coding_system (Fcheck_coding_system (coding_system),
9729                        &safe_terminal_coding);
9730   /* Character composition should be disabled.  */
9731   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9732   safe_terminal_coding.src_multibyte = 1;
9733   safe_terminal_coding.dst_multibyte = 0;
9734   return Qnil;
9735 }
9736
9737 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9738        Sterminal_coding_system, 0, 1, 0,
9739        doc: /* Return coding system specified for terminal output on the given terminal.
9740 TERMINAL may be a terminal object, a frame, or nil for the selected
9741 frame's terminal device.  */)
9742   (Lisp_Object terminal)
9743 {
9744   struct coding_system *terminal_coding
9745     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9746   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9747
9748   /* For backward compatibility, return nil if it is `undecided'.  */
9749   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9750 }
9751
9752 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9753        Sset_keyboard_coding_system_internal, 1, 2, 0,
9754        doc: /* Internal use only.  */)
9755   (Lisp_Object coding_system, Lisp_Object terminal)
9756 {
9757   struct terminal *t = decode_live_terminal (terminal);
9758   CHECK_SYMBOL (coding_system);
9759   if (NILP (coding_system))
9760     coding_system = Qno_conversion;
9761   else
9762     Fcheck_coding_system (coding_system);
9763   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9764   /* Character composition should be disabled.  */
9765   TERMINAL_KEYBOARD_CODING (t)->common_flags
9766     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9767   return Qnil;
9768 }
9769
9770 DEFUN ("keyboard-coding-system",
9771        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9772        doc: /* Return coding system specified for decoding keyboard input.  */)
9773   (Lisp_Object terminal)
9774 {
9775   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9776                          (decode_live_terminal (terminal))->id);
9777 }
9778
9779 \f
9780 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9781        Sfind_operation_coding_system,  1, MANY, 0,
9782        doc: /* Choose a coding system for an operation based on the target name.
9783 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9784 DECODING-SYSTEM is the coding system to use for decoding
9785 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9786 for encoding (in case OPERATION does encoding).
9787
9788 The first argument OPERATION specifies an I/O primitive:
9789   For file I/O, `insert-file-contents' or `write-region'.
9790   For process I/O, `call-process', `call-process-region', or `start-process'.
9791   For network I/O, `open-network-stream'.
9792
9793 The remaining arguments should be the same arguments that were passed
9794 to the primitive.  Depending on which primitive, one of those arguments
9795 is selected as the TARGET.  For example, if OPERATION does file I/O,
9796 whichever argument specifies the file name is TARGET.
9797
9798 TARGET has a meaning which depends on OPERATION:
9799   For file I/O, TARGET is a file name (except for the special case below).
9800   For process I/O, TARGET is a process name.
9801   For network I/O, TARGET is a service name or a port number.
9802
9803 This function looks up what is specified for TARGET in
9804 `file-coding-system-alist', `process-coding-system-alist',
9805 or `network-coding-system-alist' depending on OPERATION.
9806 They may specify a coding system, a cons of coding systems,
9807 or a function symbol to call.
9808 In the last case, we call the function with one argument,
9809 which is a list of all the arguments given to this function.
9810 If the function can't decide a coding system, it can return
9811 `undecided' so that the normal code-detection is performed.
9812
9813 If OPERATION is `insert-file-contents', the argument corresponding to
9814 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9815 file name to look up, and BUFFER is a buffer that contains the file's
9816 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9817 function to call for FILENAME, that function should examine the
9818 contents of BUFFER instead of reading the file.
9819
9820 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9821   (ptrdiff_t nargs, Lisp_Object *args)
9822 {
9823   Lisp_Object operation, target_idx, target, val;
9824   register Lisp_Object chain;
9825
9826   if (nargs < 2)
9827     error ("Too few arguments");
9828   operation = args[0];
9829   if (!SYMBOLP (operation)
9830       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9831     error ("Invalid first argument");
9832   if (nargs <= 1 + XFASTINT (target_idx))
9833     error ("Too few arguments for operation `%s'",
9834            SDATA (SYMBOL_NAME (operation)));
9835   target = args[XFASTINT (target_idx) + 1];
9836   if (!(STRINGP (target)
9837         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9838             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9839         || (EQ (operation, Qopen_network_stream)
9840             && (INTEGERP (target) || EQ (target, Qt)))))
9841     error ("Invalid argument %"pI"d of operation `%s'",
9842            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9843   if (CONSP (target))
9844     target = XCAR (target);
9845
9846   chain = ((EQ (operation, Qinsert_file_contents)
9847             || EQ (operation, Qwrite_region))
9848            ? Vfile_coding_system_alist
9849            : (EQ (operation, Qopen_network_stream)
9850               ? Vnetwork_coding_system_alist
9851               : Vprocess_coding_system_alist));
9852   if (NILP (chain))
9853     return Qnil;
9854
9855   for (; CONSP (chain); chain = XCDR (chain))
9856     {
9857       Lisp_Object elt;
9858
9859       elt = XCAR (chain);
9860       if (CONSP (elt)
9861           && ((STRINGP (target)
9862                && STRINGP (XCAR (elt))
9863                && fast_string_match (XCAR (elt), target) >= 0)
9864               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9865         {
9866           val = XCDR (elt);
9867           /* Here, if VAL is both a valid coding system and a valid
9868              function symbol, we return VAL as a coding system.  */
9869           if (CONSP (val))
9870             return val;
9871           if (! SYMBOLP (val))
9872             return Qnil;
9873           if (! NILP (Fcoding_system_p (val)))
9874             return Fcons (val, val);
9875           if (! NILP (Ffboundp (val)))
9876             {
9877               /* We use call1 rather than safe_call1
9878                  so as to get bug reports about functions called here
9879                  which don't handle the current interface.  */
9880               val = call1 (val, Flist (nargs, args));
9881               if (CONSP (val))
9882                 return val;
9883               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9884                 return Fcons (val, val);
9885             }
9886           return Qnil;
9887         }
9888     }
9889   return Qnil;
9890 }
9891
9892 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9893        Sset_coding_system_priority, 0, MANY, 0,
9894        doc: /* Assign higher priority to the coding systems given as arguments.
9895 If multiple coding systems belong to the same category,
9896 all but the first one are ignored.
9897
9898 usage: (set-coding-system-priority &rest coding-systems)  */)
9899   (ptrdiff_t nargs, Lisp_Object *args)
9900 {
9901   ptrdiff_t i, j;
9902   bool changed[coding_category_max];
9903   enum coding_category priorities[coding_category_max];
9904
9905   memset (changed, 0, sizeof changed);
9906
9907   for (i = j = 0; i < nargs; i++)
9908     {
9909       enum coding_category category;
9910       Lisp_Object spec, attrs;
9911
9912       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9913       attrs = AREF (spec, 0);
9914       category = XINT (CODING_ATTR_CATEGORY (attrs));
9915       if (changed[category])
9916         /* Ignore this coding system because a coding system of the
9917            same category already had a higher priority.  */
9918         continue;
9919       changed[category] = 1;
9920       priorities[j++] = category;
9921       if (coding_categories[category].id >= 0
9922           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9923         setup_coding_system (args[i], &coding_categories[category]);
9924       Fset (AREF (Vcoding_category_table, category), args[i]);
9925     }
9926
9927   /* Now we have decided top J priorities.  Reflect the order of the
9928      original priorities to the remaining priorities.  */
9929
9930   for (i = j, j = 0; i < coding_category_max; i++, j++)
9931     {
9932       while (j < coding_category_max
9933              && changed[coding_priorities[j]])
9934         j++;
9935       if (j == coding_category_max)
9936         emacs_abort ();
9937       priorities[i] = coding_priorities[j];
9938     }
9939
9940   memcpy (coding_priorities, priorities, sizeof priorities);
9941
9942   /* Update `coding-category-list'.  */
9943   Vcoding_category_list = Qnil;
9944   for (i = coding_category_max; i-- > 0; )
9945     Vcoding_category_list
9946       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9947                Vcoding_category_list);
9948
9949   return Qnil;
9950 }
9951
9952 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9953        Scoding_system_priority_list, 0, 1, 0,
9954        doc: /* Return a list of coding systems ordered by their priorities.
9955 The list contains a subset of coding systems; i.e. coding systems
9956 assigned to each coding category (see `coding-category-list').
9957
9958 HIGHESTP non-nil means just return the highest priority one.  */)
9959   (Lisp_Object highestp)
9960 {
9961   int i;
9962   Lisp_Object val;
9963
9964   for (i = 0, val = Qnil; i < coding_category_max; i++)
9965     {
9966       enum coding_category category = coding_priorities[i];
9967       int id = coding_categories[category].id;
9968       Lisp_Object attrs;
9969
9970       if (id < 0)
9971         continue;
9972       attrs = CODING_ID_ATTRS (id);
9973       if (! NILP (highestp))
9974         return CODING_ATTR_BASE_NAME (attrs);
9975       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9976     }
9977   return Fnreverse (val);
9978 }
9979
9980 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9981
9982 static Lisp_Object
9983 make_subsidiaries (Lisp_Object base)
9984 {
9985   Lisp_Object subsidiaries;
9986   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9987   USE_SAFE_ALLOCA;
9988   char *buf = SAFE_ALLOCA (base_name_len + 6);
9989   int i;
9990
9991   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9992   subsidiaries = make_uninit_vector (3);
9993   for (i = 0; i < 3; i++)
9994     {
9995       strcpy (buf + base_name_len, suffixes[i]);
9996       ASET (subsidiaries, i, intern (buf));
9997     }
9998   SAFE_FREE ();
9999   return subsidiaries;
10000 }
10001
10002
10003 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10004        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10005        doc: /* For internal use only.
10006 usage: (define-coding-system-internal ...)  */)
10007   (ptrdiff_t nargs, Lisp_Object *args)
10008 {
10009   Lisp_Object name;
10010   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10011   Lisp_Object attrs;            /* Vector of attributes.  */
10012   Lisp_Object eol_type;
10013   Lisp_Object aliases;
10014   Lisp_Object coding_type, charset_list, safe_charsets;
10015   enum coding_category category;
10016   Lisp_Object tail, val;
10017   int max_charset_id = 0;
10018   int i;
10019
10020   if (nargs < coding_arg_max)
10021     goto short_args;
10022
10023   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10024
10025   name = args[coding_arg_name];
10026   CHECK_SYMBOL (name);
10027   ASET (attrs, coding_attr_base_name, name);
10028
10029   val = args[coding_arg_mnemonic];
10030   if (! STRINGP (val))
10031     CHECK_CHARACTER (val);
10032   ASET (attrs, coding_attr_mnemonic, val);
10033
10034   coding_type = args[coding_arg_coding_type];
10035   CHECK_SYMBOL (coding_type);
10036   ASET (attrs, coding_attr_type, coding_type);
10037
10038   charset_list = args[coding_arg_charset_list];
10039   if (SYMBOLP (charset_list))
10040     {
10041       if (EQ (charset_list, Qiso_2022))
10042         {
10043           if (! EQ (coding_type, Qiso_2022))
10044             error ("Invalid charset-list");
10045           charset_list = Viso_2022_charset_list;
10046         }
10047       else if (EQ (charset_list, Qemacs_mule))
10048         {
10049           if (! EQ (coding_type, Qemacs_mule))
10050             error ("Invalid charset-list");
10051           charset_list = Vemacs_mule_charset_list;
10052         }
10053       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10054         {
10055           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10056             error ("Invalid charset-list");
10057           if (max_charset_id < XFASTINT (XCAR (tail)))
10058             max_charset_id = XFASTINT (XCAR (tail));
10059         }
10060     }
10061   else
10062     {
10063       charset_list = Fcopy_sequence (charset_list);
10064       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10065         {
10066           struct charset *charset;
10067
10068           val = XCAR (tail);
10069           CHECK_CHARSET_GET_CHARSET (val, charset);
10070           if (EQ (coding_type, Qiso_2022)
10071               ? CHARSET_ISO_FINAL (charset) < 0
10072               : EQ (coding_type, Qemacs_mule)
10073               ? CHARSET_EMACS_MULE_ID (charset) < 0
10074               : 0)
10075             error ("Can't handle charset `%s'",
10076                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10077
10078           XSETCAR (tail, make_number (charset->id));
10079           if (max_charset_id < charset->id)
10080             max_charset_id = charset->id;
10081         }
10082     }
10083   ASET (attrs, coding_attr_charset_list, charset_list);
10084
10085   safe_charsets = make_uninit_string (max_charset_id + 1);
10086   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10087   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10088     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10089   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10090
10091   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10092
10093   val = args[coding_arg_decode_translation_table];
10094   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10095     CHECK_SYMBOL (val);
10096   ASET (attrs, coding_attr_decode_tbl, val);
10097
10098   val = args[coding_arg_encode_translation_table];
10099   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10100     CHECK_SYMBOL (val);
10101   ASET (attrs, coding_attr_encode_tbl, val);
10102
10103   val = args[coding_arg_post_read_conversion];
10104   CHECK_SYMBOL (val);
10105   ASET (attrs, coding_attr_post_read, val);
10106
10107   val = args[coding_arg_pre_write_conversion];
10108   CHECK_SYMBOL (val);
10109   ASET (attrs, coding_attr_pre_write, val);
10110
10111   val = args[coding_arg_default_char];
10112   if (NILP (val))
10113     ASET (attrs, coding_attr_default_char, make_number (' '));
10114   else
10115     {
10116       CHECK_CHARACTER (val);
10117       ASET (attrs, coding_attr_default_char, val);
10118     }
10119
10120   val = args[coding_arg_for_unibyte];
10121   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10122
10123   val = args[coding_arg_plist];
10124   CHECK_LIST (val);
10125   ASET (attrs, coding_attr_plist, val);
10126
10127   if (EQ (coding_type, Qcharset))
10128     {
10129       /* Generate a lisp vector of 256 elements.  Each element is nil,
10130          integer, or a list of charset IDs.
10131
10132          If Nth element is nil, the byte code N is invalid in this
10133          coding system.
10134
10135          If Nth element is a number NUM, N is the first byte of a
10136          charset whose ID is NUM.
10137
10138          If Nth element is a list of charset IDs, N is the first byte
10139          of one of them.  The list is sorted by dimensions of the
10140          charsets.  A charset of smaller dimension comes first. */
10141       val = Fmake_vector (make_number (256), Qnil);
10142
10143       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10144         {
10145           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10146           int dim = CHARSET_DIMENSION (charset);
10147           int idx = (dim - 1) * 4;
10148
10149           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10150             ASET (attrs, coding_attr_ascii_compat, Qt);
10151
10152           for (i = charset->code_space[idx];
10153                i <= charset->code_space[idx + 1]; i++)
10154             {
10155               Lisp_Object tmp, tmp2;
10156               int dim2;
10157
10158               tmp = AREF (val, i);
10159               if (NILP (tmp))
10160                 tmp = XCAR (tail);
10161               else if (NUMBERP (tmp))
10162                 {
10163                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10164                   if (dim < dim2)
10165                     tmp = list2 (XCAR (tail), tmp);
10166                   else
10167                     tmp = list2 (tmp, XCAR (tail));
10168                 }
10169               else
10170                 {
10171                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10172                     {
10173                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10174                       if (dim < dim2)
10175                         break;
10176                     }
10177                   if (NILP (tmp2))
10178                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10179                   else
10180                     {
10181                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10182                       XSETCAR (tmp2, XCAR (tail));
10183                     }
10184                 }
10185               ASET (val, i, tmp);
10186             }
10187         }
10188       ASET (attrs, coding_attr_charset_valids, val);
10189       category = coding_category_charset;
10190     }
10191   else if (EQ (coding_type, Qccl))
10192     {
10193       Lisp_Object valids;
10194
10195       if (nargs < coding_arg_ccl_max)
10196         goto short_args;
10197
10198       val = args[coding_arg_ccl_decoder];
10199       CHECK_CCL_PROGRAM (val);
10200       if (VECTORP (val))
10201         val = Fcopy_sequence (val);
10202       ASET (attrs, coding_attr_ccl_decoder, val);
10203
10204       val = args[coding_arg_ccl_encoder];
10205       CHECK_CCL_PROGRAM (val);
10206       if (VECTORP (val))
10207         val = Fcopy_sequence (val);
10208       ASET (attrs, coding_attr_ccl_encoder, val);
10209
10210       val = args[coding_arg_ccl_valids];
10211       valids = Fmake_string (make_number (256), make_number (0));
10212       for (tail = val; CONSP (tail); tail = XCDR (tail))
10213         {
10214           int from, to;
10215
10216           val = XCAR (tail);
10217           if (INTEGERP (val))
10218             {
10219               if (! (0 <= XINT (val) && XINT (val) <= 255))
10220                 args_out_of_range_3 (val, make_number (0), make_number (255));
10221               from = to = XINT (val);
10222             }
10223           else
10224             {
10225               CHECK_CONS (val);
10226               CHECK_NATNUM_CAR (val);
10227               CHECK_NUMBER_CDR (val);
10228               if (XINT (XCAR (val)) > 255)
10229                 args_out_of_range_3 (XCAR (val),
10230                                      make_number (0), make_number (255));
10231               from = XINT (XCAR (val));
10232               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10233                 args_out_of_range_3 (XCDR (val),
10234                                      XCAR (val), make_number (255));
10235               to = XINT (XCDR (val));
10236             }
10237           for (i = from; i <= to; i++)
10238             SSET (valids, i, 1);
10239         }
10240       ASET (attrs, coding_attr_ccl_valids, valids);
10241
10242       category = coding_category_ccl;
10243     }
10244   else if (EQ (coding_type, Qutf_16))
10245     {
10246       Lisp_Object bom, endian;
10247
10248       ASET (attrs, coding_attr_ascii_compat, Qnil);
10249
10250       if (nargs < coding_arg_utf16_max)
10251         goto short_args;
10252
10253       bom = args[coding_arg_utf16_bom];
10254       if (! NILP (bom) && ! EQ (bom, Qt))
10255         {
10256           CHECK_CONS (bom);
10257           val = XCAR (bom);
10258           CHECK_CODING_SYSTEM (val);
10259           val = XCDR (bom);
10260           CHECK_CODING_SYSTEM (val);
10261         }
10262       ASET (attrs, coding_attr_utf_bom, bom);
10263
10264       endian = args[coding_arg_utf16_endian];
10265       CHECK_SYMBOL (endian);
10266       if (NILP (endian))
10267         endian = Qbig;
10268       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10269         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10270       ASET (attrs, coding_attr_utf_16_endian, endian);
10271
10272       category = (CONSP (bom)
10273                   ? coding_category_utf_16_auto
10274                   : NILP (bom)
10275                   ? (EQ (endian, Qbig)
10276                      ? coding_category_utf_16_be_nosig
10277                      : coding_category_utf_16_le_nosig)
10278                   : (EQ (endian, Qbig)
10279                      ? coding_category_utf_16_be
10280                      : coding_category_utf_16_le));
10281     }
10282   else if (EQ (coding_type, Qiso_2022))
10283     {
10284       Lisp_Object initial, reg_usage, request, flags;
10285
10286       if (nargs < coding_arg_iso2022_max)
10287         goto short_args;
10288
10289       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10290       CHECK_VECTOR (initial);
10291       for (i = 0; i < 4; i++)
10292         {
10293           val = AREF (initial, i);
10294           if (! NILP (val))
10295             {
10296               struct charset *charset;
10297
10298               CHECK_CHARSET_GET_CHARSET (val, charset);
10299               ASET (initial, i, make_number (CHARSET_ID (charset)));
10300               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10301                 ASET (attrs, coding_attr_ascii_compat, Qt);
10302             }
10303           else
10304             ASET (initial, i, make_number (-1));
10305         }
10306
10307       reg_usage = args[coding_arg_iso2022_reg_usage];
10308       CHECK_CONS (reg_usage);
10309       CHECK_NUMBER_CAR (reg_usage);
10310       CHECK_NUMBER_CDR (reg_usage);
10311
10312       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10313       for (tail = request; CONSP (tail); tail = XCDR (tail))
10314         {
10315           int id;
10316           Lisp_Object tmp1;
10317
10318           val = XCAR (tail);
10319           CHECK_CONS (val);
10320           tmp1 = XCAR (val);
10321           CHECK_CHARSET_GET_ID (tmp1, id);
10322           CHECK_NATNUM_CDR (val);
10323           if (XINT (XCDR (val)) >= 4)
10324             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10325           XSETCAR (val, make_number (id));
10326         }
10327
10328       flags = args[coding_arg_iso2022_flags];
10329       CHECK_NATNUM (flags);
10330       i = XINT (flags) & INT_MAX;
10331       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10332         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10333       flags = make_number (i);
10334
10335       ASET (attrs, coding_attr_iso_initial, initial);
10336       ASET (attrs, coding_attr_iso_usage, reg_usage);
10337       ASET (attrs, coding_attr_iso_request, request);
10338       ASET (attrs, coding_attr_iso_flags, flags);
10339       setup_iso_safe_charsets (attrs);
10340
10341       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10342         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10343                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10344                     ? coding_category_iso_7_else
10345                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10346                     ? coding_category_iso_7
10347                     : coding_category_iso_7_tight);
10348       else
10349         {
10350           int id = XINT (AREF (initial, 1));
10351
10352           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10353                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10354                        || id < 0)
10355                       ? coding_category_iso_8_else
10356                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10357                       ? coding_category_iso_8_1
10358                       : coding_category_iso_8_2);
10359         }
10360       if (category != coding_category_iso_8_1
10361           && category != coding_category_iso_8_2)
10362         ASET (attrs, coding_attr_ascii_compat, Qnil);
10363     }
10364   else if (EQ (coding_type, Qemacs_mule))
10365     {
10366       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10367         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10368       ASET (attrs, coding_attr_ascii_compat, Qt);
10369       category = coding_category_emacs_mule;
10370     }
10371   else if (EQ (coding_type, Qshift_jis))
10372     {
10373
10374       struct charset *charset;
10375
10376       if (XINT (Flength (charset_list)) != 3
10377           && XINT (Flength (charset_list)) != 4)
10378         error ("There should be three or four charsets");
10379
10380       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10381       if (CHARSET_DIMENSION (charset) != 1)
10382         error ("Dimension of charset %s is not one",
10383                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10384       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10385         ASET (attrs, coding_attr_ascii_compat, Qt);
10386
10387       charset_list = XCDR (charset_list);
10388       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10389       if (CHARSET_DIMENSION (charset) != 1)
10390         error ("Dimension of charset %s is not one",
10391                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10392
10393       charset_list = XCDR (charset_list);
10394       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10395       if (CHARSET_DIMENSION (charset) != 2)
10396         error ("Dimension of charset %s is not two",
10397                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10398
10399       charset_list = XCDR (charset_list);
10400       if (! NILP (charset_list))
10401         {
10402           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10403           if (CHARSET_DIMENSION (charset) != 2)
10404             error ("Dimension of charset %s is not two",
10405                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10406         }
10407
10408       category = coding_category_sjis;
10409       Vsjis_coding_system = name;
10410     }
10411   else if (EQ (coding_type, Qbig5))
10412     {
10413       struct charset *charset;
10414
10415       if (XINT (Flength (charset_list)) != 2)
10416         error ("There should be just two charsets");
10417
10418       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10419       if (CHARSET_DIMENSION (charset) != 1)
10420         error ("Dimension of charset %s is not one",
10421                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10422       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10423         ASET (attrs, coding_attr_ascii_compat, Qt);
10424
10425       charset_list = XCDR (charset_list);
10426       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10427       if (CHARSET_DIMENSION (charset) != 2)
10428         error ("Dimension of charset %s is not two",
10429                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10430
10431       category = coding_category_big5;
10432       Vbig5_coding_system = name;
10433     }
10434   else if (EQ (coding_type, Qraw_text))
10435     {
10436       category = coding_category_raw_text;
10437       ASET (attrs, coding_attr_ascii_compat, Qt);
10438     }
10439   else if (EQ (coding_type, Qutf_8))
10440     {
10441       Lisp_Object bom;
10442
10443       if (nargs < coding_arg_utf8_max)
10444         goto short_args;
10445
10446       bom = args[coding_arg_utf8_bom];
10447       if (! NILP (bom) && ! EQ (bom, Qt))
10448         {
10449           CHECK_CONS (bom);
10450           val = XCAR (bom);
10451           CHECK_CODING_SYSTEM (val);
10452           val = XCDR (bom);
10453           CHECK_CODING_SYSTEM (val);
10454         }
10455       ASET (attrs, coding_attr_utf_bom, bom);
10456       if (NILP (bom))
10457         ASET (attrs, coding_attr_ascii_compat, Qt);
10458
10459       category = (CONSP (bom) ? coding_category_utf_8_auto
10460                   : NILP (bom) ? coding_category_utf_8_nosig
10461                   : coding_category_utf_8_sig);
10462     }
10463   else if (EQ (coding_type, Qundecided))
10464     {
10465       if (nargs < coding_arg_undecided_max)
10466         goto short_args;
10467       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10468             args[coding_arg_undecided_inhibit_null_byte_detection]);
10469       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10470             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10471       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10472             args[coding_arg_undecided_prefer_utf_8]);
10473       category = coding_category_undecided;
10474     }
10475   else
10476     error ("Invalid coding system type: %s",
10477            SDATA (SYMBOL_NAME (coding_type)));
10478
10479   ASET (attrs, coding_attr_category, make_number (category));
10480   ASET (attrs, coding_attr_plist,
10481         Fcons (QCcategory,
10482                Fcons (AREF (Vcoding_category_table, category),
10483                       CODING_ATTR_PLIST (attrs))));
10484   ASET (attrs, coding_attr_plist,
10485         Fcons (QCascii_compatible_p,
10486                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10487                       CODING_ATTR_PLIST (attrs))));
10488
10489   eol_type = args[coding_arg_eol_type];
10490   if (! NILP (eol_type)
10491       && ! EQ (eol_type, Qunix)
10492       && ! EQ (eol_type, Qdos)
10493       && ! EQ (eol_type, Qmac))
10494     error ("Invalid eol-type");
10495
10496   aliases = list1 (name);
10497
10498   if (NILP (eol_type))
10499     {
10500       eol_type = make_subsidiaries (name);
10501       for (i = 0; i < 3; i++)
10502         {
10503           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10504
10505           this_name = AREF (eol_type, i);
10506           this_aliases = list1 (this_name);
10507           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10508           this_spec = make_uninit_vector (3);
10509           ASET (this_spec, 0, attrs);
10510           ASET (this_spec, 1, this_aliases);
10511           ASET (this_spec, 2, this_eol_type);
10512           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10513           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10514           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10515           if (NILP (val))
10516             Vcoding_system_alist
10517               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10518                        Vcoding_system_alist);
10519         }
10520     }
10521
10522   spec_vec = make_uninit_vector (3);
10523   ASET (spec_vec, 0, attrs);
10524   ASET (spec_vec, 1, aliases);
10525   ASET (spec_vec, 2, eol_type);
10526
10527   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10528   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10529   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10530   if (NILP (val))
10531     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10532                                   Vcoding_system_alist);
10533
10534   {
10535     int id = coding_categories[category].id;
10536
10537     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10538       setup_coding_system (name, &coding_categories[category]);
10539   }
10540
10541   return Qnil;
10542
10543  short_args:
10544   return Fsignal (Qwrong_number_of_arguments,
10545                   Fcons (intern ("define-coding-system-internal"),
10546                          make_number (nargs)));
10547 }
10548
10549
10550 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10551        3, 3, 0,
10552        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10553   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10554 {
10555   Lisp_Object spec, attrs;
10556
10557   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10558   attrs = AREF (spec, 0);
10559   if (EQ (prop, QCmnemonic))
10560     {
10561       if (! STRINGP (val))
10562         CHECK_CHARACTER (val);
10563       ASET (attrs, coding_attr_mnemonic, val);
10564     }
10565   else if (EQ (prop, QCdefault_char))
10566     {
10567       if (NILP (val))
10568         val = make_number (' ');
10569       else
10570         CHECK_CHARACTER (val);
10571       ASET (attrs, coding_attr_default_char, val);
10572     }
10573   else if (EQ (prop, QCdecode_translation_table))
10574     {
10575       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10576         CHECK_SYMBOL (val);
10577       ASET (attrs, coding_attr_decode_tbl, val);
10578     }
10579   else if (EQ (prop, QCencode_translation_table))
10580     {
10581       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10582         CHECK_SYMBOL (val);
10583       ASET (attrs, coding_attr_encode_tbl, val);
10584     }
10585   else if (EQ (prop, QCpost_read_conversion))
10586     {
10587       CHECK_SYMBOL (val);
10588       ASET (attrs, coding_attr_post_read, val);
10589     }
10590   else if (EQ (prop, QCpre_write_conversion))
10591     {
10592       CHECK_SYMBOL (val);
10593       ASET (attrs, coding_attr_pre_write, val);
10594     }
10595   else if (EQ (prop, QCascii_compatible_p))
10596     {
10597       ASET (attrs, coding_attr_ascii_compat, val);
10598     }
10599
10600   ASET (attrs, coding_attr_plist,
10601         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10602   return val;
10603 }
10604
10605
10606 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10607        Sdefine_coding_system_alias, 2, 2, 0,
10608        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10609   (Lisp_Object alias, Lisp_Object coding_system)
10610 {
10611   Lisp_Object spec, aliases, eol_type, val;
10612
10613   CHECK_SYMBOL (alias);
10614   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10615   aliases = AREF (spec, 1);
10616   /* ALIASES should be a list of length more than zero, and the first
10617      element is a base coding system.  Append ALIAS at the tail of the
10618      list.  */
10619   while (!NILP (XCDR (aliases)))
10620     aliases = XCDR (aliases);
10621   XSETCDR (aliases, list1 (alias));
10622
10623   eol_type = AREF (spec, 2);
10624   if (VECTORP (eol_type))
10625     {
10626       Lisp_Object subsidiaries;
10627       int i;
10628
10629       subsidiaries = make_subsidiaries (alias);
10630       for (i = 0; i < 3; i++)
10631         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10632                                      AREF (eol_type, i));
10633     }
10634
10635   Fputhash (alias, spec, Vcoding_system_hash_table);
10636   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10637   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10638   if (NILP (val))
10639     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10640                                   Vcoding_system_alist);
10641
10642   return Qnil;
10643 }
10644
10645 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10646        1, 1, 0,
10647        doc: /* Return the base of CODING-SYSTEM.
10648 Any alias or subsidiary coding system is not a base coding system.  */)
10649   (Lisp_Object coding_system)
10650 {
10651   Lisp_Object spec, attrs;
10652
10653   if (NILP (coding_system))
10654     return (Qno_conversion);
10655   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10656   attrs = AREF (spec, 0);
10657   return CODING_ATTR_BASE_NAME (attrs);
10658 }
10659
10660 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10661        1, 1, 0,
10662        doc: /* Return the property list of CODING-SYSTEM.  */)
10663   (Lisp_Object coding_system)
10664 {
10665   Lisp_Object spec, attrs;
10666
10667   if (NILP (coding_system))
10668     coding_system = Qno_conversion;
10669   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10670   attrs = AREF (spec, 0);
10671   return CODING_ATTR_PLIST (attrs);
10672 }
10673
10674
10675 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10676        1, 1, 0,
10677        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10678   (Lisp_Object coding_system)
10679 {
10680   Lisp_Object spec;
10681
10682   if (NILP (coding_system))
10683     coding_system = Qno_conversion;
10684   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10685   return AREF (spec, 1);
10686 }
10687
10688 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10689        Scoding_system_eol_type, 1, 1, 0,
10690        doc: /* Return eol-type of CODING-SYSTEM.
10691 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10692
10693 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10694 and CR respectively.
10695
10696 A vector value indicates that a format of end-of-line should be
10697 detected automatically.  Nth element of the vector is the subsidiary
10698 coding system whose eol-type is N.  */)
10699   (Lisp_Object coding_system)
10700 {
10701   Lisp_Object spec, eol_type;
10702   int n;
10703
10704   if (NILP (coding_system))
10705     coding_system = Qno_conversion;
10706   if (! CODING_SYSTEM_P (coding_system))
10707     return Qnil;
10708   spec = CODING_SYSTEM_SPEC (coding_system);
10709   eol_type = AREF (spec, 2);
10710   if (VECTORP (eol_type))
10711     return Fcopy_sequence (eol_type);
10712   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10713   return make_number (n);
10714 }
10715
10716 #endif /* emacs */
10717
10718 \f
10719 /*** 12. Postamble ***/
10720
10721 void
10722 init_coding_once (void)
10723 {
10724   int i;
10725
10726   for (i = 0; i < coding_category_max; i++)
10727     {
10728       coding_categories[i].id = -1;
10729       coding_priorities[i] = i;
10730     }
10731
10732   /* ISO2022 specific initialize routine.  */
10733   for (i = 0; i < 0x20; i++)
10734     iso_code_class[i] = ISO_control_0;
10735   for (i = 0x21; i < 0x7F; i++)
10736     iso_code_class[i] = ISO_graphic_plane_0;
10737   for (i = 0x80; i < 0xA0; i++)
10738     iso_code_class[i] = ISO_control_1;
10739   for (i = 0xA1; i < 0xFF; i++)
10740     iso_code_class[i] = ISO_graphic_plane_1;
10741   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10742   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10743   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10744   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10745   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10746   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10747   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10748   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10749   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10750
10751   for (i = 0; i < 256; i++)
10752     {
10753       emacs_mule_bytes[i] = 1;
10754     }
10755   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10756   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10757   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10758   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10759 }
10760
10761 #ifdef emacs
10762
10763 void
10764 syms_of_coding (void)
10765 {
10766   staticpro (&Vcoding_system_hash_table);
10767   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10768
10769   staticpro (&Vsjis_coding_system);
10770   Vsjis_coding_system = Qnil;
10771
10772   staticpro (&Vbig5_coding_system);
10773   Vbig5_coding_system = Qnil;
10774
10775   staticpro (&Vcode_conversion_reused_workbuf);
10776   Vcode_conversion_reused_workbuf = Qnil;
10777
10778   staticpro (&Vcode_conversion_workbuf_name);
10779   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10780
10781   reused_workbuf_in_use = 0;
10782
10783   DEFSYM (Qcharset, "charset");
10784   DEFSYM (Qtarget_idx, "target-idx");
10785   DEFSYM (Qcoding_system_history, "coding-system-history");
10786   Fset (Qcoding_system_history, Qnil);
10787
10788   /* Target FILENAME is the first argument.  */
10789   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10790   /* Target FILENAME is the third argument.  */
10791   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10792
10793   DEFSYM (Qcall_process, "call-process");
10794   /* Target PROGRAM is the first argument.  */
10795   Fput (Qcall_process, Qtarget_idx, make_number (0));
10796
10797   DEFSYM (Qcall_process_region, "call-process-region");
10798   /* Target PROGRAM is the third argument.  */
10799   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10800
10801   DEFSYM (Qstart_process, "start-process");
10802   /* Target PROGRAM is the third argument.  */
10803   Fput (Qstart_process, Qtarget_idx, make_number (2));
10804
10805   DEFSYM (Qopen_network_stream, "open-network-stream");
10806   /* Target SERVICE is the fourth argument.  */
10807   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10808
10809   DEFSYM (Qunix, "unix");
10810   DEFSYM (Qdos, "dos");
10811   DEFSYM (Qmac, "mac");
10812
10813   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10814   DEFSYM (Qundecided, "undecided");
10815   DEFSYM (Qno_conversion, "no-conversion");
10816   DEFSYM (Qraw_text, "raw-text");
10817
10818   DEFSYM (Qiso_2022, "iso-2022");
10819
10820   DEFSYM (Qutf_8, "utf-8");
10821   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10822
10823 #if defined (WINDOWSNT) || defined (CYGWIN)
10824   /* No, not utf-16-le: that one has a BOM.  */
10825   DEFSYM (Qutf_16le, "utf-16le");
10826 #endif
10827
10828   DEFSYM (Qutf_16, "utf-16");
10829   DEFSYM (Qbig, "big");
10830   DEFSYM (Qlittle, "little");
10831
10832   DEFSYM (Qshift_jis, "shift-jis");
10833   DEFSYM (Qbig5, "big5");
10834
10835   DEFSYM (Qcoding_system_p, "coding-system-p");
10836
10837   /* Error signaled when there's a problem with detecting a coding system.  */
10838   DEFSYM (Qcoding_system_error, "coding-system-error");
10839   Fput (Qcoding_system_error, Qerror_conditions,
10840         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10841   Fput (Qcoding_system_error, Qerror_message,
10842         build_pure_c_string ("Invalid coding system"));
10843
10844   DEFSYM (Qtranslation_table, "translation-table");
10845   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10846   DEFSYM (Qtranslation_table_id, "translation-table-id");
10847
10848   /* Coding system emacs-mule and raw-text are for converting only
10849      end-of-line format.  */
10850   DEFSYM (Qemacs_mule, "emacs-mule");
10851
10852   DEFSYM (QCcategory, ":category");
10853   DEFSYM (QCmnemonic, ":mnemonic");
10854   DEFSYM (QCdefault_char, ":default-char");
10855   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10856   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10857   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10858   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10859   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10860
10861   Vcoding_category_table
10862     = Fmake_vector (make_number (coding_category_max), Qnil);
10863   staticpro (&Vcoding_category_table);
10864   /* Followings are target of code detection.  */
10865   ASET (Vcoding_category_table, coding_category_iso_7,
10866         intern_c_string ("coding-category-iso-7"));
10867   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10868         intern_c_string ("coding-category-iso-7-tight"));
10869   ASET (Vcoding_category_table, coding_category_iso_8_1,
10870         intern_c_string ("coding-category-iso-8-1"));
10871   ASET (Vcoding_category_table, coding_category_iso_8_2,
10872         intern_c_string ("coding-category-iso-8-2"));
10873   ASET (Vcoding_category_table, coding_category_iso_7_else,
10874         intern_c_string ("coding-category-iso-7-else"));
10875   ASET (Vcoding_category_table, coding_category_iso_8_else,
10876         intern_c_string ("coding-category-iso-8-else"));
10877   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10878         intern_c_string ("coding-category-utf-8-auto"));
10879   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10880         intern_c_string ("coding-category-utf-8"));
10881   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10882         intern_c_string ("coding-category-utf-8-sig"));
10883   ASET (Vcoding_category_table, coding_category_utf_16_be,
10884         intern_c_string ("coding-category-utf-16-be"));
10885   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10886         intern_c_string ("coding-category-utf-16-auto"));
10887   ASET (Vcoding_category_table, coding_category_utf_16_le,
10888         intern_c_string ("coding-category-utf-16-le"));
10889   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10890         intern_c_string ("coding-category-utf-16-be-nosig"));
10891   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10892         intern_c_string ("coding-category-utf-16-le-nosig"));
10893   ASET (Vcoding_category_table, coding_category_charset,
10894         intern_c_string ("coding-category-charset"));
10895   ASET (Vcoding_category_table, coding_category_sjis,
10896         intern_c_string ("coding-category-sjis"));
10897   ASET (Vcoding_category_table, coding_category_big5,
10898         intern_c_string ("coding-category-big5"));
10899   ASET (Vcoding_category_table, coding_category_ccl,
10900         intern_c_string ("coding-category-ccl"));
10901   ASET (Vcoding_category_table, coding_category_emacs_mule,
10902         intern_c_string ("coding-category-emacs-mule"));
10903   /* Followings are NOT target of code detection.  */
10904   ASET (Vcoding_category_table, coding_category_raw_text,
10905         intern_c_string ("coding-category-raw-text"));
10906   ASET (Vcoding_category_table, coding_category_undecided,
10907         intern_c_string ("coding-category-undecided"));
10908
10909   DEFSYM (Qinsufficient_source, "insufficient-source");
10910   DEFSYM (Qinvalid_source, "invalid-source");
10911   DEFSYM (Qinterrupted, "interrupted");
10912
10913   /* If a symbol has this property, evaluate the value to define the
10914      symbol as a coding system.  */
10915   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10916
10917   defsubr (&Scoding_system_p);
10918   defsubr (&Sread_coding_system);
10919   defsubr (&Sread_non_nil_coding_system);
10920   defsubr (&Scheck_coding_system);
10921   defsubr (&Sdetect_coding_region);
10922   defsubr (&Sdetect_coding_string);
10923   defsubr (&Sfind_coding_systems_region_internal);
10924   defsubr (&Sunencodable_char_position);
10925   defsubr (&Scheck_coding_systems_region);
10926   defsubr (&Sdecode_coding_region);
10927   defsubr (&Sencode_coding_region);
10928   defsubr (&Sdecode_coding_string);
10929   defsubr (&Sencode_coding_string);
10930   defsubr (&Sdecode_sjis_char);
10931   defsubr (&Sencode_sjis_char);
10932   defsubr (&Sdecode_big5_char);
10933   defsubr (&Sencode_big5_char);
10934   defsubr (&Sset_terminal_coding_system_internal);
10935   defsubr (&Sset_safe_terminal_coding_system_internal);
10936   defsubr (&Sterminal_coding_system);
10937   defsubr (&Sset_keyboard_coding_system_internal);
10938   defsubr (&Skeyboard_coding_system);
10939   defsubr (&Sfind_operation_coding_system);
10940   defsubr (&Sset_coding_system_priority);
10941   defsubr (&Sdefine_coding_system_internal);
10942   defsubr (&Sdefine_coding_system_alias);
10943   defsubr (&Scoding_system_put);
10944   defsubr (&Scoding_system_base);
10945   defsubr (&Scoding_system_plist);
10946   defsubr (&Scoding_system_aliases);
10947   defsubr (&Scoding_system_eol_type);
10948   defsubr (&Scoding_system_priority_list);
10949
10950   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10951                doc: /* List of coding systems.
10952
10953 Do not alter the value of this variable manually.  This variable should be
10954 updated by the functions `define-coding-system' and
10955 `define-coding-system-alias'.  */);
10956   Vcoding_system_list = Qnil;
10957
10958   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10959                doc: /* Alist of coding system names.
10960 Each element is one element list of coding system name.
10961 This variable is given to `completing-read' as COLLECTION argument.
10962
10963 Do not alter the value of this variable manually.  This variable should be
10964 updated by the functions `make-coding-system' and
10965 `define-coding-system-alias'.  */);
10966   Vcoding_system_alist = Qnil;
10967
10968   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10969                doc: /* List of coding-categories (symbols) ordered by priority.
10970
10971 On detecting a coding system, Emacs tries code detection algorithms
10972 associated with each coding-category one by one in this order.  When
10973 one algorithm agrees with a byte sequence of source text, the coding
10974 system bound to the corresponding coding-category is selected.
10975
10976 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10977   {
10978     int i;
10979
10980     Vcoding_category_list = Qnil;
10981     for (i = coding_category_max - 1; i >= 0; i--)
10982       Vcoding_category_list
10983         = Fcons (AREF (Vcoding_category_table, i),
10984                  Vcoding_category_list);
10985   }
10986
10987   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10988                doc: /* Specify the coding system for read operations.
10989 It is useful to bind this variable with `let', but do not set it globally.
10990 If the value is a coding system, it is used for decoding on read operation.
10991 If not, an appropriate element is used from one of the coding system alists.
10992 There are three such tables: `file-coding-system-alist',
10993 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10994   Vcoding_system_for_read = Qnil;
10995
10996   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10997                doc: /* Specify the coding system for write operations.
10998 Programs bind this variable with `let', but you should not set it globally.
10999 If the value is a coding system, it is used for encoding of output,
11000 when writing it to a file and when sending it to a file or subprocess.
11001
11002 If this does not specify a coding system, an appropriate element
11003 is used from one of the coding system alists.
11004 There are three such tables: `file-coding-system-alist',
11005 `process-coding-system-alist', and `network-coding-system-alist'.
11006 For output to files, if the above procedure does not specify a coding system,
11007 the value of `buffer-file-coding-system' is used.  */);
11008   Vcoding_system_for_write = Qnil;
11009
11010   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11011                doc: /*
11012 Coding system used in the latest file or process I/O.  */);
11013   Vlast_coding_system_used = Qnil;
11014
11015   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11016                doc: /*
11017 Error status of the last code conversion.
11018
11019 When an error was detected in the last code conversion, this variable
11020 is set to one of the following symbols.
11021   `insufficient-source'
11022   `inconsistent-eol'
11023   `invalid-source'
11024   `interrupted'
11025   `insufficient-memory'
11026 When no error was detected, the value doesn't change.  So, to check
11027 the error status of a code conversion by this variable, you must
11028 explicitly set this variable to nil before performing code
11029 conversion.  */);
11030   Vlast_code_conversion_error = Qnil;
11031
11032   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11033                doc: /*
11034 Non-nil means always inhibit code conversion of end-of-line format.
11035 See info node `Coding Systems' and info node `Text and Binary' concerning
11036 such conversion.  */);
11037   inhibit_eol_conversion = 0;
11038
11039   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11040                doc: /*
11041 Non-nil means process buffer inherits coding system of process output.
11042 Bind it to t if the process output is to be treated as if it were a file
11043 read from some filesystem.  */);
11044   inherit_process_coding_system = 0;
11045
11046   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11047                doc: /*
11048 Alist to decide a coding system to use for a file I/O operation.
11049 The format is ((PATTERN . VAL) ...),
11050 where PATTERN is a regular expression matching a file name,
11051 VAL is a coding system, a cons of coding systems, or a function symbol.
11052 If VAL is a coding system, it is used for both decoding and encoding
11053 the file contents.
11054 If VAL is a cons of coding systems, the car part is used for decoding,
11055 and the cdr part is used for encoding.
11056 If VAL is a function symbol, the function must return a coding system
11057 or a cons of coding systems which are used as above.  The function is
11058 called with an argument that is a list of the arguments with which
11059 `find-operation-coding-system' was called.  If the function can't decide
11060 a coding system, it can return `undecided' so that the normal
11061 code-detection is performed.
11062
11063 See also the function `find-operation-coding-system'
11064 and the variable `auto-coding-alist'.  */);
11065   Vfile_coding_system_alist = Qnil;
11066
11067   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11068                doc: /*
11069 Alist to decide a coding system to use for a process I/O operation.
11070 The format is ((PATTERN . VAL) ...),
11071 where PATTERN is a regular expression matching a program name,
11072 VAL is a coding system, a cons of coding systems, or a function symbol.
11073 If VAL is a coding system, it is used for both decoding what received
11074 from the program and encoding what sent to the program.
11075 If VAL is a cons of coding systems, the car part is used for decoding,
11076 and the cdr part is used for encoding.
11077 If VAL is a function symbol, the function must return a coding system
11078 or a cons of coding systems which are used as above.
11079
11080 See also the function `find-operation-coding-system'.  */);
11081   Vprocess_coding_system_alist = Qnil;
11082
11083   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11084                doc: /*
11085 Alist to decide a coding system to use for a network I/O operation.
11086 The format is ((PATTERN . VAL) ...),
11087 where PATTERN is a regular expression matching a network service name
11088 or is a port number to connect to,
11089 VAL is a coding system, a cons of coding systems, or a function symbol.
11090 If VAL is a coding system, it is used for both decoding what received
11091 from the network stream and encoding what sent to the network stream.
11092 If VAL is a cons of coding systems, the car part is used for decoding,
11093 and the cdr part is used for encoding.
11094 If VAL is a function symbol, the function must return a coding system
11095 or a cons of coding systems which are used as above.
11096
11097 See also the function `find-operation-coding-system'.  */);
11098   Vnetwork_coding_system_alist = Qnil;
11099
11100   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11101                doc: /* Coding system to use with system messages.
11102 Also used for decoding keyboard input on X Window system, and for
11103 encoding standard output and error streams.  */);
11104   Vlocale_coding_system = Qnil;
11105
11106   /* The eol mnemonics are reset in startup.el system-dependently.  */
11107   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11108                doc: /*
11109 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11110   eol_mnemonic_unix = build_pure_c_string (":");
11111
11112   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11113                doc: /*
11114 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11115   eol_mnemonic_dos = build_pure_c_string ("\\");
11116
11117   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11118                doc: /*
11119 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11120   eol_mnemonic_mac = build_pure_c_string ("/");
11121
11122   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11123                doc: /*
11124 String displayed in mode line when end-of-line format is not yet determined.  */);
11125   eol_mnemonic_undecided = build_pure_c_string (":");
11126
11127   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11128                doc: /*
11129 Non-nil enables character translation while encoding and decoding.  */);
11130   Venable_character_translation = Qt;
11131
11132   DEFVAR_LISP ("standard-translation-table-for-decode",
11133                Vstandard_translation_table_for_decode,
11134                doc: /* Table for translating characters while decoding.  */);
11135   Vstandard_translation_table_for_decode = Qnil;
11136
11137   DEFVAR_LISP ("standard-translation-table-for-encode",
11138                Vstandard_translation_table_for_encode,
11139                doc: /* Table for translating characters while encoding.  */);
11140   Vstandard_translation_table_for_encode = Qnil;
11141
11142   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11143                doc: /* Alist of charsets vs revision numbers.
11144 While encoding, if a charset (car part of an element) is found,
11145 designate it with the escape sequence identifying revision (cdr part
11146 of the element).  */);
11147   Vcharset_revision_table = Qnil;
11148
11149   DEFVAR_LISP ("default-process-coding-system",
11150                Vdefault_process_coding_system,
11151                doc: /* Cons of coding systems used for process I/O by default.
11152 The car part is used for decoding a process output,
11153 the cdr part is used for encoding a text to be sent to a process.  */);
11154   Vdefault_process_coding_system = Qnil;
11155
11156   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11157                doc: /*
11158 Table of extra Latin codes in the range 128..159 (inclusive).
11159 This is a vector of length 256.
11160 If Nth element is non-nil, the existence of code N in a file
11161 \(or output of subprocess) doesn't prevent it to be detected as
11162 a coding system of ISO 2022 variant which has a flag
11163 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11164 or reading output of a subprocess.
11165 Only 128th through 159th elements have a meaning.  */);
11166   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11167
11168   DEFVAR_LISP ("select-safe-coding-system-function",
11169                Vselect_safe_coding_system_function,
11170                doc: /*
11171 Function to call to select safe coding system for encoding a text.
11172
11173 If set, this function is called to force a user to select a proper
11174 coding system which can encode the text in the case that a default
11175 coding system used in each operation can't encode the text.  The
11176 function should take care that the buffer is not modified while
11177 the coding system is being selected.
11178
11179 The default value is `select-safe-coding-system' (which see).  */);
11180   Vselect_safe_coding_system_function = Qnil;
11181
11182   DEFVAR_BOOL ("coding-system-require-warning",
11183                coding_system_require_warning,
11184                doc: /* Internal use only.
11185 If non-nil, on writing a file, `select-safe-coding-system-function' is
11186 called even if `coding-system-for-write' is non-nil.  The command
11187 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11188   coding_system_require_warning = 0;
11189
11190
11191   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11192                inhibit_iso_escape_detection,
11193                doc: /*
11194 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11195
11196 When Emacs reads text, it tries to detect how the text is encoded.
11197 This code detection is sensitive to escape sequences.  If Emacs sees
11198 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11199 of the ISO2022 encodings, and decodes text by the corresponding coding
11200 system (e.g. `iso-2022-7bit').
11201
11202 However, there may be a case that you want to read escape sequences in
11203 a file as is.  In such a case, you can set this variable to non-nil.
11204 Then the code detection will ignore any escape sequences, and no text is
11205 detected as encoded in some ISO-2022 encoding.  The result is that all
11206 escape sequences become visible in a buffer.
11207
11208 The default value is nil, and it is strongly recommended not to change
11209 it.  That is because many Emacs Lisp source files that contain
11210 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11211 in Emacs's distribution, and they won't be decoded correctly on
11212 reading if you suppress escape sequence detection.
11213
11214 The other way to read escape sequences in a file without decoding is
11215 to explicitly specify some coding system that doesn't use ISO-2022
11216 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11217   inhibit_iso_escape_detection = 0;
11218
11219   DEFVAR_BOOL ("inhibit-null-byte-detection",
11220                inhibit_null_byte_detection,
11221                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11222 By default, Emacs treats it as binary data, and does not attempt to
11223 decode it.  The effect is as if you specified `no-conversion' for
11224 reading that text.
11225
11226 Set this to non-nil when a regular text happens to include null bytes.
11227 Examples are Index nodes of Info files and null-byte delimited output
11228 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11229 decode text as usual.  */);
11230   inhibit_null_byte_detection = 0;
11231
11232   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11233                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11234 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11235   disable_ascii_optimization = 0;
11236
11237   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11238                doc: /* Char table for translating self-inserting characters.
11239 This is applied to the result of input methods, not their input.
11240 See also `keyboard-translate-table'.
11241
11242 Use of this variable for character code unification was rendered
11243 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11244 internal character representation.  */);
11245   Vtranslation_table_for_input = Qnil;
11246
11247   Lisp_Object args[coding_arg_undecided_max];
11248   memclear (args, sizeof args);
11249
11250   Lisp_Object plist[] =
11251     {
11252       QCname,
11253       args[coding_arg_name] = Qno_conversion,
11254       QCmnemonic,
11255       args[coding_arg_mnemonic] = make_number ('='),
11256       intern_c_string (":coding-type"),
11257       args[coding_arg_coding_type] = Qraw_text,
11258       QCascii_compatible_p,
11259       args[coding_arg_ascii_compatible_p] = Qt,
11260       QCdefault_char,
11261       args[coding_arg_default_char] = make_number (0),
11262       intern_c_string (":for-unibyte"),
11263       args[coding_arg_for_unibyte] = Qt,
11264       intern_c_string (":docstring"),
11265       (build_pure_c_string
11266        ("Do no conversion.\n"
11267         "\n"
11268         "When you visit a file with this coding, the file is read into a\n"
11269         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11270         "character.")),
11271       intern_c_string (":eol-type"),
11272       args[coding_arg_eol_type] = Qunix,
11273     };
11274   args[coding_arg_plist] = CALLMANY (Flist, plist);
11275   Fdefine_coding_system_internal (coding_arg_max, args);
11276
11277   plist[1] = args[coding_arg_name] = Qundecided;
11278   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11279   plist[5] = args[coding_arg_coding_type] = Qundecided;
11280   /* This is already set.
11281      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11282   plist[8] = intern_c_string (":charset-list");
11283   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11284   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11285   plist[13] = build_pure_c_string ("No conversion on encoding, "
11286                                    "automatic conversion on decoding.");
11287   plist[15] = args[coding_arg_eol_type] = Qnil;
11288   args[coding_arg_plist] = CALLMANY (Flist, plist);
11289   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11290   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11291   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11292
11293   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11294
11295   for (int i = 0; i < coding_category_max; i++)
11296     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11297
11298 #if defined (DOS_NT)
11299   system_eol_type = Qdos;
11300 #else
11301   system_eol_type = Qunix;
11302 #endif
11303   staticpro (&system_eol_type);
11304 }
11305
11306 char *
11307 emacs_strerror (int error_number)
11308 {
11309   char *str;
11310
11311   synchronize_system_messages_locale ();
11312   str = strerror (error_number);
11313
11314   if (! NILP (Vlocale_coding_system))
11315     {
11316       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11317                                                       Vlocale_coding_system,
11318                                                       0);
11319       str = SSDATA (dec);
11320     }
11321
11322   return str;
11323 }
11324
11325 #endif /* emacs */