code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
  134   Before using a coding system for code conversion (i.e. decoding and
  135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
  254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
  301
      /* NOTE(review): presumably the hash table that maps a coding-system
         symbol to its attribute vector (see "CODING SYSTEM" in the general
         comments) -- confirm against its users.  */
  302 Lisp_Object Vcoding_system_hash_table;
  303
  304 /* Format of end-of-line decided by system.  This is Qunix on
  305    Unix and Mac, Qdos on DOS/Windows.
  306    This has an effect only for external encoding (i.e. for output to
  307    file and process), not for in-buffer or Lisp string encoding.  */
  308 static Lisp_Object system_eol_type;
  309
  310 #ifdef emacs
  311
  312 /* Coding-systems are handed between Emacs Lisp programs and C internal
  313    routines by the following three variables.
         NOTE(review): only one variable is defined inside this #ifdef;
         "three" looks stale -- confirm.  */
  314 /* Coding system to be used to encode text for terminal display when
  315    terminal coding system is nil.  */
  316 struct coding_system safe_terminal_coding;
  317
  318 #endif /* emacs */
  319
  320 /* Two special coding systems.  */
  321 static Lisp_Object Vsjis_coding_system;
  322 static Lisp_Object Vbig5_coding_system;
 323
  324 /* ISO2022 section.  The macros below take a pointer to a
         `struct coding_system' as CODING and read ISO-2022 related data
         either from the attributes found through CODING->id or from the
         member CODING->spec.iso_2022.  */
  325
      /* The integer stored at index REG of the `coding_attr_iso_initial'
         element of CODING's attributes.  */
  326 #define CODING_ISO_INITIAL(coding, reg)                 \
  327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  328                      coding_attr_iso_initial),          \
  329                reg)))
  330
  331
      /* CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is greater
         than CODING->max_charset_id or the stored value is 255.  CHARSET_ID
         is evaluated more than once.  */
  332 #define CODING_ISO_REQUEST(coding, charset_id)          \
  333   (((charset_id) <= (coding)->max_charset_id            \
  334     ? ((coding)->safe_charsets[charset_id] != 255       \
  335        ? (coding)->safe_charsets[charset_id]            \
  336        : -1)                                            \
  337     : -1))
  338
  339
      /* Plain accessors for members of CODING->spec.iso_2022.  */
  340 #define CODING_ISO_FLAGS(coding)        \
  341   ((coding)->spec.iso_2022.flags)
  342 #define CODING_ISO_DESIGNATION(coding, reg)     \
  343   ((coding)->spec.iso_2022.current_designation[reg])
  344 #define CODING_ISO_INVOCATION(coding, plane)    \
  345   ((coding)->spec.iso_2022.current_invocation[plane])
  346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  347   ((coding)->spec.iso_2022.single_shifting)
  348 #define CODING_ISO_BOL(coding)  \
  349   ((coding)->spec.iso_2022.bol)
      /* The designation of whatever CODING_ISO_INVOCATION yields for PLANE,
         or -1 if that invocation value is negative.  */
  350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
  352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
      /* Unlike the other accessors, this one yields the ADDRESS of the
         member, not its value.  */
  353 #define CODING_ISO_CMP_STATUS(coding)   \
  354   (&(coding)->spec.iso_2022.cmp_status)
  355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
  356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
  357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
  358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
  360 /* Control characters of ISO2022.  Each is a single byte value; every
         one of them has a dedicated class in `enum iso_code_class_type'.  */
  361                         /* code */      /* function */
  362 #define ISO_CODE_SO     0x0E            /* shift-out */
  363 #define ISO_CODE_SI     0x0F            /* shift-in */
  364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  365 #define ISO_CODE_ESC    0x1B            /* escape */
  366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
  370 /* Every 1-byte code of ISO2022 is classified into one of the
  371    following classes.  */
  372 enum iso_code_class_type
  373   {
  374     ISO_control_0,              /* Control codes in the range
  375                                    0x00..0x1F and 0x7F, except for the
  376                                    following 4 codes.  */
  377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  381     ISO_control_1,              /* Control codes in the range
  382                                    0x80..0x9F, except for the
  383                                    following 3 codes.  */
  384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
          /* NOTE(review): 0x7F is named both here and in the description
             of ISO_control_0 above; confirm which class the code that
             fills the class table actually assigns to it.  */
  387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  391   };
 392
  393 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
  394     `iso-flags' attribute of an iso2022 coding system.  */
  395
  396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  397    instead of the correct short-form sequence (e.g. ESC $ A).  */
  398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  399
  400 /* If set, reset graphic planes and registers at end-of-line to the
  401    initial state.  */
  402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  403
  404 /* If set, reset graphic planes and registers before any control
  405    characters to the initial state.  */
  406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  407
  408 /* If set, encode by 7-bit environment.  */
  409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  410
  411 /* If set, use locking-shift function.  */
  412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  413
  414 /* If set, use single-shift function.  Overwrite
  415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  417
  418 /* If set, use designation escape sequence.  */
  419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  420
  421 /* If set, produce revision number sequence.  */
  422 #define CODING_ISO_FLAG_REVISION        0x0080
  423
  424 /* If set, produce ISO6429's direction specifying sequence.  */
  425 #define CODING_ISO_FLAG_DIRECTION       0x0100
  426
  427 /* If set, assume designation states are reset at beginning of line on
  428    output.  */
  429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  430
  431 /* If set, designation sequence should be placed at beginning of line
  432    on output.  */
  433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  434
  435 /* If set, do not encode unsafe characters on output.  */
  436 #define CODING_ISO_FLAG_SAFE            0x0800
  437
  438 /* If set, extra latin codes (128..159) are accepted as a valid code
  439    on input.  */
  440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  441
      /* NOTE(review): the flags from here on carry no description in this
         file section; their meaning is presumably what the names suggest
         -- confirm at their points of use.  Bit 0x4000 is held by the
         commented-out EUC_TW_SHIFT flag, and bits 0x40000 and 0x80000 are
         not assigned here.  */
  442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  443
  444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
  445
  446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  447
  448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  449
  450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
  451
  452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  453
  454 /* A character to be produced on output if encoding of the original
  455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
  458 /* UTF-8 section.  Accessor for the member CODING->spec.utf_8_bom.  */
  459 #define CODING_UTF_8_BOM(coding)        \
  460   ((coding)->spec.utf_8_bom)
  461
  462 /* UTF-16 section.  Accessors for the members of CODING->spec.utf_16.  */
  463 #define CODING_UTF_16_BOM(coding)       \
  464   ((coding)->spec.utf_16.bom)
  465
  466 #define CODING_UTF_16_ENDIAN(coding)    \
  467   ((coding)->spec.utf_16.endian)
  468
  469 #define CODING_UTF_16_SURROGATE(coding) \
  470   ((coding)->spec.utf_16.surrogate)
  471
  472
  473 /* CCL section.  These read elements of the attributes found through
         CODING->id rather than members of CODING->spec.  The expansions of
         the DECODER and ENCODER macros are not wrapped in an extra pair of
         parentheses; each is a single AREF (...) form.  CODING_CCL_VALIDS
         applies SDATA to the `coding_attr_ccl_valids' element.  */
  474 #define CODING_CCL_DECODER(coding)      \
  475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  476 #define CODING_CCL_ENCODER(coding)      \
  477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  478 #define CODING_CCL_VALIDS(coding)                                          \
  479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
  481 /* Index for each coding category in `coding_categories'.  The same
         values are used as bit positions by the CATEGORY_MASK_XXX macros,
         and coding_category_max (the number of categories) is the
         dimension of the arrays `coding_priorities' and
         `coding_categories'.  */
  482
  483 enum coding_category
  484   {
  485     coding_category_iso_7,
  486     coding_category_iso_7_tight,
  487     coding_category_iso_8_1,
  488     coding_category_iso_8_2,
  489     coding_category_iso_7_else,
  490     coding_category_iso_8_else,
  491     coding_category_utf_8_auto,
  492     coding_category_utf_8_nosig,
  493     coding_category_utf_8_sig,
  494     coding_category_utf_16_auto,
  495     coding_category_utf_16_be,
  496     coding_category_utf_16_le,
  497     coding_category_utf_16_be_nosig,
  498     coding_category_utf_16_le_nosig,
  499     coding_category_charset,
  500     coding_category_sjis,
  501     coding_category_big5,
  502     coding_category_ccl,
  503     coding_category_emacs_mule,
  504     /* All above are targets of code detection.  */
  505     coding_category_raw_text,
  506     coding_category_undecided,
          /* Not a category: the count of enumerators above.  */
  507     coding_category_max
  508   };
 509
  510 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
         the single bit whose position is the matching `enum
         coding_category' value.  There is a mask for raw-text but none
         for coding_category_undecided.  */
  511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
  518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
  519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
  520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  531
  532 /* This value is returned if detect_coding_mask () finds nothing other
  533    than ASCII characters.  It is the union of the masks of all
         categories that are targets of code detection, so it does not
         include CATEGORY_MASK_RAW_TEXT.  */
  534 #define CATEGORY_MASK_ANY               \
  535   (CATEGORY_MASK_ISO_7                  \
  536    | CATEGORY_MASK_ISO_7_TIGHT          \
  537    | CATEGORY_MASK_ISO_8_1              \
  538    | CATEGORY_MASK_ISO_8_2              \
  539    | CATEGORY_MASK_ISO_7_ELSE           \
  540    | CATEGORY_MASK_ISO_8_ELSE           \
  541    | CATEGORY_MASK_UTF_8_AUTO           \
  542    | CATEGORY_MASK_UTF_8_NOSIG          \
  543    | CATEGORY_MASK_UTF_8_SIG            \
  544    | CATEGORY_MASK_UTF_16_AUTO          \
  545    | CATEGORY_MASK_UTF_16_BE            \
  546    | CATEGORY_MASK_UTF_16_LE            \
  547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  549    | CATEGORY_MASK_CHARSET              \
  550    | CATEGORY_MASK_SJIS                 \
  551    | CATEGORY_MASK_BIG5                 \
  552    | CATEGORY_MASK_CCL                  \
  553    | CATEGORY_MASK_EMACS_MULE)
  554
  555
      /* Convenience unions of the ISO-2022 category masks.  */
  556 #define CATEGORY_MASK_ISO_7BIT \
  557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  558
  559 #define CATEGORY_MASK_ISO_8BIT \
  560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  561
  562 #define CATEGORY_MASK_ISO_ELSE \
  563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  564
      /* Every ISO category except the two 8-bit ones (iso_8_1, iso_8_2).  */
  565 #define CATEGORY_MASK_ISO_ESCAPE        \
  566   (CATEGORY_MASK_ISO_7                  \
  567    | CATEGORY_MASK_ISO_7_TIGHT          \
  568    | CATEGORY_MASK_ISO_7_ELSE           \
  569    | CATEGORY_MASK_ISO_8_ELSE)
  570
      /* All six ISO categories.  */
  571 #define CATEGORY_MASK_ISO       \
  572   (  CATEGORY_MASK_ISO_7BIT     \
  573      | CATEGORY_MASK_ISO_8BIT   \
  574      | CATEGORY_MASK_ISO_ELSE)
  575
      /* All five UTF-16 categories.  */
  576 #define CATEGORY_MASK_UTF_16            \
  577   (CATEGORY_MASK_UTF_16_AUTO            \
  578    | CATEGORY_MASK_UTF_16_BE            \
  579    | CATEGORY_MASK_UTF_16_LE            \
  580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
  582
      /* All three UTF-8 categories.  */
  583 #define CATEGORY_MASK_UTF_8     \
  584   (CATEGORY_MASK_UTF_8_AUTO     \
  585    | CATEGORY_MASK_UTF_8_NOSIG  \
  586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
  615
      /* Store into ATTRS what CODING_ID_ATTRS yields for CODING->id, then
         store into CHARSET_LIST what CODING_ATTR_CHARSET_LIST yields for
         ATTRS.  ATTRS and CHARSET_LIST must be assignable, and ATTRS is
         evaluated twice.  */
  616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  617   do {                                                  \
  618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
  647 /* Safely get one byte from the source text pointed by SRC which ends
  648    at SRC_END, and set C to that byte.  If there are not enough bytes
  649    in the source, it jumps to 'no_more_source', after recording
         CODING_RESULT_INSUFFICIENT_SRC when SRC has advanced past SRC_BASE.
         If MULTIBYTEP and the byte has its high bit set, then either
         (a) the byte is 0xC0 or 0xC1: it is combined with the next byte
         into the single value ((byte & 1) << 6) | next
         (NOTE(review): presumably the two-byte form of a raw 8-bit byte --
         confirm); the next byte is read without a SRC_END check; or
         (b) otherwise: the multibyte character at SRC is read with
         string_char, C is set to the
  651    negative value of the character code, and CODING_RESULT_INVALID_SRC
         is recorded.  CONSUMED_CHARS is incremented once per fetch.
         The caller should declare
  652    and set these variables appropriately in advance:
  653         src, src_end, src_base, multibytep, consumed_chars, coding
         and must provide the label `no_more_source'.  */
  654
  655 #define ONE_MORE_BYTE(c)                                \
  656   do {                                                  \
  657     if (src == src_end)                                 \
  658       {                                                 \
  659         if (src_base < src)                             \
  660           record_conversion_result                      \
  661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  662         goto no_more_source;                            \
  663       }                                                 \
  664     c = *src++;                                         \
  665     if (multibytep && (c & 0x80))                       \
  666       {                                                 \
  667         if ((c & 0xFE) == 0xC0)                         \
  668           c = ((c & 1) << 6) | *src++;                  \
  669         else                                            \
  670           {                                             \
  671             src--;                                      \
  672             c = - string_char (src, &src, NULL);        \
  673             record_conversion_result                    \
  674               (coding, CODING_RESULT_INVALID_SRC);      \
  675           }                                             \
  676       }                                                 \
  677     consumed_chars++;                                   \
  678   } while (0)
 679
  680 /* Safely get two bytes from the source text pointed by SRC which ends
  681    at SRC_END, and set C1 and C2 to those bytes while skipping the
  682    heading multibyte characters.  If there are not enough bytes in the
  683    source, it jumps to 'no_more_source'.  With MULTIBYTEP, a two-byte
         sequence led by 0xC0 or 0xC1 is combined into one byte value for
         either C1 or C2.  Any other multibyte character found for C1 is
         skipped as a whole (SRC advances by BYTES_BY_CHAR_HEAD) and the
         fetch of C1 is retried.  If such
  684    a multibyte character is found for C2, C2 is set to -1 (not to the
         negated character code) and only its leading byte is consumed.
         Unlike ONE_MORE_BYTE, this macro neither records a conversion
         result nor counts consumed characters.
  685    The caller should declare and set these
  686    variables appropriately in advance:
  687         src, src_end, multibytep
  688    It is intended that this macro is used in detect_coding_utf_16.  */
  689
  690 #define TWO_MORE_BYTES(c1, c2)                          \
  691   do {                                                  \
  692     do {                                                \
  693       if (src == src_end)                               \
  694         goto no_more_source;                            \
  695       c1 = *src++;                                      \
  696       if (multibytep && (c1 & 0x80))                    \
  697         {                                               \
  698           if ((c1 & 0xFE) == 0xC0)                      \
  699             c1 = ((c1 & 1) << 6) | *src++;              \
  700           else                                          \
  701             {                                           \
  702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  703               c1 = -1;                                  \
  704             }                                           \
  705         }                                               \
  706     } while (c1 < 0);                                   \
  707     if (src == src_end)                                 \
  708       goto no_more_source;                              \
  709     c2 = *src++;                                        \
  710     if (multibytep && (c2 & 0x80))                      \
  711       {                                                 \
  712         if ((c2 & 0xFE) == 0xC0)                        \
  713           c2 = ((c2 & 1) << 6) | *src++;                \
  714         else                                            \
  715           c2 = -1;                                      \
  716       }                                                 \
  717   } while (0)
 718
 719
  720 /* Store a byte C in the place pointed by DST and increment DST to the
  721    next free point, and increment PRODUCED_CHARS.  The caller should
  722    assure that C is 0..127, and declare and set the variables `dst'
  723    and `produced_chars' appropriately in advance.  The macro itself
         makes no check that there is room at DST.
  724 */
  725
  726
  727 #define EMIT_ONE_ASCII_BYTE(c)  \
  728   do {                          \
  729     produced_chars++;           \
  730     *dst++ = (c);               \
  731   } while (0)
  732
  733
  734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  C1 is
         stored before C2, and PRODUCED_CHARS grows by 2.  */
  735
  736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  737   do {                                  \
  738     produced_chars += 2;                \
  739     *dst++ = (c1), *dst++ = (c2);       \
  740   } while (0)
  741
  742
  743 /* Store a byte C in the place pointed by DST and increment DST to the
  744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
  745    store in an appropriate multibyte form: a value of 0x80 or more is
         first converted with BYTE8_TO_CHAR, and the result is written with
         CHAR_STRING_ADVANCE, so DST may advance by more than one byte.
         The caller should
  746    declare and set the variables `dst', `multibytep' and
         `produced_chars' appropriately
  747    in advance.  */
  748
  749 #define EMIT_ONE_BYTE(c)                \
  750   do {                                  \
  751     produced_chars++;                   \
  752     if (multibytep)                     \
  753       {                                 \
  754         unsigned ch = (c);              \
  755         if (ch >= 0x80)                 \
  756           ch = BYTE8_TO_CHAR (ch);      \
  757         CHAR_STRING_ADVANCE (ch, dst);  \
  758       }                                 \
  759     else                                \
  760       *dst++ = (c);                     \
  761   } while (0)
  762
  763
  764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
         grows by 2 and C1 is stored before C2.  */
  765
  766 #define EMIT_TWO_BYTES(c1, c2)          \
  767   do {                                  \
  768     produced_chars += 2;                \
  769     if (multibytep)                     \
  770       {                                 \
  771         unsigned ch;                    \
  772                                         \
  773         ch = (c1);                      \
  774         if (ch >= 0x80)                 \
  775           ch = BYTE8_TO_CHAR (ch);      \
  776         CHAR_STRING_ADVANCE (ch, dst);  \
  777         ch = (c2);                      \
  778         if (ch >= 0x80)                 \
  779           ch = BYTE8_TO_CHAR (ch);      \
  780         CHAR_STRING_ADVANCE (ch, dst);  \
  781       }                                 \
  782     else                                \
  783       {                                 \
  784         *dst++ = (c1);                  \
  785         *dst++ = (c2);                  \
  786       }                                 \
  787   } while (0)
  788
  789
      /* Emit three bytes C1, C2, C3 in that order, built from
         EMIT_ONE_BYTE followed by EMIT_TWO_BYTES.  */
  790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
  791   do {                                  \
  792     EMIT_ONE_BYTE (c1);                 \
  793     EMIT_TWO_BYTES (c2, c3);            \
  794   } while (0)
  795
  796
      /* Emit four bytes C1..C4 in that order, built from two uses of
         EMIT_TWO_BYTES.  */
  797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  798   do {                                          \
  799     EMIT_TWO_BYTES (c1, c2);                    \
  800     EMIT_TWO_BYTES (c3, c4);                    \
  801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1012     string_overflow ();
1013   coding->destination = xrealloc (coding->destination,
1014                                   coding->dst_bytes + bytes);
1015   coding->dst_bytes += bytes;
1016 }
1017
1018 static void
1019 coding_alloc_by_making_gap (struct coding_system *coding,
1020                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1021 {
1022   if (EQ (coding->src_object, coding->dst_object))
1023     {
1024       /* The gap may contain the produced data at the head and not-yet
1025          consumed data at the tail.  To preserve those data, we at
1026          first make the gap size to zero, then increase the gap
1027          size.  */
1028       ptrdiff_t add = GAP_SIZE;
1029
1030       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1031       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1032       make_gap (bytes);
1033       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1034       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1035     }
1036   else
1037     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1038 }
1039
1040
1041 static unsigned char *
1042 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1043                    unsigned char *dst)
1044 {
1045   ptrdiff_t offset = dst - coding->destination;
1046
1047   if (BUFFERP (coding->dst_object))
1048     {
1049       struct buffer *buf = XBUFFER (coding->dst_object);
1050
1051       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1052     }
1053   else
1054     coding_alloc_by_realloc (coding, nbytes);
1055   coding_set_destination (coding);
1056   dst = coding->destination + offset;
1057   return dst;
1058 }
1059
1060 /** Macros for annotations.  */
1061
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the annotated text.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kinds of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1090
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and mark `coding' as annotated.  The
   caller has to have a variable `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1101
/* Store a composition annotation at BUF, advancing BUF past it: the
   common header (with LENGTH 5 and CODING_ANNOTATE_COMPOSITION_MASK)
   for NCHARS characters, followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1108
1109
/* Store a charset annotation at BUF, advancing BUF past it: the
   common header (with LENGTH 4 and CODING_ANNOTATE_CHARSET_MASK) for
   NCHARS characters, followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1115
1116
/* Bit flags that may be combined in coding->eol_seen; one bit for
   each end-of-line form encountered.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1123
1124 \f
1125 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1126
1127
1128
1129 \f
1130 /*** 3. UTF-8 ***/
1131
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits: a complete 1-octet character, a trailing (extra) octet, or
   the leading octet of a 2-, 3-, 4- or 5-octet sequence.  */

#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))

/* The three octets of the UTF-8 signature (byte order mark).  */

#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1145
1146 /* Unlike the other detect_coding_XXX, this function counts the number
1147    of characters and checks the EOL format.  */
1148
1149 static bool
1150 detect_coding_utf_8 (struct coding_system *coding,
1151                      struct coding_detection_info *detect_info)
1152 {
1153   const unsigned char *src = coding->source, *src_base;
1154   const unsigned char *src_end = coding->source + coding->src_bytes;
1155   bool multibytep = coding->src_multibyte;
1156   ptrdiff_t consumed_chars = 0;
1157   bool bom_found = 0;
1158   ptrdiff_t nchars = coding->head_ascii;
1159   int eol_seen = coding->eol_seen;
1160
1161   detect_info->checked |= CATEGORY_MASK_UTF_8;
1162   /* A coding system of this category is always ASCII compatible.  */
1163   src += nchars;
1164
1165   if (src == coding->source     /* BOM should be at the head.  */
1166       && src + 3 < src_end      /* BOM is 3-byte long.  */
1167       && src[0] == UTF_8_BOM_1
1168       && src[1] == UTF_8_BOM_2
1169       && src[2] == UTF_8_BOM_3)
1170     {
1171       bom_found = 1;
1172       src += 3;
1173       nchars++;
1174     }
1175
1176   while (1)
1177     {
1178       int c, c1, c2, c3, c4;
1179
1180       src_base = src;
1181       ONE_MORE_BYTE (c);
1182       if (c < 0 || UTF_8_1_OCTET_P (c))
1183         {
1184           nchars++;
1185           if (c == '\r')
1186             {
1187               if (src < src_end && *src == '\n')
1188                 {
1189                   eol_seen |= EOL_SEEN_CRLF;
1190                   src++;
1191                   nchars++;
1192                 }
1193               else
1194                 eol_seen |= EOL_SEEN_CR;
1195             }
1196           else if (c == '\n')
1197             eol_seen |= EOL_SEEN_LF;
1198           continue;
1199         }
1200       ONE_MORE_BYTE (c1);
1201       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1202         break;
1203       if (UTF_8_2_OCTET_LEADING_P (c))
1204         {
1205           nchars++;
1206           continue;
1207         }
1208       ONE_MORE_BYTE (c2);
1209       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1210         break;
1211       if (UTF_8_3_OCTET_LEADING_P (c))
1212         {
1213           nchars++;
1214           continue;
1215         }
1216       ONE_MORE_BYTE (c3);
1217       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1218         break;
1219       if (UTF_8_4_OCTET_LEADING_P (c))
1220         {
1221           nchars++;
1222           continue;
1223         }
1224       ONE_MORE_BYTE (c4);
1225       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1226         break;
1227       if (UTF_8_5_OCTET_LEADING_P (c))
1228         {
1229           nchars++;
1230           continue;
1231         }
1232       break;
1233     }
1234   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1235   return 0;
1236
1237  no_more_source:
1238   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1239     {
1240       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1241       return 0;
1242     }
1243   if (bom_found)
1244     {
1245       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1246       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1247     }
1248   else
1249     {
1250       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1251       if (nchars < src_end - coding->source)
1252         /* The found characters are less than source bytes, which
1253            means that we found a valid non-ASCII characters.  */
1254         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1255     }
1256   coding->detected_utf8_bytes = src_base - coding->source;
1257   coding->detected_utf8_chars = nchars;
1258   return 1;
1259 }
1260
1261
1262 static void
1263 decode_coding_utf_8 (struct coding_system *coding)
1264 {
1265   const unsigned char *src = coding->source + coding->consumed;
1266   const unsigned char *src_end = coding->source + coding->src_bytes;
1267   const unsigned char *src_base;
1268   int *charbuf = coding->charbuf + coding->charbuf_used;
1269   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1270   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1271   bool multibytep = coding->src_multibyte;
1272   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1273   bool eol_dos
1274     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1275   int byte_after_cr = -1;
1276
1277   if (bom != utf_without_bom)
1278     {
1279       int c1, c2, c3;
1280
1281       src_base = src;
1282       ONE_MORE_BYTE (c1);
1283       if (! UTF_8_3_OCTET_LEADING_P (c1))
1284         src = src_base;
1285       else
1286         {
1287           ONE_MORE_BYTE (c2);
1288           if (! UTF_8_EXTRA_OCTET_P (c2))
1289             src = src_base;
1290           else
1291             {
1292               ONE_MORE_BYTE (c3);
1293               if (! UTF_8_EXTRA_OCTET_P (c3))
1294                 src = src_base;
1295               else
1296                 {
1297                   if ((c1 != UTF_8_BOM_1)
1298                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1299                     src = src_base;
1300                   else
1301                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1302                 }
1303             }
1304         }
1305     }
1306   CODING_UTF_8_BOM (coding) = utf_without_bom;
1307
1308   while (1)
1309     {
1310       int c, c1, c2, c3, c4, c5;
1311
1312       src_base = src;
1313       consumed_chars_base = consumed_chars;
1314
1315       if (charbuf >= charbuf_end)
1316         {
1317           if (byte_after_cr >= 0)
1318             src_base--;
1319           break;
1320         }
1321
1322       /* In the simple case, rapidly handle ordinary characters */
1323       if (multibytep && ! eol_dos
1324           && charbuf < charbuf_end - 6 && src < src_end - 6)
1325         {
1326           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1327             {
1328               c1 = *src;
1329               if (c1 & 0x80)
1330                 break;
1331               src++;
1332               consumed_chars++;
1333               *charbuf++ = c1;
1334
1335               c1 = *src;
1336               if (c1 & 0x80)
1337                 break;
1338               src++;
1339               consumed_chars++;
1340               *charbuf++ = c1;
1341
1342               c1 = *src;
1343               if (c1 & 0x80)
1344                 break;
1345               src++;
1346               consumed_chars++;
1347               *charbuf++ = c1;
1348
1349               c1 = *src;
1350               if (c1 & 0x80)
1351                 break;
1352               src++;
1353               consumed_chars++;
1354               *charbuf++ = c1;
1355             }
1356           /* If we handled at least one character, restart the main loop.  */
1357           if (src != src_base)
1358             continue;
1359         }
1360
1361       if (byte_after_cr >= 0)
1362         c1 = byte_after_cr, byte_after_cr = -1;
1363       else
1364         ONE_MORE_BYTE (c1);
1365       if (c1 < 0)
1366         {
1367           c = - c1;
1368         }
1369       else if (UTF_8_1_OCTET_P (c1))
1370         {
1371           if (eol_dos && c1 == '\r')
1372             ONE_MORE_BYTE (byte_after_cr);
1373           c = c1;
1374         }
1375       else
1376         {
1377           ONE_MORE_BYTE (c2);
1378           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1379             goto invalid_code;
1380           if (UTF_8_2_OCTET_LEADING_P (c1))
1381             {
1382               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1383               /* Reject overlong sequences here and below.  Encoders
1384                  producing them are incorrect, they can be misleading,
1385                  and they mess up read/write invariance.  */
1386               if (c < 128)
1387                 goto invalid_code;
1388             }
1389           else
1390             {
1391               ONE_MORE_BYTE (c3);
1392               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1393                 goto invalid_code;
1394               if (UTF_8_3_OCTET_LEADING_P (c1))
1395                 {
1396                   c = (((c1 & 0xF) << 12)
1397                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1398                   if (c < 0x800
1399                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1400                     goto invalid_code;
1401                 }
1402               else
1403                 {
1404                   ONE_MORE_BYTE (c4);
1405                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1406                     goto invalid_code;
1407                   if (UTF_8_4_OCTET_LEADING_P (c1))
1408                     {
1409                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1410                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1411                     if (c < 0x10000)
1412                       goto invalid_code;
1413                     }
1414                   else
1415                     {
1416                       ONE_MORE_BYTE (c5);
1417                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1418                         goto invalid_code;
1419                       if (UTF_8_5_OCTET_LEADING_P (c1))
1420                         {
1421                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1422                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1423                                | (c5 & 0x3F));
1424                           if ((c > MAX_CHAR) || (c < 0x200000))
1425                             goto invalid_code;
1426                         }
1427                       else
1428                         goto invalid_code;
1429                     }
1430                 }
1431             }
1432         }
1433
1434       *charbuf++ = c;
1435       continue;
1436
1437     invalid_code:
1438       src = src_base;
1439       consumed_chars = consumed_chars_base;
1440       ONE_MORE_BYTE (c);
1441       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1442     }
1443
1444  no_more_source:
1445   coding->consumed_char += consumed_chars_base;
1446   coding->consumed = src_base - coding->source;
1447   coding->charbuf_used = charbuf - coding->charbuf;
1448 }
1449
1450
1451 static bool
1452 encode_coding_utf_8 (struct coding_system *coding)
1453 {
1454   bool multibytep = coding->dst_multibyte;
1455   int *charbuf = coding->charbuf;
1456   int *charbuf_end = charbuf + coding->charbuf_used;
1457   unsigned char *dst = coding->destination + coding->produced;
1458   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1459   ptrdiff_t produced_chars = 0;
1460   int c;
1461
1462   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1463     {
1464       ASSURE_DESTINATION (3);
1465       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1466       CODING_UTF_8_BOM (coding) = utf_without_bom;
1467     }
1468
1469   if (multibytep)
1470     {
1471       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1472
1473       while (charbuf < charbuf_end)
1474         {
1475           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1476
1477           ASSURE_DESTINATION (safe_room);
1478           c = *charbuf++;
1479           if (CHAR_BYTE8_P (c))
1480             {
1481               c = CHAR_TO_BYTE8 (c);
1482               EMIT_ONE_BYTE (c);
1483             }
1484           else
1485             {
1486               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1487               for (p = str; p < pend; p++)
1488                 EMIT_ONE_BYTE (*p);
1489             }
1490         }
1491     }
1492   else
1493     {
1494       int safe_room = MAX_MULTIBYTE_LENGTH;
1495
1496       while (charbuf < charbuf_end)
1497         {
1498           ASSURE_DESTINATION (safe_room);
1499           c = *charbuf++;
1500           if (CHAR_BYTE8_P (c))
1501             *dst++ = CHAR_TO_BYTE8 (c);
1502           else
1503             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1504         }
1505       produced_chars = dst - (coding->destination + coding->produced);
1506     }
1507   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1508   coding->produced_char += produced_chars;
1509   coding->produced = dst - coding->destination;
1510   return 0;
1511 }
1512
1513
/* Predicates telling whether the 16-bit code unit VAL is a UTF-16
   high (0xD800..0xDBFF) or low (0xDC00..0xDFFF) surrogate.  */

#define UTF_16_HIGH_SURROGATE_P(val) \
  (0xD800 == ((val) & 0xFC00))

#define UTF_16_LOW_SURROGATE_P(val) \
  (0xDC00 == ((val) & 0xFC00))
1522
1523
1524 static bool
1525 detect_coding_utf_16 (struct coding_system *coding,
1526                       struct coding_detection_info *detect_info)
1527 {
1528   const unsigned char *src = coding->source;
1529   const unsigned char *src_end = coding->source + coding->src_bytes;
1530   bool multibytep = coding->src_multibyte;
1531   int c1, c2;
1532
1533   detect_info->checked |= CATEGORY_MASK_UTF_16;
1534   if (coding->mode & CODING_MODE_LAST_BLOCK
1535       && (coding->src_chars & 1))
1536     {
1537       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1538       return 0;
1539     }
1540
1541   TWO_MORE_BYTES (c1, c2);
1542   if ((c1 == 0xFF) && (c2 == 0xFE))
1543     {
1544       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1545                              | CATEGORY_MASK_UTF_16_AUTO);
1546       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1547                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1548                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1549     }
1550   else if ((c1 == 0xFE) && (c2 == 0xFF))
1551     {
1552       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1553                              | CATEGORY_MASK_UTF_16_AUTO);
1554       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1555                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1556                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1557     }
1558   else if (c2 < 0)
1559     {
1560       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1561       return 0;
1562     }
1563   else
1564     {
1565       /* We check the dispersion of Eth and Oth bytes where E is even and
1566          O is odd.  If both are high, we assume binary data.*/
1567       unsigned char e[256], o[256];
1568       unsigned e_num = 1, o_num = 1;
1569
1570       memset (e, 0, 256);
1571       memset (o, 0, 256);
1572       e[c1] = 1;
1573       o[c2] = 1;
1574
1575       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1576                                 |CATEGORY_MASK_UTF_16_BE
1577                                 | CATEGORY_MASK_UTF_16_LE);
1578
1579       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1580              != CATEGORY_MASK_UTF_16)
1581         {
1582           TWO_MORE_BYTES (c1, c2);
1583           if (c2 < 0)
1584             break;
1585           if (! e[c1])
1586             {
1587               e[c1] = 1;
1588               e_num++;
1589               if (e_num >= 128)
1590                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1591             }
1592           if (! o[c2])
1593             {
1594               o[c2] = 1;
1595               o_num++;
1596               if (o_num >= 128)
1597                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1598             }
1599         }
1600       return 0;
1601     }
1602
1603  no_more_source:
1604   return 1;
1605 }
1606
1607 static void
1608 decode_coding_utf_16 (struct coding_system *coding)
1609 {
1610   const unsigned char *src = coding->source + coding->consumed;
1611   const unsigned char *src_end = coding->source + coding->src_bytes;
1612   const unsigned char *src_base;
1613   int *charbuf = coding->charbuf + coding->charbuf_used;
1614   /* We may produces at most 3 chars in one loop.  */
1615   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1616   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1617   bool multibytep = coding->src_multibyte;
1618   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1619   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1620   int surrogate = CODING_UTF_16_SURROGATE (coding);
1621   bool eol_dos
1622     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1623   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1624
1625   if (bom == utf_with_bom)
1626     {
1627       int c, c1, c2;
1628
1629       src_base = src;
1630       ONE_MORE_BYTE (c1);
1631       ONE_MORE_BYTE (c2);
1632       c = (c1 << 8) | c2;
1633
1634       if (endian == utf_16_big_endian
1635           ? c != 0xFEFF : c != 0xFFFE)
1636         {
1637           /* The first two bytes are not BOM.  Treat them as bytes
1638              for a normal character.  */
1639           src = src_base;
1640         }
1641       CODING_UTF_16_BOM (coding) = utf_without_bom;
1642     }
1643   else if (bom == utf_detect_bom)
1644     {
1645       /* We have already tried to detect BOM and failed in
1646          detect_coding.  */
1647       CODING_UTF_16_BOM (coding) = utf_without_bom;
1648     }
1649
1650   while (1)
1651     {
1652       int c, c1, c2;
1653
1654       src_base = src;
1655       consumed_chars_base = consumed_chars;
1656
1657       if (charbuf >= charbuf_end)
1658         {
1659           if (byte_after_cr1 >= 0)
1660             src_base -= 2;
1661           break;
1662         }
1663
1664       if (byte_after_cr1 >= 0)
1665         c1 = byte_after_cr1, byte_after_cr1 = -1;
1666       else
1667         ONE_MORE_BYTE (c1);
1668       if (c1 < 0)
1669         {
1670           *charbuf++ = -c1;
1671           continue;
1672         }
1673       if (byte_after_cr2 >= 0)
1674         c2 = byte_after_cr2, byte_after_cr2 = -1;
1675       else
1676         ONE_MORE_BYTE (c2);
1677       if (c2 < 0)
1678         {
1679           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1680           *charbuf++ = -c2;
1681           continue;
1682         }
1683       c = (endian == utf_16_big_endian
1684            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1685
1686       if (surrogate)
1687         {
1688           if (! UTF_16_LOW_SURROGATE_P (c))
1689             {
1690               if (endian == utf_16_big_endian)
1691                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1692               else
1693                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1694               *charbuf++ = c1;
1695               *charbuf++ = c2;
1696               if (UTF_16_HIGH_SURROGATE_P (c))
1697                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1698               else
1699                 *charbuf++ = c;
1700             }
1701           else
1702             {
1703               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1704               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1705               *charbuf++ = 0x10000 + c;
1706             }
1707         }
1708       else
1709         {
1710           if (UTF_16_HIGH_SURROGATE_P (c))
1711             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1712           else
1713             {
1714               if (eol_dos && c == '\r')
1715                 {
1716                   ONE_MORE_BYTE (byte_after_cr1);
1717                   ONE_MORE_BYTE (byte_after_cr2);
1718                 }
1719               *charbuf++ = c;
1720             }
1721         }
1722     }
1723
1724  no_more_source:
1725   coding->consumed_char += consumed_chars_base;
1726   coding->consumed = src_base - coding->source;
1727   coding->charbuf_used = charbuf - coding->charbuf;
1728 }
1729
1730 static bool
1731 encode_coding_utf_16 (struct coding_system *coding)
1732 {
1733   bool multibytep = coding->dst_multibyte;
1734   int *charbuf = coding->charbuf;
1735   int *charbuf_end = charbuf + coding->charbuf_used;
1736   unsigned char *dst = coding->destination + coding->produced;
1737   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1738   int safe_room = 8;
1739   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1740   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1741   ptrdiff_t produced_chars = 0;
1742   int c;
1743
1744   if (bom != utf_without_bom)
1745     {
1746       ASSURE_DESTINATION (safe_room);
1747       if (big_endian)
1748         EMIT_TWO_BYTES (0xFE, 0xFF);
1749       else
1750         EMIT_TWO_BYTES (0xFF, 0xFE);
1751       CODING_UTF_16_BOM (coding) = utf_without_bom;
1752     }
1753
1754   while (charbuf < charbuf_end)
1755     {
1756       ASSURE_DESTINATION (safe_room);
1757       c = *charbuf++;
1758       if (c > MAX_UNICODE_CHAR)
1759         c = coding->default_char;
1760
1761       if (c < 0x10000)
1762         {
1763           if (big_endian)
1764             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1765           else
1766             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1767         }
1768       else
1769         {
1770           int c1, c2;
1771
1772           c -= 0x10000;
1773           c1 = (c >> 10) + 0xD800;
1774           c2 = (c & 0x3FF) + 0xDC00;
1775           if (big_endian)
1776             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1777           else
1778             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1779         }
1780     }
1781   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1782   coding->produced = dst - coding->destination;
1783   coding->produced_char += produced_chars;
1784   return 0;
1785 }
1786
1787 \f
1788 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1789
1790 /* Emacs' internal format for representation of multiple character
1791    sets is a kind of multi-byte encoding, i.e. characters are
1792    represented by variable-length sequences of one-byte codes.
1793
1794    ASCII characters and control characters (e.g. `tab', `newline') are
1795    represented by one-byte sequences which are their ASCII codes, in
1796    the range 0x00 through 0x7F.
1797
1798    8-bit characters of the range 0x80..0x9F are represented by
1799    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1800    code + 0x20).
1801
1802    8-bit characters of the range 0xA0..0xFF are represented by
1803    one-byte sequences which are their 8-bit code.
1804
1805    The other characters are represented by a sequence of `base
1806    leading-code', optional `extended leading-code', and one or two
1807    `position-code's.  The length of the sequence is determined by the
1808    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1809    whereas extended leading-code and position-code take the range 0xA0
1810    through 0xFF.  See `charset.h' for more details about leading-code
1811    and position-code.
1812
1813    --- CODE RANGE of Emacs' internal format ---
1814    character set        range
1815    -------------        -----
1816    ascii                0x00..0x7F
1817    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1818    eight-bit-graphic    0xA0..0xFF
1819    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1820    ---------------------------------------------
1821
1822    As this is the internal character representation, the format is
1823    usually not used externally (i.e. in a file or in a data sent to a
1824    process).  But, it is possible to have a text externally in this
1825    format (i.e. by encoding by the coding system `emacs-mule').
1826
1827    In that case, a sequence of one-byte codes has a slightly different
1828    form.
1829
1830    At first, all characters in eight-bit-control are represented by
1831    one-byte sequences which are their 8-bit code.
1832
1833    Next, character composition data are represented by the byte
1834    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1835    where,
1836         METHOD is 0xF2 plus one of composition method (enum
1837         composition_method),
1838
1839         BYTES is 0xA0 plus a byte length of this composition data,
1840
1841         CHARS is 0xA0 plus a number of characters composed by this
1842         data,
1843
1844         COMPONENTs are characters of multibyte form or composition
1845         rules encoded by two-byte of ASCII codes.
1846
1847    In addition, for backward compatibility, the following formats are
1848    also recognized as composition data on decoding.
1849
1850    0x80 MSEQ ...
1851    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1852
1853    Here,
1854         MSEQ is a multibyte form but in this special format:
1855           ASCII: 0xA0 ASCII_CODE+0x80,
1856           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1857         RULE is a one byte code of the range 0xA0..0xF0 that
1858         represents a composition rule.
1859   */
1860
/* Table indexed by a byte value.  The functions below treat
   emacs_mule_bytes[C] as the total byte length of the emacs-mule
   sequence that starts with byte C: emacs_mule_char accepts the values
   1 through 4 and aborts on anything else.  Where the table is filled
   in is not visible in this part of the file.  */
1861 char emacs_mule_bytes[256];
1862
1863
1864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1865    Return true if a text is encoded in 'emacs-mule'.

        The scan starts just after the leading ASCII run of CODING's
        source.  As soon as a byte sequence that cannot be emacs-mule is
        seen, CATEGORY_MASK_EMACS_MULE is added to DETECT_INFO->rejected
        and 0 is returned.  The same happens when the source ends in the
        middle of a sequence and this is the last block.  Otherwise 1 is
        returned at the end of the source, and DETECT_INFO->found gains
        CATEGORY_MASK_EMACS_MULE only if at least one composition or
        multibyte sequence was actually seen.  */
1866
1867 static bool
1868 detect_coding_emacs_mule (struct coding_system *coding,
1869                           struct coding_detection_info *detect_info)
1870 {
1871   const unsigned char *src = coding->source, *src_base;
1872   const unsigned char *src_end = coding->source + coding->src_bytes;
1873   bool multibytep = coding->src_multibyte;
1874   ptrdiff_t consumed_chars = 0;
1875   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1876   int found = 0;
1877
1878   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1879   /* A coding system of this category is always ASCII compatible.  */
1880   src += coding->head_ascii;
1881
       /* Leaving this loop with `break' means rejection.  The only other
          exit is the `no_more_source' label, which nothing in this
          function jumps to explicitly, so it must be reached through
          ONE_MORE_BYTE (defined elsewhere).  */
1882   while (1)
1883     {
           /* Start of the sequence now being examined.  */
1884       src_base = src;
1885       ONE_MORE_BYTE (c);
           /* A negative value is skipped without judging it.  */
1886       if (c < 0)
1887         continue;
1888       if (c == 0x80)
1889         {
1890           /* Perhaps the start of composite character.  We simply skip
1891              it because analyzing it is too heavy for detecting.  But,
1892              at least, we check that the composite character
1893              consists of more than 4 bytes.  */
1894           const unsigned char *src_start;
1895
1896         repeat:
1897           src_start = src;
               /* Skip bytes until one below 0xA0 is read; that byte stays
                  in C and is counted in the length test below.  */
1898           do
1899             {
1900               ONE_MORE_BYTE (c);
1901             }
1902           while (c >= 0xA0);
1903
1904           if (src - src_start <= 4)
1905             break;
1906           found = CATEGORY_MASK_EMACS_MULE;
               /* The terminating byte starts another composition.  */
1907           if (c == 0x80)
1908             goto repeat;
               /* Otherwise C, the byte that ended the composition, is
                  examined by the checks below.  */
1909         }
1910
1911       if (c < 0x80)
1912         {
               /* ESC, SI and SO cause rejection.  */
1913           if (c < 0x20
1914               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1915             break;
1916         }
1917       else
1918         {
               /* Number of bytes expected after the leading byte C.  */
1919           int more_bytes = emacs_mule_bytes[c] - 1;
1920
               /* Every following byte must be 0xA0 or above.  */
1921           while (more_bytes > 0)
1922             {
1923               ONE_MORE_BYTE (c);
1924               if (c < 0xA0)
1925                 {
1926                   src--;        /* Unread the last byte.  */
1927                   break;
1928                 }
1929               more_bytes--;
1930             }
               /* Non-zero here means the sequence was cut short, or the
                  table gave no positive length for the leading byte.  */
1931           if (more_bytes != 0)
1932             break;
1933           found = CATEGORY_MASK_EMACS_MULE;
1934         }
1935     }
1936   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1937   return 0;
1938
1939  no_more_source:
       /* SRC_BASE < SRC means the source ended inside a sequence; that is
          an error only when no further block will follow.  */
1940   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1941     {
1942       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1943       return 0;
1944     }
1945   detect_info->found |= found;
1946   return 1;
1947 }
1948
1949
1950 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1951    character.  If CMP_STATUS indicates that we must expect MSEQ or
1952    RULE described above, decode it and return the negative value of
1953    the decoded character or rule.  If an invalid byte is found, return
1954    -1.  If SRC is too short, return -2.

        Unless -1 or -2 is returned, *NBYTES receives the number of
        bytes SRC advanced and *NCHARS the value counted in
        consumed_chars.  *ID, when ID is non-null, receives the charset
        id of a decoded character; it is not written when a RULE is
        returned.  */
1955
1956 static int
1957 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1958                  int *nbytes, int *nchars, int *id,
1959                  struct composition_status *cmp_status)
1960 {
1961   const unsigned char *src_end = coding->source + coding->src_bytes;
1962   const unsigned char *src_base = src;
1963   bool multibytep = coding->src_multibyte;
1964   int charset_ID;
1965   unsigned code;
1966   int c;
1967   ptrdiff_t consumed_chars = 0;
       /* Set when the sequence is an old-style composition component
          (MSEQ), in which case the result is returned negated.  */
1968   bool mseq_found = 0;
1969
1970   ONE_MORE_BYTE (c);
1971   if (c < 0)
1972     {
           /* A negative value from ONE_MORE_BYTE is negated and returned
              without charset decoding.  */
1973       c = -c;
1974       charset_ID = emacs_mule_charset[0];
1975     }
1976   else
1977     {
1978       if (c >= 0xA0)
1979         {
               /* A first byte of 0xA0 or above is accepted only inside
                  an old-form composition.  */
1980           if (cmp_status->state != COMPOSING_NO
1981               && cmp_status->old_form)
1982             {
1983               if (cmp_status->state == COMPOSING_CHAR)
1984                 {
                       /* MSEQ: either 0xA0 followed by ASCII_CODE + 0x80,
                          or a leading code with 0x20 added.  Undo the
                          offset so C can be looked up below.  */
1985                   if (c == 0xA0)
1986                     {
1987                       ONE_MORE_BYTE (c);
1988                       c -= 0x80;
1989                       if (c < 0)
1990                         goto invalid_code;
1991                     }
1992                   else
1993                     c -= 0x20;
1994                   mseq_found = 1;
1995                 }
1996               else
1997                 {
                       /* Any other composing state: hand the byte back
                          negated, as a rule.  */
1998                   *nbytes = src - src_base;
1999                   *nchars = consumed_chars;
2000                   return -c;
2001                 }
2002             }
2003           else
2004             goto invalid_code;
2005         }
2006
           /* Dispatch on the total length of the sequence led by C.  */
2007       switch (emacs_mule_bytes[c])
2008         {
2009         case 2:
               /* Leading code plus one position code.  */
2010           if ((charset_ID = emacs_mule_charset[c]) < 0)
2011             goto invalid_code;
2012           ONE_MORE_BYTE (c);
2013           if (c < 0xA0)
2014             goto invalid_code;
2015           code = c & 0x7F;
2016           break;
2017
2018         case 3:
2019           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2020               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2021             {
                   /* Private leading code: the next byte selects the
                      charset, then comes one position code.  */
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2024                 goto invalid_code;
2025               ONE_MORE_BYTE (c);
2026               if (c < 0xA0)
2027                 goto invalid_code;
2028               code = c & 0x7F;
2029             }
2030           else
2031             {
                   /* Leading code plus two position codes.  */
2032               if ((charset_ID = emacs_mule_charset[c]) < 0)
2033                 goto invalid_code;
2034               ONE_MORE_BYTE (c);
2035               if (c < 0xA0)
2036                 goto invalid_code;
2037               code = (c & 0x7F) << 8;
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 goto invalid_code;
2041               code |= c & 0x7F;
2042             }
2043           break;
2044
2045         case 4:
               /* The byte after the leading code selects the charset;
                  two position codes follow.  */
2046           ONE_MORE_BYTE (c);
2047           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2048             goto invalid_code;
2049           ONE_MORE_BYTE (c);
2050           if (c < 0xA0)
2051             goto invalid_code;
2052           code = (c & 0x7F) << 8;
2053           ONE_MORE_BYTE (c);
2054           if (c < 0xA0)
2055             goto invalid_code;
2056           code |= c & 0x7F;
2057           break;
2058
2059         case 1:
               /* The byte is the code itself.  */
2060           code = c;
2061           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2062           break;
2063
2064         default:
2065           emacs_abort ();
2066         }
           /* Store the decoded character in C; a negative C is treated
              as a failure.  */
2067       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2068                           CHARSET_FROM_ID (charset_ID), code, c);
2069       if (c < 0)
2070         goto invalid_code;
2071     }
2072   *nbytes = src - src_base;
2073   *nchars = consumed_chars;
2074   if (id)
2075     *id = charset_ID;
2076   return (mseq_found ? -c : c);
2077
2078  no_more_source:
2079   return -2;
2080
2081  invalid_code:
2082   return -1;
2083 }
2084
2085
2086 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2087
2088 /* Handle these composition sequences ('|': the end of header elements,
2089    BYTES and CHARS >= 0xA0):
2090
2091    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2092    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2093    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2094
2095    and these old forms:
2096
2097    (4) relative composition: 0x80 | MSEQ ... MSEQ
2098    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2099
2100    When the starter 0x80 and the following header elements are found,
2101    this annotation header is produced.
2102
2103         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2104
2105    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2106    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107
2108    Then, upon reading the following elements, these codes are produced
2109    until the composition end is found:
2110
2111    (1) CHAR ... CHAR
2112    (2) ALT ... ALT CHAR ... CHAR
2113    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2114    (4) CHAR ... CHAR
2115    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2116
2117    When the composition end is found, LENGTH and NCHARS in the
2118    annotation header are updated as below:
2119
2120    (1) LENGTH: unchanged, NCHARS: unchanged
2121    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2122    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2124    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2125
2126    If an error is found while composing, the annotation header is
2127    changed to the original composition header (plus filler -1s) as
2128    below:
2129
2130    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2131    (5)          [ 0x80 0xFF -1 -1 -1 ]
2132
2133    and the sequence [ -2 DECODED-RULE ] is changed to the original
2134    byte sequence as below:
2135         o the original byte sequence is B: [ B -1 ]
2136         o the original byte sequence is B1 B2: [ B1 B2 ]
2137
2138    Most of the routines are implemented by macros because many
2139    variables and labels in the caller decode_coding_emacs_mule must be
2140    accessible, and they are usually called just once (thus doesn't
2141    increase the size of compiled object).  */
2142
2143 /* Decode a composition rule represented by C as a component of
2144    composition sequence of Emacs 20 style.  Set RULE to the decoded
2145    rule.

        C is modified in place: 0xA0 is subtracted from it.  The result
        must lie in 0..80 and is split into GREF = C / 9 and
        NREF = C % 9; a reference value of 4 is replaced by 10 before
        the two are combined.  An out-of-range value jumps to the
        `invalid_code' label of the function that uses this macro.  */
2146
2147 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2148   do {                                                  \
2149     int gref, nref;                                     \
2150                                                         \
2151     c -= 0xA0;                                          \
2152     if (c < 0 || c >= 81)                               \
2153       goto invalid_code;                                \
2154     gref = c / 9, nref = c % 9;                         \
2155     if (gref == 4) gref = 10;                           \
2156     if (nref == 4) nref = 10;                           \
2157     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2158   } while (0)
2159
2160
2161 /* Decode a composition rule represented by C and the following byte
2162    at SRC as a component of composition sequence of Emacs 21 style.
2163    Set RULE to the decoded rule.

        GREF is C - 0x20.  The next byte is then read into C with
        ONE_MORE_BYTE, so C is overwritten, and NREF is that byte minus
        0x20.  Each of the two values must lie in 0..80; otherwise
        control jumps to the `invalid_code' label of the function that
        uses this macro.  */
2164
2165 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2166   do {                                                  \
2167     int gref, nref;                                     \
2168                                                         \
2169     gref = c - 0x20;                                    \
2170     if (gref < 0 || gref >= 81)                         \
2171       goto invalid_code;                                \
2172     ONE_MORE_BYTE (c);                                  \
2173     nref = c - 0x20;                                    \
2174     if (nref < 0 || nref >= 81)                         \
2175       goto invalid_code;                                \
2176     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2177   } while (0)
2178
2179
2180 /* Start of Emacs 21 style format.  On entry the caller's variable C
2181    already holds the METHOD byte; the BYTES and CHARS bytes are read
2182    from SRC here.  The decoded values are METHOD - 0xF2, BYTES - 0xA0
2183    and CHARS - 0xA0, where the decoded BYTES is the byte length of
        this composition information and the decoded CHARS is the number
        of characters composed by this composition.

        The decoded byte length must be at least 3, and exactly 4 for a
        relative composition.  The decoded character count must be
        positive and below MAX_COMPOSITION_COMPONENTS.  Any violation
        jumps to the caller's `invalid_code' label.  On success
        *CMP_STATUS is set up for a new-form composition and an
        annotation header is added to CHARBUF.  */
2184
2185 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2186   do {                                                                  \
2187     enum composition_method method = c - 0xF2;                          \
2188     int nbytes, nchars;                                                 \
2189                                                                         \
2190     ONE_MORE_BYTE (c);                                                  \
2191     if (c < 0)                                                          \
2192       goto invalid_code;                                                \
2193     nbytes = c - 0xA0;                                                  \
2194     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2195       goto invalid_code;                                                \
2196     ONE_MORE_BYTE (c);                                                  \
2197     nchars = c - 0xA0;                                                  \
2198     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2199       goto invalid_code;                                                \
2200     cmp_status->old_form = 0;                                           \
2201     cmp_status->method = method;                                        \
2202     if (method == COMPOSITION_RELATIVE)                                 \
2203       cmp_status->state = COMPOSING_CHAR;                               \
2204     else                                                                \
2205       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2206     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2207     cmp_status->nchars = nchars;                                        \
2208     cmp_status->ncomps = nbytes - 4;                                    \
2209     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2210   } while (0)
2211
2212
2213 /* Start of Emacs 20 style format for relative composition.
        Marks *CMP_STATUS as an old-form COMPOSITION_RELATIVE composition
        in state COMPOSING_CHAR with zero characters and components so
        far, and adds an annotation header to CHARBUF whose character
        and byte counts are both 0.  */
2214
2215 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2216   do {                                                          \
2217     cmp_status->old_form = 1;                                   \
2218     cmp_status->method = COMPOSITION_RELATIVE;                  \
2219     cmp_status->state = COMPOSING_CHAR;                         \
2220     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2221     cmp_status->nchars = cmp_status->ncomps = 0;                \
2222     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2223   } while (0)
2224
2225
2226 /* Start of Emacs 20 style format for rule-base composition.
        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
        the recorded method is COMPOSITION_WITH_RULE.  */
2227
2228 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2229   do {                                                          \
2230     cmp_status->old_form = 1;                                   \
2231     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2232     cmp_status->state = COMPOSING_CHAR;                         \
2233     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2234     cmp_status->nchars = cmp_status->ncomps = 0;                \
2235     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2236   } while (0)
2237
2238
/* Begin a composition; used right after the starter byte 0x80 has
   been read.  The next byte is read into C and selects the format:

     - a byte whose distance from 0xF2 lies between COMPOSITION_RELATIVE
       and COMPOSITION_WITH_RULE_ALTCHARS starts an Emacs 21 style
       header;
     - any other byte in 0xA0..0xBF starts an Emacs 20 style relative
       composition; SRC is then moved back so that the byte is parsed
       again as the first component;
     - 0xFF starts an Emacs 20 style rule-base composition;
     - anything else, including a negative value from ONE_MORE_BYTE,
       jumps to the caller's `invalid_code' label.  */
2239 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2240   do {                                                  \
2241     const unsigned char *current_src = src;             \
2242                                                         \
2243     ONE_MORE_BYTE (c);                                  \
2244     if (c < 0)                                          \
2245       goto invalid_code;                                \
2246     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2247         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2248       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2249     else if (c < 0xA0)                                  \
2250       goto invalid_code;                                \
2251     else if (c < 0xC0)                                  \
2252       {                                                 \
2253         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2254         /* Re-read C as a composition component.  */    \
2255         src = current_src;                              \
2256       }                                                 \
2257     else if (c == 0xFF)                                 \
2258       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2259     else                                                \
2260       goto invalid_code;                                \
2261   } while (0)
2262
/* Close a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition the number of characters counted so far is stored in
   header element 2.  For a new-form composition whose method is beyond
   COMPOSITION_RELATIVE, header element 0 is replaced by header
   element 2 minus the accumulated length.  Finally the state is reset
   to COMPOSING_NO.  */
2263 #define EMACS_MULE_COMPOSITION_END()                            \
2264   do {                                                          \
2265     int idx = - cmp_status->length;                             \
2266                                                                 \
2267     if (cmp_status->old_form)                                   \
2268       charbuf[idx + 2] = cmp_status->nchars;                    \
2269     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2270       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2271     cmp_status->state = COMPOSING_NO;                           \
2272   } while (0)
2273
2274
2275 static int
2276 emacs_mule_finish_composition (int *charbuf,
2277                                struct composition_status *cmp_status)
2278 {
2279   int idx = - cmp_status->length;
2280   int new_chars;
2281
2282   if (cmp_status->old_form && cmp_status->nchars > 0)
2283     {
2284       charbuf[idx + 2] = cmp_status->nchars;
2285       new_chars = 0;
2286       if (cmp_status->method == COMPOSITION_WITH_RULE
2287           && cmp_status->state == COMPOSING_CHAR)
2288         {
2289           /* The last rule was invalid.  */
2290           int rule = charbuf[-1] + 0xA0;
2291
2292           charbuf[-2] = BYTE8_TO_CHAR (rule);
2293           charbuf[-1] = -1;
2294           new_chars = 1;
2295         }
2296     }
2297   else
2298     {
2299       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2300
2301       if (cmp_status->method == COMPOSITION_WITH_RULE)
2302         {
2303           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2304           charbuf[idx++] = -3;
2305           charbuf[idx++] = 0;
2306           new_chars = 1;
2307         }
2308       else
2309         {
2310           int nchars = charbuf[idx + 1] + 0xA0;
2311           int nbytes = charbuf[idx + 2] + 0xA0;
2312
2313           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2314           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2316           charbuf[idx++] = -1;
2317           new_chars = 4;
2318         }
2319     }
2320   cmp_status->state = COMPOSING_NO;
2321   return new_chars;
2322 }
2323
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and add the value it returns to the
   caller's CHAR_OFFSET.  */
2324 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2325   do {                                                                    \
2326     if (cmp_status->state != COMPOSING_NO)                                \
2327       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2328   } while (0)
2329
2330
2331 static void
2332 decode_coding_emacs_mule (struct coding_system *coding)
2333 {
2334   const unsigned char *src = coding->source + coding->consumed;
2335   const unsigned char *src_end = coding->source + coding->src_bytes;
2336   const unsigned char *src_base;
2337   int *charbuf = coding->charbuf + coding->charbuf_used;
2338   /* We may produce two annotations (charset and composition) in one
2339      loop and one more charset annotation at the end.  */
2340   int *charbuf_end
2341     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2342       /* We can produce up to 2 characters in a loop.  */
2343       - 1;
2344   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2345   bool multibytep = coding->src_multibyte;
2346   ptrdiff_t char_offset = coding->produced_char;
2347   ptrdiff_t last_offset = char_offset;
2348   int last_id = charset_ascii;
2349   bool eol_dos
2350     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2351   int byte_after_cr = -1;
2352   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2353
2354   if (cmp_status->state != COMPOSING_NO)
2355     {
2356       int i;
2357
2358       if (charbuf_end - charbuf < cmp_status->length)
2359         emacs_abort ();
2360       for (i = 0; i < cmp_status->length; i++)
2361         *charbuf++ = cmp_status->carryover[i];
2362       coding->annotated = 1;
2363     }
2364
2365   while (1)
2366     {
2367       int c, id IF_LINT (= 0);
2368
2369       src_base = src;
2370       consumed_chars_base = consumed_chars;
2371
2372       if (charbuf >= charbuf_end)
2373         {
2374           if (byte_after_cr >= 0)
2375             src_base--;
2376           break;
2377         }
2378
2379       if (byte_after_cr >= 0)
2380         c = byte_after_cr, byte_after_cr = -1;
2381       else
2382         ONE_MORE_BYTE (c);
2383
2384       if (c < 0 || c == 0x80)
2385         {
2386           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2387           if (c < 0)
2388             {
2389               *charbuf++ = -c;
2390               char_offset++;
2391             }
2392           else
2393             DECODE_EMACS_MULE_COMPOSITION_START ();
2394           continue;
2395         }
2396
2397       if (c < 0x80)
2398         {
2399           if (eol_dos && c == '\r')
2400             ONE_MORE_BYTE (byte_after_cr);
2401           id = charset_ascii;
2402           if (cmp_status->state != COMPOSING_NO)
2403             {
2404               if (cmp_status->old_form)
2405                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2406               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2407                 cmp_status->ncomps--;
2408             }
2409         }
2410       else
2411         {
2412           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2413           /* emacs_mule_char can load a charset map from a file, which
2414              allocates a large structure and might cause buffer text
2415              to be relocated as result.  Thus, we need to remember the
2416              original pointer to buffer text, and fix up all related
2417              pointers after the call.  */
2418           const unsigned char *orig = coding->source;
2419           ptrdiff_t offset;
2420
2421           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2422                                cmp_status);
2423           offset = coding->source - orig;
2424           if (offset)
2425             {
2426               src += offset;
2427               src_base += offset;
2428               src_end += offset;
2429             }
2430           if (c < 0)
2431             {
2432               if (c == -1)
2433                 goto invalid_code;
2434               if (c == -2)
2435                 break;
2436             }
2437           src = src_base + nbytes;
2438           consumed_chars = consumed_chars_base + nchars;
2439           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2440             cmp_status->ncomps -= nchars;
2441         }
2442
2443       /* Now if C >= 0, we found a normally encoded character, if C <
2444          0, we found an old-style composition component character or
2445          rule.  */
2446
2447       if (cmp_status->state == COMPOSING_NO)
2448         {
2449           if (last_id != id)
2450             {
2451               if (last_id != charset_ascii)
2452                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2453                                   last_id);
2454               last_id = id;
2455               last_offset = char_offset;
2456             }
2457           *charbuf++ = c;
2458           char_offset++;
2459         }
2460       else if (cmp_status->state == COMPOSING_CHAR)
2461         {
2462           if (cmp_status->old_form)
2463             {
2464               if (c >= 0)
2465                 {
2466                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2467                   *charbuf++ = c;
2468                   char_offset++;
2469                 }
2470               else
2471                 {
2472                   *charbuf++ = -c;
2473                   cmp_status->nchars++;
2474                   cmp_status->length++;
2475                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2476                     EMACS_MULE_COMPOSITION_END ();
2477                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2478                     cmp_status->state = COMPOSING_RULE;
2479                 }
2480             }
2481           else
2482             {
2483               *charbuf++ = c;
2484               cmp_status->length++;
2485               cmp_status->nchars--;
2486               if (cmp_status->nchars == 0)
2487                 EMACS_MULE_COMPOSITION_END ();
2488             }
2489         }
2490       else if (cmp_status->state == COMPOSING_RULE)
2491         {
2492           int rule;
2493
2494           if (c >= 0)
2495             {
2496               EMACS_MULE_COMPOSITION_END ();
2497               *charbuf++ = c;
2498               char_offset++;
2499             }
2500           else
2501             {
2502               c = -c;
2503               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2504               if (rule < 0)
2505                 goto invalid_code;
2506               *charbuf++ = -2;
2507               *charbuf++ = rule;
2508               cmp_status->length += 2;
2509               cmp_status->state = COMPOSING_CHAR;
2510             }
2511         }
2512       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2513         {
2514           *charbuf++ = c;
2515           cmp_status->length++;
2516           if (cmp_status->ncomps == 0)
2517             cmp_status->state = COMPOSING_CHAR;
2518           else if (cmp_status->ncomps > 0)
2519             {
2520               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2521                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2522             }
2523           else
2524             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525         }
2526       else                      /* COMPOSING_COMPONENT_RULE */
2527         {
2528           int rule;
2529
2530           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2531           if (rule < 0)
2532             goto invalid_code;
2533           *charbuf++ = -2;
2534           *charbuf++ = rule;
2535           cmp_status->length += 2;
2536           cmp_status->ncomps--;
2537           if (cmp_status->ncomps > 0)
2538             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2539           else
2540             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2541         }
2542       continue;
2543
2544     invalid_code:
2545       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2546       src = src_base;
2547       consumed_chars = consumed_chars_base;
2548       ONE_MORE_BYTE (c);
2549       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2550       char_offset++;
2551     }
2552
2553  no_more_source:
2554   if (cmp_status->state != COMPOSING_NO)
2555     {
2556       if (coding->mode & CODING_MODE_LAST_BLOCK)
2557         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2558       else
2559         {
2560           int i;
2561
2562           charbuf -= cmp_status->length;
2563           for (i = 0; i < cmp_status->length; i++)
2564             cmp_status->carryover[i] = charbuf[i];
2565         }
2566     }
2567   if (last_id != charset_ascii)
2568     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2569   coding->consumed_char += consumed_chars_base;
2570   coding->consumed = src_base - coding->source;
2571   coding->charbuf_used = charbuf - coding->charbuf;
2572 }
2573
2574
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the emacs-mule charset whose id is ID.

   An ID below 0xA0 is its own leading code, and CODES[1] is set to 0;
   the encoder emits CODES[1] only when it is nonzero.  Any other ID
   is stored in CODES[1] and preceded by a prefix byte selected by
   range: 0x9A for IDs below 0xE0, 0x9B for IDs below 0xF0, 0x9C for
   IDs below 0xF5, and 0x9D for everything else.

   The expansion has no trailing semicolon, so that
   `EMACS_MULE_LEADING_CODES (id, codes);' is exactly one statement
   and may be used as the body of an if/else.  ID and CODES are
   evaluated more than once, so they must not have side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2588
2589
/* Encode the characters held in CODING->charbuf (CODING->charbuf_used
   entries) into the emacs-mule byte format, writing the bytes to
   CODING->destination after the CODING->produced bytes already there.

   Non-negative charbuf entries are characters.  A negative entry
   starts an annotation; a charset annotation selects the charset to
   prefer for the characters that follow.

   Before returning, the result is recorded as CODING_RESULT_SUCCESS
   and CODING->produced_char and CODING->produced are updated.  The
   function always returns 0.  */
2590 static bool
2591 encode_coding_emacs_mule (struct coding_system *coding)
2592 {
     /* NOTE(review): no statement visible in this function reads
        multibytep or increments produced_chars; presumably the
        ASSURE_DESTINATION / EMIT_* macros refer to these (and possibly
        other) locals by name.  Confirm against the macro definitions
        before renaming or removing any local.  */
2593   bool multibytep = coding->dst_multibyte;
2594   int *charbuf = coding->charbuf;
2595   int *charbuf_end = charbuf + coding->charbuf_used;
2596   unsigned char *dst = coding->destination + coding->produced;
2597   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2598   int safe_room = 8;
2599   ptrdiff_t produced_chars = 0;
2600   Lisp_Object attrs, charset_list;
2601   int c;
     /* Charset id requested by the latest charset annotation, or -1
        when there is no usable preference.  */
2602   int preferred_charset_id = -1;
2603
2604   CODING_GET_INFO (coding, attrs, charset_list);
     /* This encoder always works with Vemacs_mule_charset_list; store
        it in ATTRS if the coding system carried a different list.  */
2605   if (! EQ (charset_list, Vemacs_mule_charset_list))
2606     {
2607       charset_list = Vemacs_mule_charset_list;
2608       ASET (attrs, coding_attr_charset_list, charset_list);
2609     }
2610
2611   while (charbuf < charbuf_end)
2612     {
2613       ASSURE_DESTINATION (safe_room);
2614       c = *charbuf++;
2615
2616       if (c < 0)
2617         {
               /* C is the negated length of the annotation, counted
                  from the length word itself; CHARBUF now points at
                  the word after it, which identifies the annotation
                  type.  */
2618           /* Handle an annotation.  */
2619           switch (*charbuf)
2620             {
2621             case CODING_ANNOTATE_COMPOSITION_MASK:
2622               /* Not yet implemented.  */
2623               break;
2624             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the index is relative to the
                      annotation-type word, since the length word has
                      already been consumed.  Confirm against the
                      layout written by ADD_CHARSET_DATA that the
                      charset id really is at offset 3 here.  */
2625               preferred_charset_id = charbuf[3];
                   /* Ignore a preference for a charset that is not in
                      CHARSET_LIST.  */
2626               if (preferred_charset_id >= 0
2627                   && NILP (Fmemq (make_number (preferred_charset_id),
2628                                   charset_list)))
2629                 preferred_charset_id = -1;
2630               break;
2631             default:
2632               emacs_abort ();
2633             }
               /* Skip the rest of the annotation.  */
2634           charbuf += -c - 1;
2635           continue;
2636         }
2637
2638       if (ASCII_CHAR_P (c))
2639         EMIT_ONE_ASCII_BYTE (c);
2640       else if (CHAR_BYTE8_P (c))
2641         {
2642           c = CHAR_TO_BYTE8 (c);
2643           EMIT_ONE_BYTE (c);
2644         }
2645       else
2646         {
2647           struct charset *charset;
2648           unsigned code;
2649           int dimension;
2650           int emacs_mule_id;
2651           unsigned char leading_codes[2];
2652
               /* Try the preferred charset first, if there is one;
                  otherwise, or if it does not contain C, search
                  CHARSET_LIST.  */
2653           if (preferred_charset_id >= 0)
2654             {
2655               bool result;
2656
2657               charset = CHARSET_FROM_ID (preferred_charset_id);
2658               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2659               if (result)
2660                 code = ENCODE_CHAR (charset, c);
2661               else
2662                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2663                                      &code, charset);
2664             }
2665           else
2666             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2667                                  &code, charset);
               /* No charset was found for C: substitute the coding
                  system's default character.  */
2668           if (! charset)
2669             {
2670               c = coding->default_char;
2671               if (ASCII_CHAR_P (c))
2672                 {
2673                   EMIT_ONE_ASCII_BYTE (c);
2674                   continue;
2675                 }
2676               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2677                                    &code, charset);
                   /* NOTE(review): CHARSET is dereferenced below
                      without a second null check; confirm the default
                      character is always encodable.  */
2678             }
2679           dimension = CHARSET_DIMENSION (charset);
2680           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
               /* Emit one or two leading codes, then the code point
                  with the high bit of each byte set: one byte for a
                  dimension-1 charset, two bytes otherwise.  */
2681           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2682           EMIT_ONE_BYTE (leading_codes[0]);
2683           if (leading_codes[1])
2684             EMIT_ONE_BYTE (leading_codes[1]);
2685           if (dimension == 1)
2686             EMIT_ONE_BYTE (code | 0x80);
2687           else
2688             {
2689               code |= 0x8080;
2690               EMIT_ONE_BYTE (code >> 8);
2691               EMIT_ONE_BYTE (code & 0xFF);
2692             }
2693         }
2694     }
2695   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2696   coding->produced_char += produced_chars;
2697   coding->produced = dst - coding->destination;
2698   return 0;
2699 }
2700
2701 \f
2702 /*** 7. ISO2022 handlers ***/
2703
2704 /* The following note describes the coding system ISO2022 briefly.
2705    Since the intention of this note is to help understand the
2706    functions in this file, some parts are NOT ACCURATE or are OVERLY
2707    SIMPLIFIED.  For thorough understanding, please refer to the
2708    original document of ISO2022.  This is equivalent to the standard
2709    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2710
2711    ISO2022 provides many mechanisms to encode several character sets
2712    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2713    is encoded using bytes less than 128.  This may make the encoded
2714    text a little bit longer, but the text passes more easily through
2715    several types of gateway, some of which strip off the MSB (Most
2716    Significant Bit).
2717
2718    There are two kinds of character sets: control character sets and
2719    graphic character sets.  The former contain control characters such
2720    as `newline' and `escape' to provide control functions (control
2721    functions are also provided by escape sequences).  The latter
2722    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2723    two control character sets and many graphic character sets.
2724
2725    Graphic character sets are classified into one of the following
2726    four classes, according to the number of bytes (DIMENSION) and
2727    number of characters in one dimension (CHARS) of the set:
2728    - DIMENSION1_CHARS94
2729    - DIMENSION1_CHARS96
2730    - DIMENSION2_CHARS94
2731    - DIMENSION2_CHARS96
2732
2733    In addition, each character set is assigned an identification tag,
2734    unique for each set, called the "final character" (denoted as <F>
2735    hereafter).  The <F> of each character set is decided by ECMA(*)
2736    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2737    (0x30..0x3F are for private use only).
2738
2739    Note (*): ECMA = European Computer Manufacturers Association
2740
2741    Here are examples of graphic character sets [NAME(<F>)]:
2742         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2743         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2744         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2745         o DIMENSION2_CHARS96 -- none for the moment
2746
2747    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2748         C0 [0x00..0x1F] -- control character plane 0
2749         GL [0x20..0x7F] -- graphic character plane 0
2750         C1 [0x80..0x9F] -- control character plane 1
2751         GR [0xA0..0xFF] -- graphic character plane 1
2752
2753    A control character set is directly designated and invoked to C0 or
2754    C1 by an escape sequence.  The most common case is that:
2755    - ISO646's  control character set is designated/invoked to C0, and
2756    - ISO6429's control character set is designated/invoked to C1,
2757    and usually these designations/invocations are omitted in encoded
2758    text.  In a 7-bit environment, only C0 can be used, and a control
2759    character for C1 is encoded by an appropriate escape sequence to
2760    fit into the environment.  All control characters for C1 are
2761    defined to have corresponding escape sequences.
2762
2763    A graphic character set is at first designated to one of four
2764    graphic registers (G0 through G3), then these graphic registers are
2765    invoked to GL or GR.  These designations and invocations can be
2766    done independently.  The most common case is that G0 is invoked to
2767    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2768    these invocations and designations are omitted in encoded text.
2769    In a 7-bit environment, only GL can be used.
2770
2771    When a graphic character set of CHARS94 is invoked to GL, codes
2772    0x20 and 0x7F of the GL area work as control characters SPACE and
2773    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2774    be used.
2775
2776    There are two ways of invocation: locking-shift and single-shift.
2777    With locking-shift, the invocation lasts until the next different
2778    invocation, whereas with single-shift, the invocation affects the
2779    following character only and doesn't affect the locking-shift
2780    state.  Invocations are done by the following control characters or
2781    escape sequences:
2782
2783    ----------------------------------------------------------------------
2784    abbrev  function                  cntrl escape seq   description
2785    ----------------------------------------------------------------------
2786    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2787    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2788    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2789    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2790    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2791    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2792    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2793    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2794    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2795    ----------------------------------------------------------------------
2796    (*) These are not used by any known coding system.
2797
2798    Control characters for these functions are defined by macros
2799    ISO_CODE_XXX in `coding.h'.
2800
2801    Designations are done by the following escape sequences:
2802    ----------------------------------------------------------------------
2803    escape sequence      description
2804    ----------------------------------------------------------------------
2805    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2806    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2807    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2808    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2809    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2810    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2811    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2812    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2813    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2814    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2815    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2816    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2817    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2818    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2819    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2820    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2821    ----------------------------------------------------------------------
2822
2823    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2824    of dimension 1, chars 94, and final character <F>, etc...
2825
2826    Note (*): Although these designations are not allowed in ISO2022,
2827    Emacs accepts them on decoding, and produces them on encoding
2828    CHARS96 character sets in a coding system which is characterized as
2829    7-bit environment, non-locking-shift, and non-single-shift.
2830
2831    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2832    '(' must be omitted.  We refer to this as "short-form" hereafter.
2833
2834    Now you may notice that there are a lot of ways of encoding the
2835    same multilingual text in ISO2022.  Actually, there exist many
2836    coding systems such as Compound Text (used in X11's inter client
2837    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2838    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2839    localized platforms), and all of these are variants of ISO2022.
2840
2841    In addition to the above, Emacs handles two more kinds of escape
2842    sequences: ISO6429's direction specification and Emacs' private
2843    sequence for specifying character composition.
2844
2845    ISO6429's direction specification takes the following form:
2846         o CSI ']'      -- end of the current direction
2847         o CSI '0' ']'  -- end of the current direction
2848         o CSI '1' ']'  -- start of left-to-right text
2849         o CSI '2' ']'  -- start of right-to-left text
2850    The control character CSI (0x9B: control sequence introducer) is
2851    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2852
2853    Character composition specification takes the following form:
2854         o ESC '0' -- start relative composition
2855         o ESC '1' -- end composition
2856         o ESC '2' -- start rule-base composition (*)
2857         o ESC '3' -- start relative composition with alternate chars  (**)
2858         o ESC '4' -- start rule-base composition with alternate chars  (**)
2859   Since these are not standard escape sequences of any ISO standard,
2860   the use of them with these meanings is restricted to Emacs only.
2861
2862   (*) This form is used only in Emacs 20.7 and older versions,
2863   but newer versions can safely decode it.
2864   (**) This form is used only in Emacs 21.1 and newer versions,
2865   and older versions can't decode it.
2866
2867   Here's a list of example usages of these composition escape
2868   sequences (categorized by `enum composition_method').
2869
2870   COMPOSITION_RELATIVE:
2871         ESC 0 CHAR [ CHAR ] ESC 1
2872   COMPOSITION_WITH_RULE:
2873         ESC 2 CHAR [ RULE CHAR ] ESC 1
2874   COMPOSITION_WITH_ALTCHARS:
2875         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2876   COMPOSITION_WITH_RULE_ALTCHARS:
2877         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2878
/* Table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value and giving the ISO
   2022 code class of each byte; it is not referenced in the nearby
   code, so confirm where it is initialized and used.  */
2879 static enum iso_code_class_type iso_code_class[256];
2880
/* Return true if the charset whose id is ID is usable with CODING,
   i.e. ID is a valid index into CODING->safe_charsets and the entry
   stored there is not 255.  setup_iso_safe_charsets fills that table
   with 255 and then stores a register number for each charset it
   accepts, so 255 marks a charset with no register assigned.

   A negative ID (the "not found" result that callers test for after
   an iso_charset_table lookup) is reported as not safe rather than
   being used as an array index.  ID is evaluated more than once, so
   it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2884
2885 static void
2886 setup_iso_safe_charsets (Lisp_Object attrs)
2887 {
2888   Lisp_Object charset_list, safe_charsets;
2889   Lisp_Object request;
2890   Lisp_Object reg_usage;
2891   Lisp_Object tail;
2892   EMACS_INT reg94, reg96;
2893   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2894   int max_charset_id;
2895
2896   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2897   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2898       && ! EQ (charset_list, Viso_2022_charset_list))
2899     {
2900       charset_list = Viso_2022_charset_list;
2901       ASET (attrs, coding_attr_charset_list, charset_list);
2902       ASET (attrs, coding_attr_safe_charsets, Qnil);
2903     }
2904
2905   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2906     return;
2907
2908   max_charset_id = 0;
2909   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2910     {
2911       int id = XINT (XCAR (tail));
2912       if (max_charset_id < id)
2913         max_charset_id = id;
2914     }
2915
2916   safe_charsets = make_uninit_string (max_charset_id + 1);
2917   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2918   request = AREF (attrs, coding_attr_iso_request);
2919   reg_usage = AREF (attrs, coding_attr_iso_usage);
2920   reg94 = XINT (XCAR (reg_usage));
2921   reg96 = XINT (XCDR (reg_usage));
2922
2923   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2924     {
2925       Lisp_Object id;
2926       Lisp_Object reg;
2927       struct charset *charset;
2928
2929       id = XCAR (tail);
2930       charset = CHARSET_FROM_ID (XINT (id));
2931       reg = Fcdr (Fassq (id, request));
2932       if (! NILP (reg))
2933         SSET (safe_charsets, XINT (id), XINT (reg));
2934       else if (charset->iso_chars_96)
2935         {
2936           if (reg96 < 4)
2937             SSET (safe_charsets, XINT (id), reg96);
2938         }
2939       else
2940         {
2941           if (reg94 < 4)
2942             SSET (safe_charsets, XINT (id), reg94);
2943         }
2944     }
2945   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2946 }
2947
2948
2949 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2950    Return true if a text is encoded in one of ISO-2022 based coding
2951    systems.  */
2952
/* Scan the source bytes of CODING for evidence for or against each of
   the ISO-2022 coding categories, accumulating category masks in the
   locals FOUND and REJECTED.  CATEGORY_MASK_ISO is always added to
   DETECT_INFO->checked.

   The scan loop ends by itself once REJECTED equals
   CATEGORY_MASK_ISO; in that case CATEGORY_MASK_ISO is added to
   DETECT_INFO->rejected and 0 is returned.  At the no_more_source
   label, REJECTED is merged into DETECT_INFO->rejected, the found
   categories that were not rejected are merged into
   DETECT_INFO->found, and 1 is returned.

   NOTE(review): no goto visible in this function targets
   no_more_source, and multibytep and consumed_chars are never read by
   a visible statement; presumably ONE_MORE_BYTE uses those locals and
   jumps to the label when the source is exhausted.  Confirm against
   the macro definition.  */
2953 static bool
2954 detect_coding_iso_2022 (struct coding_system *coding,
2955                         struct coding_detection_info *detect_info)
2956 {
2957   const unsigned char *src = coding->source, *src_base = src;
2958   const unsigned char *src_end = coding->source + coding->src_bytes;
2959   bool multibytep = coding->src_multibyte;
2960   bool single_shifting = 0;
2961   int id;
2962   int c, c1;
2963   ptrdiff_t consumed_chars = 0;
2964   int i;
2965   int rejected = 0;
2966   int found = 0;
     /* -1 while outside a composition sequence; otherwise the number
        of components counted since the composition started.  */
2967   int composition_count = -1;
2968
2969   detect_info->checked |= CATEGORY_MASK_ISO;
2970
     /* For every ISO category that has a coding system (id >= 0),
        cache the length and data of its safe-charsets string in the
        category's coding_system, rebuilding the string first when a
        full-support coding system does not yet use
        Viso_2022_charset_list.  */
2971   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2972     {
2973       struct coding_system *this = &(coding_categories[i]);
2974       Lisp_Object attrs, val;
2975
2976       if (this->id < 0)
2977         continue;
2978       attrs = CODING_ID_ATTRS (this->id);
2979       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2980           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2981         setup_iso_safe_charsets (attrs);
2982       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2983       this->max_charset_id = SCHARS (val) - 1;
2984       this->safe_charsets = SDATA (val);
2985     }
2986
2987   /* A coding system of this category is always ASCII compatible.  */
2988   src += coding->head_ascii;
2989
2990   while (rejected != CATEGORY_MASK_ISO)
2991     {
2992       src_base = src;
2993       ONE_MORE_BYTE (c);
2994       switch (c)
2995         {
2996         case ISO_CODE_ESC:
2997           if (inhibit_iso_escape_detection)
2998             break;
2999           single_shifting = 0;
3000           ONE_MORE_BYTE (c);
3001           if (c == 'N' || c == 'O')
3002             {
3003               /* ESC <Fe> for SS2 or SS3.  */
3004               single_shifting = 1;
3005               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3006             }
3007           else if (c == '1')
3008             {
3009               /* End of composition.  */
3010               if (composition_count < 0
3011                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3012                 /* Invalid */
3013                 break;
3014               composition_count = -1;
3015               found |= CATEGORY_MASK_ISO;
3016             }
               /* '1' was handled above, so this branch sees only
                  '0', '2', '3' and '4'.  */
3017           else if (c >= '0' && c <= '4')
3018             {
3019               /* ESC <Fp> for start/end composition.  */
3020               composition_count = 0;
3021             }
3022           else
3023             {
                   /* Every `break' in the designation parsing below
                      leaves the switch without reaching the
                      SAFE_CHARSET_P checks that follow it.  */
3024               if (c >= '(' && c <= '/')
3025                 {
3026                   /* Designation sequence for a charset of dimension 1.  */
3027                   ONE_MORE_BYTE (c1);
3028                   if (c1 < ' ' || c1 >= 0x80
3029                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3030                     {
3031                       /* Invalid designation sequence.  Just ignore.  */
3032                       if (c1 >= 0x80)
3033                         rejected |= (CATEGORY_MASK_ISO_7BIT
3034                                      | CATEGORY_MASK_ISO_7_ELSE);
3035                       break;
3036                     }
3037                 }
3038               else if (c == '$')
3039                 {
3040                   /* Designation sequence for a charset of dimension 2.  */
3041                   ONE_MORE_BYTE (c);
3042                   if (c >= '@' && c <= 'B')
3043                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other two
                            iso_charset_table lookups in this function,
                            this result is not tested for < 0 before
                            it reaches SAFE_CHARSET_P; confirm these
                            three entries are always populated.  */
3044                     id = iso_charset_table[1][0][c];
3045                   else if (c >= '(' && c <= '/')
3046                     {
3047                       ONE_MORE_BYTE (c1);
3048                       if (c1 < ' ' || c1 >= 0x80
3049                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3050                         {
3051                           /* Invalid designation sequence.  Just ignore.  */
3052                           if (c1 >= 0x80)
3053                             rejected |= (CATEGORY_MASK_ISO_7BIT
3054                                          | CATEGORY_MASK_ISO_7_ELSE);
3055                           break;
3056                         }
3057                     }
3058                   else
3059                     {
3060                       /* Invalid designation sequence.  Just ignore it.  */
3061                       if (c >= 0x80)
3062                         rejected |= (CATEGORY_MASK_ISO_7BIT
3063                                      | CATEGORY_MASK_ISO_7_ELSE);
3064                       break;
3065                     }
3066                 }
3067               else
3068                 {
3069                   /* Invalid escape sequence.  Just ignore it.  */
3070                   if (c >= 0x80)
3071                     rejected |= (CATEGORY_MASK_ISO_7BIT
3072                                  | CATEGORY_MASK_ISO_7_ELSE);
3073                   break;
3074                 }
3075
3076               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories checked below is
                      marked found if the designated charset is safe
                      for its coding system, and rejected otherwise.  */
3077               rejected |= CATEGORY_MASK_ISO_8BIT;
3078               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3079                                   id))
3080                 found |= CATEGORY_MASK_ISO_7;
3081               else
3082                 rejected |= CATEGORY_MASK_ISO_7;
3083               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3084                                   id))
3085                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3086               else
3087                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3088               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3089                                   id))
3090                 found |= CATEGORY_MASK_ISO_7_ELSE;
3091               else
3092                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3093               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3094                                   id))
3095                 found |= CATEGORY_MASK_ISO_8_ELSE;
3096               else
3097                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3098             }
3099           break;
3100
3101         case ISO_CODE_SO:
3102         case ISO_CODE_SI:
3103           /* Locking shift out/in.  */
3104           if (inhibit_iso_escape_detection)
3105             break;
3106           single_shifting = 0;
3107           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3108           break;
3109
3110         case ISO_CODE_CSI:
3111           /* Control sequence introducer.  */
3112           single_shifting = 0;
3113           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3114           found |= CATEGORY_MASK_ISO_8_ELSE;
3115           goto check_extra_latin;
3116
3117         case ISO_CODE_SS2:
3118         case ISO_CODE_SS3:
3119           /* Single shift.   */
3120           if (inhibit_iso_escape_detection)
3121             break;
3122           single_shifting = 0;
3123           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3124           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3125               & CODING_ISO_FLAG_SINGLE_SHIFT)
3126             {
3127               found |= CATEGORY_MASK_ISO_8_1;
3128               single_shifting = 1;
3129             }
3130           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3131               & CODING_ISO_FLAG_SINGLE_SHIFT)
3132             {
3133               found |= CATEGORY_MASK_ISO_8_2;
3134               single_shifting = 1;
3135             }
               /* If neither 8-bit category uses single shifts, treat
                  the byte like any other code in 0x80..0x9F.  */
3136           if (single_shifting)
3137             break;
3138           goto check_extra_latin;
3139
3140         default:
               /* NOTE(review): a negative C is skipped without
                  affecting detection; presumably ONE_MORE_BYTE yields
                  a negative value for input it cannot deliver as a
                  plain byte.  Confirm against the macro.  */
3141           if (c < 0)
3142             continue;
3143           if (c < 0x80)
3144             {
3145               if (composition_count >= 0)
3146                 composition_count++;
3147               single_shifting = 0;
3148               break;
3149             }
3150           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3151           if (c >= 0xA0)
3152             {
3153               found |= CATEGORY_MASK_ISO_8_1;
3154               /* Check the length of succeeding codes of the range
3155                  0xA0..0xFF.  If the byte length is even, we include
3156                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3157                  only when we are not single shifting.  */
3158               if (! single_shifting
3159                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3160                 {
3161                   ptrdiff_t len = 1;
3162                   while (src < src_end)
3163                     {
3164                       src_base = src;
3165                       ONE_MORE_BYTE (c);
3166                       if (c < 0xA0)
3167                         {
                               /* Put back the byte that ended the
                                  run so the outer loop sees it.  */
3168                           src = src_base;
3169                           break;
3170                         }
3171                       len++;
3172                     }
3173
                       /* An odd-length run rejects ISO_8_2 only when
                          it ended before the end of the source.  */
3174                   if (len & 1 && src < src_end)
3175                     {
3176                       rejected |= CATEGORY_MASK_ISO_8_2;
3177                       if (composition_count >= 0)
3178                         composition_count += len;
3179                     }
3180                   else
3181                     {
3182                       found |= CATEGORY_MASK_ISO_8_2;
3183                       if (composition_count >= 0)
3184                         composition_count += len / 2;
3185                     }
3186                 }
3187               break;
3188             }
               /* Reached by falling through for a byte in 0x80..0x9F,
                  and by goto from the CSI and SS2/SS3 cases.  Unless
                  Vlatin_extra_code_table is a vector with a non-nil
                  entry for C, every ISO category is rejected.  */
3189         check_extra_latin:
3190           if (! VECTORP (Vlatin_extra_code_table)
3191               || NILP (AREF (Vlatin_extra_code_table, c)))
3192             {
3193               rejected = CATEGORY_MASK_ISO;
3194               break;
3195             }
3196           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3197               & CODING_ISO_FLAG_LATIN_EXTRA)
3198             found |= CATEGORY_MASK_ISO_8_1;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_8_1;
3201           rejected |= CATEGORY_MASK_ISO_8_2;
3202           break;
3203         }
3204     }
     /* Every ISO category has been rejected.  */
3205   detect_info->rejected |= CATEGORY_MASK_ISO;
3206   return 0;
3207
3208  no_more_source:
3209   detect_info->rejected |= rejected;
3210   detect_info->found |= (found & ~rejected);
3211   return 1;
3212 }
3213
3214
/* Designate a character set to graphic register REG in the ISO-2022
   state of `coding' (a variable of the expanding code).  DIM,
   CHARS_96 and FINAL -- the final byte of the designation escape
   sequence -- select the character set through ISO_CHARSET_TABLE.

   If FINAL is outside '0'..127, or ISO_CHARSET_TABLE yields a
   negative id, or SAFE_CHARSET_P rejects the id for `coding', REG is
   marked with the invalid designation -2 and CHARS_96 is set to -1.

   Otherwise the id is recorded for REG, except that jisx0201-roman is
   recorded as ASCII when CODING_ISO_FLAG_USE_ROMAN is set, and
   jisx0208-1978 as jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
   CHARS_96 is also set to -1 when REG held the invalid designation -2
   before and ASCII is designated to it now.

   CHARS_96 is -1 on exit if the escape sequence should be kept, so it
   must be a modifiable lvalue.  REG, CHARS_96 and FINAL are evaluated
   more than once; do not pass expressions with side effects.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if ((final) < '0' || (final) >= 128                                 \
        || ((id = ISO_CHARSET_TABLE ((dim), (chars_96), (final))) < 0)  \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, (reg)) = -2;                    \
        (chars_96) = -1;                                                \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, (reg));                      \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, (reg)) = id;                        \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      (chars_96) = -1;                                                  \
  } while (0)
3247
3248
3249 /* Handle these composition sequences (ALT: alternate char):
3250
3251    (1) relative composition: ESC 0 CHAR ... ESC 1
3252    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3253    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3254    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3255
3256    When the start sequence (ESC 0/2/3/4) is found, this annotation
3257    header is produced.
3258
3259         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3260
3261    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3262    produced until the end sequence (ESC 1) is found:
3263
3264    (1) CHAR ... CHAR
3265    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3266    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3267    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3268
3269    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3270    annotation header are updated as below:
3271
3272    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3273    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3275    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3276
3277    If an error is found while composing, the annotation header is
3278    changed to:
3279
3280         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3281
3282    and the sequence [ -2 DECODED-RULE ] is changed to the original
3283    byte sequence as below:
3284         o the original byte sequence is B: [ B -1 ]
3285         o the original byte sequence is B1 B2: [ B1 B2 ]
3286    and the sequence [ -1 -1 ] is changed to the original byte
3287    sequence:
3288         [ ESC '0' ]
3289 */
3290
/* Turn a composition rule read from the source into its encoded form
   and store it in RULE, which must be a modifiable lvalue.

   The first byte of the rule is taken from the variable c1 of the
   expanding code.  If c1 - 32 is below 81 the rule is a complete
   old-format rule; otherwise it is in the new format and its second
   byte is fetched with ONE_MORE_BYTE.  New-format results have 0x100
   added so that they can be told apart from old-format ones.

   Control goes to the label invalid_code if c1 is below 32 or if the
   new-format rule fails COMPOSITION_ENCODE_RULE_VALID.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      goto invalid_code;                                                \
    if ((rule) >= 81)           /* new format (after ver.21) */         \
      {                                                                 \
        int next_byte;                                                  \
                                                                        \
        ONE_MORE_BYTE (next_byte);                                      \
        if (! COMPOSITION_ENCODE_RULE_VALID ((rule) - 81, next_byte - 32)) \
          goto invalid_code;                                            \
        /* The added 0x100 distinguishes it from the old format.  */    \
        (rule) = (COMPOSITION_ENCODE_RULE ((rule) - 81, next_byte - 32) \
                  + 0x100);                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int g = (rule) / 9;                                             \
        int n = (rule) % 9;                                             \
                                                                        \
        if (g == 4)                                                     \
          g = 10;                                                       \
        if (n == 4)                                                     \
          n = 10;                                                       \
        (rule) = COMPOSITION_ENCODE_RULE (g, n);                        \
      }                                                                 \
  } while (0)
3319
/* Convert a decoded composition rule RULE back to the byte(s) it was
   read from, undoing DECODE_COMPOSITION_RULE, and store the result in
   charbuf[idx] and charbuf[idx + 1].

   A RULE below 0x100 is in the old format and came from one byte:
   that byte is stored in charbuf[idx], -1 is stored in
   charbuf[idx + 1], and new_chars is incremented by one.  Any other
   RULE is in the new format and came from two bytes: both are stored
   and new_chars is incremented by two.

   The macro uses the variables charbuf, idx and new_chars of the
   expanding code.  RULE is evaluated more than once, always before
   charbuf is written, so it may be charbuf[idx + 1] itself.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int gref = ((rule) % 0x100) / 12;                           \
    int nref = ((rule) % 0x100) % 12;                           \
                                                                \
    if ((rule) < 0x100)         /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3339
3340 /* Finish the current composition as invalid.  */
3341
3342 static int
3343 finish_composition (int *charbuf, struct composition_status *cmp_status)
3344 {
3345   int idx = - cmp_status->length;
3346   int new_chars;
3347
3348   /* Recover the original ESC sequence */
3349   charbuf[idx++] = ISO_CODE_ESC;
3350   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3351                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3352                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3353                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3354                     : '4');
3355   charbuf[idx++] = -2;
3356   charbuf[idx++] = 0;
3357   charbuf[idx++] = -1;
3358   new_chars = cmp_status->nchars;
3359   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3360     for (; idx < 0; idx++)
3361       {
3362         int elt = charbuf[idx];
3363
3364         if (elt == -2)
3365           {
3366             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3367             idx++;
3368           }
3369         else if (elt == -1)
3370           {
3371             charbuf[idx++] = ISO_CODE_ESC;
3372             charbuf[idx] = '0';
3373             new_chars += 2;
3374           }
3375       }
3376   cmp_status->state = COMPOSING_NO;
3377   return new_chars;
3378 }
3379
/* Abandon a composition that is still in progress, if there is one:
   unless cmp_status->state is COMPOSING_NO, call finish_composition
   and add the count it returns to char_offset.  The macro uses the
   variables charbuf, cmp_status and char_offset of the expanding
   code.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3386
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1

   If C1 is '0' and the alternate characters of an altchar or
   alt&rule composition are being read (state COMPOSING_COMPONENT_CHAR
   with method COMPOSITION_WITH_ALTCHARS, or state
   COMPOSING_COMPONENT_RULE with method
   COMPOSITION_WITH_RULE_ALTCHARS), the sequence only ends the ALT
   part: [ -1 -1 ] is stored and the state becomes COMPOSING_CHAR.

   Otherwise any composition still in progress is finished as invalid
   and a new one is started by producing this annotation header now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   The macro uses the variables charbuf, cmp_status, coding and (via
   MAYBE_FINISH_COMPOSITION) char_offset of the expanding code.  C1 is
   evaluated more than once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE           \
                              : (c1) == '2' ? COMPOSITION_WITH_RULE        \
                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS    \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        cmp_status->state                                                  \
          = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);     \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3427
3428
/* Handle composition end sequence ESC 1.

   The sequence is invalid if no composed character has been stored,
   or if it arrives in the wrong state: for COMPOSITION_WITH_RULE that
   is the state COMPOSING_CHAR, for every other method any state other
   than COMPOSING_CHAR.  In that case the composition is finished as
   invalid and control goes to the label invalid_code.

   Otherwise the annotation header at charbuf[-cmp_status->length] is
   completed: its first element is decreased by the number of
   alternate characters plus 2 for COMPOSITION_WITH_ALTCHARS, or by
   three times that number for COMPOSITION_WITH_RULE_ALTCHARS, and its
   third element is set to the number of composed characters.  That
   number is also added to char_offset, and the state becomes
   COMPOSING_NO.

   The macro uses the variables charbuf, cmp_status and char_offset of
   the expanding code.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || (cmp_status->method == COMPOSITION_WITH_RULE                 \
            ? cmp_status->state == COMPOSING_CHAR                       \
            : cmp_status->state != COMPOSING_CHAR))                     \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_ALTCHARS:                                   \
        header[0] -= cmp_status->ncomps + 2;                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        header[0] -= cmp_status->ncomps * 3;                            \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3448
/* Append the pair [ -2 RULE ] to charbuf for the decoded composition
   rule RULE, add 2 to cmp_status->length, and step cmp_status->state
   back to the preceding enumerator.  The macro uses the variables
   charbuf and cmp_status of the expanding code.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3458
/* Append C, a composed character or a component (alternate)
   character, to charbuf and add 1 to cmp_status->length.  C is
   counted in cmp_status->nchars when the state is COMPOSING_CHAR and
   in cmp_status->ncomps otherwise.

   For COMPOSITION_WITH_RULE, and for COMPOSITION_WITH_RULE_ALTCHARS
   while in state COMPOSING_COMPONENT_CHAR, the state is then advanced
   to the next enumerator -- presumably the state in which a rule is
   expected, since STORE_COMPOSITION_RULE steps it back again.

   The macro uses the variables charbuf and cmp_status of the
   expanding code.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3475
3476
3477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3478
3479 static void
3480 decode_coding_iso_2022 (struct coding_system *coding)
3481 {
3482   const unsigned char *src = coding->source + coding->consumed;
3483   const unsigned char *src_end = coding->source + coding->src_bytes;
3484   const unsigned char *src_base;
3485   int *charbuf = coding->charbuf + coding->charbuf_used;
3486   /* We may produce two annotations (charset and composition) in one
3487      loop and one more charset annotation at the end.  */
3488   int *charbuf_end
3489     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3490   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3491   bool multibytep = coding->src_multibyte;
3492   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3493   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3494   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3495   int charset_id_2, charset_id_3;
3496   struct charset *charset;
3497   int c;
3498   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3499   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3500   ptrdiff_t char_offset = coding->produced_char;
3501   ptrdiff_t last_offset = char_offset;
3502   int last_id = charset_ascii;
3503   bool eol_dos
3504     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3505   int byte_after_cr = -1;
3506   int i;
3507
3508   setup_iso_safe_charsets (attrs);
3509   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3510
3511   if (cmp_status->state != COMPOSING_NO)
3512     {
3513       if (charbuf_end - charbuf < cmp_status->length)
3514         emacs_abort ();
3515       for (i = 0; i < cmp_status->length; i++)
3516         *charbuf++ = cmp_status->carryover[i];
3517       coding->annotated = 1;
3518     }
3519
3520   while (1)
3521     {
3522       int c1, c2, c3;
3523
3524       src_base = src;
3525       consumed_chars_base = consumed_chars;
3526
3527       if (charbuf >= charbuf_end)
3528         {
3529           if (byte_after_cr >= 0)
3530             src_base--;
3531           break;
3532         }
3533
3534       if (byte_after_cr >= 0)
3535         c1 = byte_after_cr, byte_after_cr = -1;
3536       else
3537         ONE_MORE_BYTE (c1);
3538       if (c1 < 0)
3539         goto invalid_code;
3540
3541       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3542         {
3543           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3544           char_offset++;
3545           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3546           continue;
3547         }
3548
3549       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3550         {
3551           if (c1 == ISO_CODE_ESC)
3552             {
3553               if (src + 1 >= src_end)
3554                 goto no_more_source;
3555               *charbuf++ = ISO_CODE_ESC;
3556               char_offset++;
3557               if (src[0] == '%' && src[1] == '@')
3558                 {
3559                   src += 2;
3560                   consumed_chars += 2;
3561                   char_offset += 2;
3562                   /* We are sure charbuf can contain two more chars. */
3563                   *charbuf++ = '%';
3564                   *charbuf++ = '@';
3565                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3566                 }
3567             }
3568           else
3569             {
3570               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3571               char_offset++;
3572             }
3573           continue;
3574         }
3575
3576       if ((cmp_status->state == COMPOSING_RULE
3577            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3578           && c1 != ISO_CODE_ESC)
3579         {
3580           int rule;
3581
3582           DECODE_COMPOSITION_RULE (rule);
3583           STORE_COMPOSITION_RULE (rule);
3584           continue;
3585         }
3586
3587       /* We produce at most one character.  */
3588       switch (iso_code_class [c1])
3589         {
3590         case ISO_0x20_or_0x7F:
3591           if (charset_id_0 < 0
3592               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3593             /* This is SPACE or DEL.  */
3594             charset = CHARSET_FROM_ID (charset_ascii);
3595           else
3596             charset = CHARSET_FROM_ID (charset_id_0);
3597           break;
3598
3599         case ISO_graphic_plane_0:
3600           if (charset_id_0 < 0)
3601             charset = CHARSET_FROM_ID (charset_ascii);
3602           else
3603             charset = CHARSET_FROM_ID (charset_id_0);
3604           break;
3605
3606         case ISO_0xA0_or_0xFF:
3607           if (charset_id_1 < 0
3608               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3609               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3610             goto invalid_code;
3611           /* This is a graphic character, we fall down ... */
3612
3613         case ISO_graphic_plane_1:
3614           if (charset_id_1 < 0)
3615             goto invalid_code;
3616           charset = CHARSET_FROM_ID (charset_id_1);
3617           break;
3618
3619         case ISO_control_0:
3620           if (eol_dos && c1 == '\r')
3621             ONE_MORE_BYTE (byte_after_cr);
3622           MAYBE_FINISH_COMPOSITION ();
3623           charset = CHARSET_FROM_ID (charset_ascii);
3624           break;
3625
3626         case ISO_control_1:
3627           goto invalid_code;
3628
3629         case ISO_shift_out:
3630           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3631               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3632             goto invalid_code;
3633           CODING_ISO_INVOCATION (coding, 0) = 1;
3634           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3635           continue;
3636
3637         case ISO_shift_in:
3638           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3639             goto invalid_code;
3640           CODING_ISO_INVOCATION (coding, 0) = 0;
3641           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3642           continue;
3643
3644         case ISO_single_shift_2_7:
3645           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3646             goto invalid_code;
3647         case ISO_single_shift_2:
3648           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3649             goto invalid_code;
3650           /* SS2 is handled as an escape sequence of ESC 'N' */
3651           c1 = 'N';
3652           goto label_escape_sequence;
3653
3654         case ISO_single_shift_3:
3655           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3656             goto invalid_code;
3657           /* SS3 is handled as an escape sequence of ESC 'O' */
3658           c1 = 'O';
3659           goto label_escape_sequence;
3660
3661         case ISO_control_sequence_introducer:
3662           /* CSI is handled as an escape sequence of ESC '[' ...  */
3663           c1 = '[';
3664           goto label_escape_sequence;
3665
3666         case ISO_escape:
3667           ONE_MORE_BYTE (c1);
3668         label_escape_sequence:
3669           /* Escape sequences handled here are invocation,
3670              designation, direction specification, and character
3671              composition specification.  */
3672           switch (c1)
3673             {
3674             case '&':           /* revision of following character set */
3675               ONE_MORE_BYTE (c1);
3676               if (!(c1 >= '@' && c1 <= '~'))
3677                 goto invalid_code;
3678               ONE_MORE_BYTE (c1);
3679               if (c1 != ISO_CODE_ESC)
3680                 goto invalid_code;
3681               ONE_MORE_BYTE (c1);
3682               goto label_escape_sequence;
3683
3684             case '$':           /* designation of 2-byte character set */
3685               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3686                 goto invalid_code;
3687               {
3688                 int reg, chars96;
3689
3690                 ONE_MORE_BYTE (c1);
3691                 if (c1 >= '@' && c1 <= 'B')
3692                   {     /* designation of JISX0208.1978, GB2312.1980,
3693                            or JISX0208.1980 */
3694                     reg = 0, chars96 = 0;
3695                   }
3696                 else if (c1 >= 0x28 && c1 <= 0x2B)
3697                   { /* designation of DIMENSION2_CHARS94 character set */
3698                     reg = c1 - 0x28, chars96 = 0;
3699                     ONE_MORE_BYTE (c1);
3700                   }
3701                 else if (c1 >= 0x2C && c1 <= 0x2F)
3702                   { /* designation of DIMENSION2_CHARS96 character set */
3703                     reg = c1 - 0x2C, chars96 = 1;
3704                     ONE_MORE_BYTE (c1);
3705                   }
3706                 else
3707                   goto invalid_code;
3708                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3709                 /* We must update these variables now.  */
3710                 if (reg == 0)
3711                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3712                 else if (reg == 1)
3713                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3714                 if (chars96 < 0)
3715                   goto invalid_code;
3716               }
3717               continue;
3718
3719             case 'n':           /* invocation of locking-shift-2 */
3720               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3721                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3722                 goto invalid_code;
3723               CODING_ISO_INVOCATION (coding, 0) = 2;
3724               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3725               continue;
3726
3727             case 'o':           /* invocation of locking-shift-3 */
3728               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3729                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3730                 goto invalid_code;
3731               CODING_ISO_INVOCATION (coding, 0) = 3;
3732               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3733               continue;
3734
3735             case 'N':           /* invocation of single-shift-2 */
3736               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3737                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3738                 goto invalid_code;
3739               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3740               if (charset_id_2 < 0)
3741                 charset = CHARSET_FROM_ID (charset_ascii);
3742               else
3743                 charset = CHARSET_FROM_ID (charset_id_2);
3744               ONE_MORE_BYTE (c1);
3745               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3746                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3747                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3748                           ? c1 >= 0x80 : c1 < 0x80)))
3749                 goto invalid_code;
3750               break;
3751
3752             case 'O':           /* invocation of single-shift-3 */
3753               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3754                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3755                 goto invalid_code;
3756               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3757               if (charset_id_3 < 0)
3758                 charset = CHARSET_FROM_ID (charset_ascii);
3759               else
3760                 charset = CHARSET_FROM_ID (charset_id_3);
3761               ONE_MORE_BYTE (c1);
3762               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3763                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3764                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3765                           ? c1 >= 0x80 : c1 < 0x80)))
3766                 goto invalid_code;
3767               break;
3768
3769             case '0': case '2': case '3': case '4': /* start composition */
3770               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3771                 goto invalid_code;
3772               if (last_id != charset_ascii)
3773                 {
3774                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3775                   last_id = charset_ascii;
3776                   last_offset = char_offset;
3777                 }
3778               DECODE_COMPOSITION_START (c1);
3779               continue;
3780
3781             case '1':           /* end composition */
3782               if (cmp_status->state == COMPOSING_NO)
3783                 goto invalid_code;
3784               DECODE_COMPOSITION_END ();
3785               continue;
3786
3787             case '[':           /* specification of direction */
3788               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3789                 goto invalid_code;
3790               /* For the moment, nested direction is not supported.
3791                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3792                  left-to-right, and nonzero means right-to-left.  */
3793               ONE_MORE_BYTE (c1);
3794               switch (c1)
3795                 {
3796                 case ']':       /* end of the current direction */
3797                   coding->mode &= ~CODING_MODE_DIRECTION;
3798
3799                 case '0':       /* end of the current direction */
3800                 case '1':       /* start of left-to-right direction */
3801                   ONE_MORE_BYTE (c1);
3802                   if (c1 == ']')
3803                     coding->mode &= ~CODING_MODE_DIRECTION;
3804                   else
3805                     goto invalid_code;
3806                   break;
3807
3808                 case '2':       /* start of right-to-left direction */
3809                   ONE_MORE_BYTE (c1);
3810                   if (c1 == ']')
3811                     coding->mode |= CODING_MODE_DIRECTION;
3812                   else
3813                     goto invalid_code;
3814                   break;
3815
3816                 default:
3817                   goto invalid_code;
3818                 }
3819               continue;
3820
3821             case '%':
3822               ONE_MORE_BYTE (c1);
3823               if (c1 == '/')
3824                 {
3825                   /* CTEXT extended segment:
3826                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3827                      We keep these bytes as is for the moment.
3828                      They may be decoded by post-read-conversion.  */
3829                   int dim, M, L;
3830                   int size;
3831
3832                   ONE_MORE_BYTE (dim);
3833                   if (dim < '0' || dim > '4')
3834                     goto invalid_code;
3835                   ONE_MORE_BYTE (M);
3836                   if (M < 128)
3837                     goto invalid_code;
3838                   ONE_MORE_BYTE (L);
3839                   if (L < 128)
3840                     goto invalid_code;
3841                   size = ((M - 128) * 128) + (L - 128);
3842                   if (charbuf + 6 > charbuf_end)
3843                     goto break_loop;
3844                   *charbuf++ = ISO_CODE_ESC;
3845                   *charbuf++ = '%';
3846                   *charbuf++ = '/';
3847                   *charbuf++ = dim;
3848                   *charbuf++ = BYTE8_TO_CHAR (M);
3849                   *charbuf++ = BYTE8_TO_CHAR (L);
3850                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3851                 }
3852               else if (c1 == 'G')
3853                 {
3854                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3855                      ESC % G --UTF-8-BYTES-- ESC % @
3856                      We keep these bytes as is for the moment.
3857                      They may be decoded by post-read-conversion.  */
3858                   if (charbuf + 3 > charbuf_end)
3859                     goto break_loop;
3860                   *charbuf++ = ISO_CODE_ESC;
3861                   *charbuf++ = '%';
3862                   *charbuf++ = 'G';
3863                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3864                 }
3865               else
3866                 goto invalid_code;
3867               continue;
3868               break;
3869
3870             default:
3871               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3872                 goto invalid_code;
3873               {
3874                 int reg, chars96;
3875
3876                 if (c1 >= 0x28 && c1 <= 0x2B)
3877                   { /* designation of DIMENSION1_CHARS94 character set */
3878                     reg = c1 - 0x28, chars96 = 0;
3879                     ONE_MORE_BYTE (c1);
3880                   }
3881                 else if (c1 >= 0x2C && c1 <= 0x2F)
3882                   { /* designation of DIMENSION1_CHARS96 character set */
3883                     reg = c1 - 0x2C, chars96 = 1;
3884                     ONE_MORE_BYTE (c1);
3885                   }
3886                 else
3887                   goto invalid_code;
3888                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3889                 /* We must update these variables now.  */
3890                 if (reg == 0)
3891                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3892                 else if (reg == 1)
3893                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3894                 if (chars96 < 0)
3895                   goto invalid_code;
3896               }
3897               continue;
3898             }
3899           break;
3900
3901         default:
3902           emacs_abort ();
3903         }
3904
3905       if (cmp_status->state == COMPOSING_NO
3906           && charset->id != charset_ascii
3907           && last_id != charset->id)
3908         {
3909           if (last_id != charset_ascii)
3910             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3911           last_id = charset->id;
3912           last_offset = char_offset;
3913         }
3914
3915       /* Now we know CHARSET and 1st position code C1 of a character.
3916          Produce a decoded character while getting 2nd and 3rd
3917          position codes C2, C3 if necessary.  */
3918       if (CHARSET_DIMENSION (charset) > 1)
3919         {
3920           ONE_MORE_BYTE (c2);
3921           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3922               || ((c1 & 0x80) != (c2 & 0x80)))
3923             /* C2 is not in a valid range.  */
3924             goto invalid_code;
3925           if (CHARSET_DIMENSION (charset) == 2)
3926             c1 = (c1 << 8) | c2;
3927           else
3928             {
3929               ONE_MORE_BYTE (c3);
3930               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3931                   || ((c1 & 0x80) != (c3 & 0x80)))
3932                 /* C3 is not in a valid range.  */
3933                 goto invalid_code;
3934               c1 = (c1 << 16) | (c2 << 8) | c2;
3935             }
3936         }
3937       c1 &= 0x7F7F7F;
3938       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3939       if (c < 0)
3940         {
3941           MAYBE_FINISH_COMPOSITION ();
3942           for (; src_base < src; src_base++, char_offset++)
3943             {
3944               if (ASCII_CHAR_P (*src_base))
3945                 *charbuf++ = *src_base;
3946               else
3947                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3948             }
3949         }
3950       else if (cmp_status->state == COMPOSING_NO)
3951         {
3952           *charbuf++ = c;
3953           char_offset++;
3954         }
3955       else if ((cmp_status->state == COMPOSING_CHAR
3956                 ? cmp_status->nchars
3957                 : cmp_status->ncomps)
3958                >= MAX_COMPOSITION_COMPONENTS)
3959         {
3960           /* Too long composition.  */
3961           MAYBE_FINISH_COMPOSITION ();
3962           *charbuf++ = c;
3963           char_offset++;
3964         }
3965       else
3966         STORE_COMPOSITION_CHAR (c);
3967       continue;
3968
3969     invalid_code:
3970       MAYBE_FINISH_COMPOSITION ();
3971       src = src_base;
3972       consumed_chars = consumed_chars_base;
3973       ONE_MORE_BYTE (c);
3974       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3975       char_offset++;
3976       /* Reset the invocation and designation status to the safest
3977          one; i.e. designate ASCII to the graphic register 0, and
3978          invoke that register to the graphic plane 0.  This typically
3979          helps the case that an designation sequence for ASCII "ESC (
3980          B" is somehow broken (e.g. broken by a newline).  */
3981       CODING_ISO_INVOCATION (coding, 0) = 0;
3982       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3983       charset_id_0 = charset_ascii;
3984       continue;
3985
3986     break_loop:
3987       break;
3988     }
3989
3990  no_more_source:
3991   if (cmp_status->state != COMPOSING_NO)
3992     {
3993       if (coding->mode & CODING_MODE_LAST_BLOCK)
3994         MAYBE_FINISH_COMPOSITION ();
3995       else
3996         {
3997           charbuf -= cmp_status->length;
3998           for (i = 0; i < cmp_status->length; i++)
3999             cmp_status->carryover[i] = charbuf[i];
4000         }
4001     }
4002   else if (last_id != charset_ascii)
4003     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4004   coding->consumed_char += consumed_chars_base;
4005   coding->consumed = src_base - coding->source;
4006   coding->charbuf_used = charbuf - coding->charbuf;
4007 }
4008
4009
4010 /* ISO2022 encoding stuff.  */
4011
4012 /*
4013    It is not enough to say just "ISO2022" on encoding, we have to
4014    specify more details.  In Emacs, each coding system of ISO2022
4015    variant has the following specifications:
4016         1. Initial designation to G0 thru G3.
4017         2. Allows short-form designation?
4018         3. ASCII should be designated to G0 before control characters?
4019         4. ASCII should be designated to G0 at end of line?
4020         5. 7-bit environment or 8-bit environment?
4021         6. Use locking-shift?
4022         7. Use Single-shift?
4023    And the following two are only for Japanese:
4024         8. Use ASCII in place of JIS0201-1976-Roman?
4025         9. Use JISX0208-1983 in place of JISX0208-1978?
4026    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4027    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4028    details.
4029 */
4030
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  If CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative revision number, "ESC & <'@' + revision>" is emitted
   first.  A multi-dimension 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B' is written in short form (without
   the intermediate character) unless CODING has
   CODING_ISO_FLAG_LONG_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate characters selecting G0..G3 for this charset.  */   \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int desig_revision = -1;                                            \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_revision = CHARSET_ISO_REVISION (charset);                  \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Everything but the short form carries an intermediate.  */       \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || desig_final < '@' || desig_final > 'B')                      \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4078
4079
4080 /* The following two macros produce codes (control character or escape
4081    sequence) for ISO2022 single-shift functions (single-shift-2 and
4082    single-shift-3).  */
4083
/* Emit single-shift-2: the control code ISO_CODE_SS2, or "ESC N" when
   CODING has CODING_ISO_FLAG_SEVEN_BITS.  Then mark CODING as being
   in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4092
4093
/* Emit single-shift-3: the control code ISO_CODE_SS3, or "ESC O" when
   CODING has CODING_ISO_FLAG_SEVEN_BITS.  Then mark CODING as being
   in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4102
4103
4104 /* The following four macros produce codes (control character or
4105    escape sequence) for ISO2022 locking-shift functions (shift-in,
4106    shift-out, locking-shift-2, and locking-shift-3).  */
4107
/* Emit shift-in (ISO_CODE_SI) and record that graphic register 0 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4113
4114
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4120
4121
/* Emit locking-shift-2 ("ESC n") and record that graphic register 2
   is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4127
4128
/* Emit locking-shift-3 ("ESC o") and record that graphic register 3
   is now invoked to graphic plane 0.  The final byte must be 'o':
   "ESC n" is locking-shift-2, and emitting it here would make the
   byte stream disagree with the invocation state recorded below.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4134
4135
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the JISX0201-Roman charset.  C1 is parenthesized at every use so
   that an argument containing operators is handled correctly, in
   the same way as in ENCODE_ISO_CHARACTER_DIMENSION2.

   The body is a loop: if CHARSET is not yet invoked to graphic plane
   0 or 1, encode_invocation_designation is called and the checks are
   repeated; the loop is left as soon as the character is emitted.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code precedes this character; emit it in     \
           7-bit or 8-bit form and leave the single-shifting state.  */ \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4178
4179
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, preceded by whatever designation and
   invocation codes are needed.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is JISX0208, it is replaced
   by the JISX0208-1978 charset.  The body loops until the character
   has been emitted.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code precedes this character.  */             \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet: invoke it    \
       (designating it to a graphic register first if needed), then     \
       go around again to emit the character itself.  */                \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4222
4223
/* Encode character C of CHARSET.  The code of C in CHARSET is
   obtained with CODING_ENCODE_CHAR and handed to the DIMENSION1
   macro when CHARSET has dimension 1; otherwise it is split into
   "code >> 8" and its low byte and handed to the DIMENSION2 macro.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4234
4235
4236 /* Produce designation and invocation codes at a place pointed by DST
4237    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4238    Return new DST.  */
4239
4240 static unsigned char *
4241 encode_invocation_designation (struct charset *charset,
4242                                struct coding_system *coding,
4243                                unsigned char *dst, ptrdiff_t *p_nchars)
4244 {
4245   bool multibytep = coding->dst_multibyte;
4246   ptrdiff_t produced_chars = *p_nchars;
4247   int reg;                      /* graphic register number */
4248   int id = CHARSET_ID (charset);
4249
4250   /* At first, check designations.  */
4251   for (reg = 0; reg < 4; reg++)
4252     if (id == CODING_ISO_DESIGNATION (coding, reg))
4253       break;
4254
4255   if (reg >= 4)
4256     {
4257       /* CHARSET is not yet designated to any graphic registers.  */
4258       /* At first check the requested designation.  */
4259       reg = CODING_ISO_REQUEST (coding, id);
4260       if (reg < 0)
4261         /* Since CHARSET requests no special designation, designate it
4262            to graphic register 0.  */
4263         reg = 0;
4264
4265       ENCODE_DESIGNATION (charset, reg, coding);
4266     }
4267
4268   if (CODING_ISO_INVOCATION (coding, 0) != reg
4269       && CODING_ISO_INVOCATION (coding, 1) != reg)
4270     {
4271       /* Since the graphic register REG is not invoked to any graphic
4272          planes, invoke it to graphic plane 0.  */
4273       switch (reg)
4274         {
4275         case 0:                 /* graphic register 0 */
4276           ENCODE_SHIFT_IN;
4277           break;
4278
4279         case 1:                 /* graphic register 1 */
4280           ENCODE_SHIFT_OUT;
4281           break;
4282
4283         case 2:                 /* graphic register 2 */
4284           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4285             ENCODE_SINGLE_SHIFT_2;
4286           else
4287             ENCODE_LOCKING_SHIFT_2;
4288           break;
4289
4290         case 3:                 /* graphic register 3 */
4291           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4292             ENCODE_SINGLE_SHIFT_3;
4293           else
4294             ENCODE_LOCKING_SHIFT_3;
4295           break;
4296
4297         default:
4298           break;
4299         }
4300     }
4301
4302   *p_nchars = produced_chars;
4303   return dst;
4304 }
4305
4306
/* Emit the codes that bring the graphic planes and registers back to
   their initial state: shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for every register whose
   initial charset is specified (non-negative) and differs from the
   one currently designated.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* No initial charset for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        /* Already in its initial state.  */                            \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4325
4326
4327 /* Produce designation sequences of charsets in the line started from
4328    CHARBUF to a place pointed by DST, and return the number of
4329    produced bytes.  DST should not directly point a buffer text area
4330    which may be relocated by char_charset call.
4331
4332    If the current block ends before any end-of-line, we may fail to
4333    find all the necessary designations.  */
4334
4335 static ptrdiff_t
4336 encode_designation_at_bol (struct coding_system *coding,
4337                            int *charbuf, int *charbuf_end,
4338                            unsigned char *dst)
4339 {
4340   unsigned char *orig = dst;
4341   struct charset *charset;
4342   /* Table of charsets to be designated to each graphic register.  */
4343   int r[4];
4344   int c, found = 0, reg;
4345   ptrdiff_t produced_chars = 0;
4346   bool multibytep = coding->dst_multibyte;
4347   Lisp_Object attrs;
4348   Lisp_Object charset_list;
4349
4350   attrs = CODING_ID_ATTRS (coding->id);
4351   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4352   if (EQ (charset_list, Qiso_2022))
4353     charset_list = Viso_2022_charset_list;
4354
4355   for (reg = 0; reg < 4; reg++)
4356     r[reg] = -1;
4357
4358   while (charbuf < charbuf_end && found < 4)
4359     {
4360       int id;
4361
4362       c = *charbuf++;
4363       if (c == '\n')
4364         break;
4365       charset = char_charset (c, charset_list, NULL);
4366       id = CHARSET_ID (charset);
4367       reg = CODING_ISO_REQUEST (coding, id);
4368       if (reg >= 0 && r[reg] < 0)
4369         {
4370           found++;
4371           r[reg] = id;
4372         }
4373     }
4374
4375   if (found)
4376     {
4377       for (reg = 0; reg < 4; reg++)
4378         if (r[reg] >= 0
4379             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4380           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4381     }
4382
4383   return dst - orig;
4384 }
4385
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters and annotations held in the first
   CODING->charbuf_used elements of CODING->charbuf as ISO-2022
   bytes, writing them from CODING->destination + CODING->produced.
   On the exit path at the end of this function the conversion result
   is recorded as CODING_RESULT_SUCCESS, CODING->produced_char and
   CODING->produced are updated, the pending beginning-of-line
   designation flag is saved in CODING_ISO_BOL (CODING), and 0 is
   returned.

   NOTE(review): ASSURE_DESTINATION, CODING_CHAR_CHARSET and the
   EMIT_XXX macros are defined outside this chunk; they may adjust
   DST/DST_END or leave the function early -- confirm there.  */

static bool
encode_coding_iso_2022 (struct coding_system *coding)
{
  /* MULTIBYTEP is not referenced by name below; presumably the
     EMIT_XXX macros use it -- TODO confirm.  */
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Amount passed to ASSURE_DESTINATION before each element is
     handled.  */
  int safe_room = 16;
  /* True if designation sequences must be produced before the next
     character, i.e. the coding system designates at beginning of
     line and we are at one.  */
  bool bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  ptrdiff_t produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  bool ascii_compatible;
  int c;
  /* Charset ID taken from the latest charset annotation, or -1.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector EOL type is treated the same as Qunix here.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  /* ASCII characters can be emitted as is only when the coding
     system is marked ASCII compatible and uses neither designation
     nor locking shift.  */
  ascii_compatible
    = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
                                          | CODING_ISO_FLAG_LOCKING_SHIFT)));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          /* We have to produce designation sequences if any now.  */
          /* NOTE(review): encode_designation_at_bol can emit up to
             four designation sequences into this 16-byte buffer;
             verify that the worst case (e.g. with revision prefixes)
             fits.  */
          unsigned char desig_buf[16];
          ptrdiff_t nbytes;
          ptrdiff_t offset;

          /* The sequences are built in DESIG_BUF rather than at DST;
             if a charset map was loaded meanwhile, DST and DST_END
             are shifted by the offset reported by
             coding_change_destination before copying.  */
          charset_map_loaded = 0;
          nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
                                              desig_buf);
          if (charset_map_loaded
              && (offset = coding_change_destination (coding)))
            {
              dst += offset;
              dst_end += offset;
            }
          memcpy (dst, desig_buf, nbytes);
          dst += nbytes;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += nbytes;
          bol_designation = 0;
          ASSURE_DESTINATION (safe_room);
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  */
          /* C is the negated length of the annotation; CHARBUF now
             points at the element holding its type.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              /* Remember the annotated charset, but only if it is a
                 member of CHARSET_LIST.  */
              preferred_charset_id = charbuf[2];
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              emacs_abort ();
            }
          /* Skip the rest of the annotation.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  /* Restore the recorded designations to the initial
                     ones without emitting anything.  */
                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              bol_designation = ((CODING_ISO_FLAGS (coding)
                                  & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
                                 != 0);
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as that single byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Try the charset from the latest charset annotation
             first, then fall back to searching CHARSET_LIST.  */
          if (preferred_charset_id >= 0)
            {
              bool result;

              charset = CHARSET_FROM_ID (preferred_charset_id);
              CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
              if (! result)
                CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                     NULL, charset);
            }
          else
            CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
                                 NULL, charset);
          if (!charset)
            {
              /* No charset can encode C: substitute another
                 character for it.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  CODING_CHAR_CHARSET (coding, dst, dst_end, c,
                                       charset_list, NULL, charset);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the planes and registers as
     is done at an end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Carry the beginning-of-line state over to the next call.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4559
4560 \f
4561 /*** 8. SJIS and BIG5 handlers ***/
4562
4563 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4564    quite widely.  So, for the moment, Emacs supports them in the bare
4565    C code.  But, in the future, they may be supported only by CCL.  */
4566
4567 /* SJIS is a coding system encoding three character sets: ASCII, right
4568    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4569    as is.  A character of charset katakana-jisx0201 is encoded by
4570    "position-code + 0x80".  A character of charset japanese-jisx0208
4571    is encoded in two bytes, but the two position-codes are divided and
4572    shifted so that they fit in the ranges below.
4573
4574    --- CODE RANGE of SJIS ---
4575    (character set)      (range)
4576    ASCII                0x00 .. 0x7F
4577    KATAKANA-JISX0201    0xA0 .. 0xDF
4578    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4579             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4580    -------------------------------
4581
4582 */
4583
4584 /* BIG5 is a coding system encoding two character sets: ASCII and
4585    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4586    character set and is encoded in two bytes.
4587
4588    --- CODE RANGE of BIG5 ---
4589    (character set)      (range)
4590    ASCII                0x00 .. 0x7F
4591    Big5 (1st byte)      0xA1 .. 0xFE
4592         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4593    --------------------------
4594
4595   */
4596
4597 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4598    Return true if a text is encoded in SJIS.  */
4599
4600 static bool
4601 detect_coding_sjis (struct coding_system *coding,
4602                     struct coding_detection_info *detect_info)
4603 {
       /* Walk the source bytes of CODING and test them against the SJIS
          byte ranges.  CATEGORY_MASK_SJIS is always added to
          DETECT_INFO->checked.  A byte that fits no range adds the mask
          to DETECT_INFO->rejected and returns 0; reaching no_more_source
          without a pending sequence merges FOUND into DETECT_INFO->found
          and returns 1.  */
4604   const unsigned char *src = coding->source, *src_base;
4605   const unsigned char *src_end = coding->source + coding->src_bytes;
4606   bool multibytep = coding->src_multibyte;
4607   ptrdiff_t consumed_chars = 0;
4608   int found = 0;
4609   int c;
4610   Lisp_Object attrs, charset_list;
4611   int max_first_byte_of_2_byte_code;
4612
4613   CODING_GET_INFO (coding, attrs, charset_list);
       /* Lead bytes above 0xEF (up to 0xFC) are accepted only when the
          charset list has more than three entries.  */
4614   max_first_byte_of_2_byte_code
4615     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4616
4617   detect_info->checked |= CATEGORY_MASK_SJIS;
4618   /* A coding system of this category is always ASCII compatible.  */
4619   src += coding->head_ascii;
4620
4621   while (1)
4622     {
           /* Remember where this sequence starts; the value is tested at
              no_more_source.  */
4623       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source once SRC reaches SRC_END,
              since no explicit goto to that label appears in this
              function -- confirm against the macro.  */
4624       ONE_MORE_BYTE (c);
4625       if (c < 0x80)
4626         continue;
           /* Lead byte of a two-byte code.  */
4627       if ((c >= 0x81 && c <= 0x9F)
4628           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4629         {
4630           ONE_MORE_BYTE (c);
               /* The trail byte must lie in 0x40..0xFC, excluding 0x7F.  */
4631           if (c < 0x40 || c == 0x7F || c > 0xFC)
4632             break;
4633           found = CATEGORY_MASK_SJIS;
4634         }
           /* A single byte in 0xA0..0xDF also counts as evidence.  */
4635       else if (c >= 0xA0 && c < 0xE0)
4636         found = CATEGORY_MASK_SJIS;
4637       else
4638         break;
4639     }
       /* Reached only by `break': a byte did not fit any SJIS range.  */
4640   detect_info->rejected |= CATEGORY_MASK_SJIS;
4641   return 0;
4642
4643  no_more_source:
       /* SRC_BASE < SRC means part of the current sequence had already
          been read (presumably the input ended in the middle of a
          two-byte code).  In the last block that is a rejection.  */
4644   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4645     {
4646       detect_info->rejected |= CATEGORY_MASK_SJIS;
4647       return 0;
4648     }
4649   detect_info->found |= found;
4650   return 1;
4651 }
4652
4653 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4654    Return true if a text is encoded in BIG5.  */
4655
4656 static bool
4657 detect_coding_big5 (struct coding_system *coding,
4658                     struct coding_detection_info *detect_info)
4659 {
       /* Walk the source bytes of CODING and test them against the BIG5
          byte ranges.  CATEGORY_MASK_BIG5 is always added to
          DETECT_INFO->checked.  Reaching no_more_source without a pending
          sequence merges FOUND into DETECT_INFO->found and returns 1;
          every other exit returns 0.  */
4660   const unsigned char *src = coding->source, *src_base;
4661   const unsigned char *src_end = coding->source + coding->src_bytes;
4662   bool multibytep = coding->src_multibyte;
4663   ptrdiff_t consumed_chars = 0;
4664   int found = 0;
4665   int c;
4666
4667   detect_info->checked |= CATEGORY_MASK_BIG5;
4668   /* A coding system of this category is always ASCII compatible.  */
4669   src += coding->head_ascii;
4670
4671   while (1)
4672     {
           /* Remember where this sequence starts; the value is tested at
              no_more_source.  */
4673       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source once SRC reaches SRC_END
              -- confirm against the macro.  */
4674       ONE_MORE_BYTE (c);
4675       if (c < 0x80)
4676         continue;
           /* Any byte from 0xA1 upward is taken as a lead byte.
              NOTE(review): that includes 0xFF, which decode_coding_big5
              treats as invalid -- confirm whether this is intended.  */
4677       if (c >= 0xA1)
4678         {
4679           ONE_MORE_BYTE (c);
               /* Trail bytes below 0x40 or in 0x7F..0xA0 are invalid.
                  NOTE(review): this exit does not add CATEGORY_MASK_BIG5
                  to `rejected', unlike the `break' path below and the
                  corresponding test in detect_coding_sjis; a trail byte
                  of 0xFF is also let through here although
                  decode_coding_big5 rejects it -- confirm both.  */
4680           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4681             return 0;
4682           found = CATEGORY_MASK_BIG5;
4683         }
4684       else
4685         break;
4686     }
       /* Reached only by `break': a byte in 0x80..0xA0 was seen where a
          lead byte was expected.  */
4687   detect_info->rejected |= CATEGORY_MASK_BIG5;
4688   return 0;
4689
4690  no_more_source:
       /* SRC_BASE < SRC means part of the current sequence had already
          been read (presumably the input ended in the middle of a
          two-byte code).  In the last block that is a rejection.  */
4691   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4692     {
4693       detect_info->rejected |= CATEGORY_MASK_BIG5;
4694       return 0;
4695     }
4696   detect_info->found |= found;
4697   return 1;
4698 }
4699
4700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4701
4702 static void
4703 decode_coding_sjis (struct coding_system *coding)
4704 {
       /* Decode SJIS bytes from CODING's source, starting at
          CODING->consumed, into character codes appended to
          CODING->charbuf.  On exit CODING->consumed_char,
          CODING->consumed and CODING->charbuf_used are updated.  */
4705   const unsigned char *src = coding->source + coding->consumed;
4706   const unsigned char *src_end = coding->source + coding->src_bytes;
4707   const unsigned char *src_base;
4708   int *charbuf = coding->charbuf + coding->charbuf_used;
4709   /* We may produce one charset annotation in one loop and one more at
4710      the end.  */
4711   int *charbuf_end
4712     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4713   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4714   bool multibytep = coding->src_multibyte;
4715   struct charset *charset_roman, *charset_kanji, *charset_kana;
4716   struct charset *charset_kanji2;
4717   Lisp_Object attrs, charset_list, val;
4718   ptrdiff_t char_offset = coding->produced_char;
4719   ptrdiff_t last_offset = char_offset;
4720   int last_id = charset_ascii;
4721   bool eol_dos
4722     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR, or -1 if there is none pending.  */
4723   int byte_after_cr = -1;
4724
4725   CODING_GET_INFO (coding, attrs, charset_list);
4726
       /* The charset list supplies, in order, the roman, kana and kanji
          charsets, optionally followed by a fourth one (kanji2).  */
4727   val = charset_list;
4728   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4729   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4732
4733   while (1)
4734     {
4735       int c, c1;
4736       struct charset *charset;
4737
           /* Checkpoint the position; invalid_code rewinds to it and the
              exit code reports it as the amount consumed.  */
4738       src_base = src;
4739       consumed_chars_base = consumed_chars;
4740
4741       if (charbuf >= charbuf_end)
4742         {
               /* Output buffer is full.  If a byte was read ahead after a
                  CR, step SRC_BASE back so that byte is not reported as
                  consumed.  */
4743           if (byte_after_cr >= 0)
4744             src_base--;
4745           break;
4746         }
4747
           /* Use the pending read-ahead byte first, if any.  */
4748       if (byte_after_cr >= 0)
4749         c = byte_after_cr, byte_after_cr = -1;
4750       else
4751         ONE_MORE_BYTE (c);
           /* A negative value is not an SJIS byte.  */
4752       if (c < 0)
4753         goto invalid_code;
4754       if (c < 0x80)
4755         {
               /* With DOS EOL conversion, fetch the byte after a CR now
                  and keep it for the next iteration.  */
4756           if (eol_dos && c == '\r')
4757             ONE_MORE_BYTE (byte_after_cr);
4758           charset = charset_roman;
4759         }
4760       else if (c == 0x80 || c == 0xA0)
4761         goto invalid_code;
4762       else if (c >= 0xA1 && c <= 0xDF)
4763         {
4764           /* SJIS -> JISX0201-Kana */
4765           c &= 0x7F;
4766           charset = charset_kana;
4767         }
4768       else if (c <= 0xEF)
4769         {
4770           /* SJIS -> JISX0208 */
4771           ONE_MORE_BYTE (c1);
               /* The trail byte must lie in 0x40..0xFC, excluding 0x7F.  */
4772           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4773             goto invalid_code;
4774           c = (c << 8) | c1;
4775           SJIS_TO_JIS (c);
4776           charset = charset_kanji;
4777         }
           /* Lead bytes 0xF0..0xFC are valid only when a fourth charset
              is configured.  */
4778       else if (c <= 0xFC && charset_kanji2)
4779         {
4780           /* SJIS -> JISX0213-2 */
4781           ONE_MORE_BYTE (c1);
4782           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4783             goto invalid_code;
4784           c = (c << 8) | c1;
4785           SJIS_TO_JIS2 (c);
4786           charset = charset_kanji2;
4787         }
4788       else
4789         goto invalid_code;
           /* On a switch to a different non-ASCII charset, hand the run
              that just ended (unless it was ASCII) to ADD_CHARSET_DATA
              and start a new run at the current offset.  */
4790       if (charset->id != charset_ascii
4791           && last_id != charset->id)
4792         {
4793           if (last_id != charset_ascii)
4794             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4795           last_id = charset->id;
4796           last_offset = char_offset;
4797         }
4798       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4799       *charbuf++ = c;
4800       char_offset++;
4801       continue;
4802
4803     invalid_code:
           /* Rewind to the start of the offending sequence and emit only
              its first byte: a negative value is stored negated, any
              other value goes through BYTE8_TO_CHAR.  */
4804       src = src_base;
4805       consumed_chars = consumed_chars_base;
4806       ONE_MORE_BYTE (c);
4807       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4808       char_offset++;
4809     }
4810
4811  no_more_source:
       /* Flush the annotation for the last non-ASCII run, then report
          progress up to the last checkpoint.  */
4812   if (last_id != charset_ascii)
4813     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4814   coding->consumed_char += consumed_chars_base;
4815   coding->consumed = src_base - coding->source;
4816   coding->charbuf_used = charbuf - coding->charbuf;
4817 }
4818
4819 static void
4820 decode_coding_big5 (struct coding_system *coding)
4821 {
       /* Decode BIG5 bytes from CODING's source, starting at
          CODING->consumed, into character codes appended to
          CODING->charbuf.  On exit CODING->consumed_char,
          CODING->consumed and CODING->charbuf_used are updated.  */
4822   const unsigned char *src = coding->source + coding->consumed;
4823   const unsigned char *src_end = coding->source + coding->src_bytes;
4824   const unsigned char *src_base;
4825   int *charbuf = coding->charbuf + coding->charbuf_used;
4826   /* We may produce one charset annotation in one loop and one more at
4827      the end.  */
4828   int *charbuf_end
4829     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4830   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4831   bool multibytep = coding->src_multibyte;
4832   struct charset *charset_roman, *charset_big5;
4833   Lisp_Object attrs, charset_list, val;
4834   ptrdiff_t char_offset = coding->produced_char;
4835   ptrdiff_t last_offset = char_offset;
4836   int last_id = charset_ascii;
4837   bool eol_dos
4838     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR, or -1 if there is none pending.  */
4839   int byte_after_cr = -1;
4840
4841   CODING_GET_INFO (coding, attrs, charset_list);
       /* The first two entries of the charset list are the roman and the
          Big5 charsets.  */
4842   val = charset_list;
4843   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4844   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4845
4846   while (1)
4847     {
4848       int c, c1;
4849       struct charset *charset;
4850
           /* Checkpoint the position; invalid_code rewinds to it and the
              exit code reports it as the amount consumed.  */
4851       src_base = src;
4852       consumed_chars_base = consumed_chars;
4853
4854       if (charbuf >= charbuf_end)
4855         {
               /* Output buffer is full.  If a byte was read ahead after a
                  CR, step SRC_BASE back so that byte is not reported as
                  consumed.  */
4856           if (byte_after_cr >= 0)
4857             src_base--;
4858           break;
4859         }
4860
           /* Use the pending read-ahead byte first, if any.  */
4861       if (byte_after_cr >= 0)
4862         c = byte_after_cr, byte_after_cr = -1;
4863       else
4864         ONE_MORE_BYTE (c);
4865
           /* A negative value is not a BIG5 byte.  */
4866       if (c < 0)
4867         goto invalid_code;
4868       if (c < 0x80)
4869         {
               /* With DOS EOL conversion, fetch the byte after a CR now
                  and keep it for the next iteration.  */
4870           if (eol_dos && c == '\r')
4871             ONE_MORE_BYTE (byte_after_cr);
4872           charset = charset_roman;
4873         }
4874       else
4875         {
4876           /* BIG5 -> Big5 */
               /* Lead byte must be in 0xA1..0xFE; trail byte in
                  0x40..0x7E or 0xA1..0xFE.  */
4877           if (c < 0xA1 || c > 0xFE)
4878             goto invalid_code;
4879           ONE_MORE_BYTE (c1);
4880           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4881             goto invalid_code;
4882           c = c << 8 | c1;
4883           charset = charset_big5;
4884         }
           /* On a switch to a different non-ASCII charset, hand the run
              that just ended (unless it was ASCII) to ADD_CHARSET_DATA
              and start a new run at the current offset.  */
4885       if (charset->id != charset_ascii
4886           && last_id != charset->id)
4887         {
4888           if (last_id != charset_ascii)
4889             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4890           last_id = charset->id;
4891           last_offset = char_offset;
4892         }
4893       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4894       *charbuf++ = c;
4895       char_offset++;
4896       continue;
4897
4898     invalid_code:
           /* Rewind to the start of the offending sequence and emit only
              its first byte: a negative value is stored negated, any
              other value goes through BYTE8_TO_CHAR.  */
4899       src = src_base;
4900       consumed_chars = consumed_chars_base;
4901       ONE_MORE_BYTE (c);
4902       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4903       char_offset++;
4904     }
4905
4906  no_more_source:
       /* Flush the annotation for the last non-ASCII run, then report
          progress up to the last checkpoint.  */
4907   if (last_id != charset_ascii)
4908     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4909   coding->consumed_char += consumed_chars_base;
4910   coding->consumed = src_base - coding->source;
4911   coding->charbuf_used = charbuf - coding->charbuf;
4912 }
4913
4914 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4915    This function can encode charsets `ascii', `katakana-jisx0201',
4916    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4917    are sure that all these charsets are registered as official charset
4918    (i.e. do not have extended leading-codes).  Characters of other
4919    charsets are produced without any encoding.  */
4920
4921 static bool
4922 encode_coding_sjis (struct coding_system *coding)
4923 {
       /* Encode the character codes held in CODING->charbuf as SJIS bytes
          written to CODING->destination from offset CODING->produced.
          When the loop runs to completion, record CODING_RESULT_SUCCESS,
          update CODING->produced_char and CODING->produced, and return 0.
          NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
          defined elsewhere; they presumably maintain DST and
          PRODUCED_CHARS and may leave the function early when the
          destination is too small -- confirm against the macros.  */
4924   bool multibytep = coding->dst_multibyte;
4925   int *charbuf = coding->charbuf;
4926   int *charbuf_end = charbuf + coding->charbuf_used;
4927   unsigned char *dst = coding->destination + coding->produced;
4928   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4929   int safe_room = 4;
4930   ptrdiff_t produced_chars = 0;
4931   Lisp_Object attrs, charset_list, val;
4932   bool ascii_compatible;
4933   struct charset *charset_kanji, *charset_kana;
4934   struct charset *charset_kanji2;
4935   int c;
4936
4937   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first entry of the charset list; the following ones
          are kana, kanji and an optional fourth charset (kanji2).  */
4938   val = XCDR (charset_list);
4939   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4940   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4942
4943   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4944
4945   while (charbuf < charbuf_end)
4946     {
4947       ASSURE_DESTINATION (safe_room);
4948       c = *charbuf++;
4949       /* Now encode the character C.  */
4950       if (ASCII_CHAR_P (c) && ascii_compatible)
4951         EMIT_ONE_ASCII_BYTE (c);
4952       else if (CHAR_BYTE8_P (c))
4953         {
               /* Raw eight-bit character: emit the underlying byte.  */
4954           c = CHAR_TO_BYTE8 (c);
4955           EMIT_ONE_BYTE (c);
4956         }
4957       else
4958         {
4959           unsigned code;
4960           struct charset *charset;
4961           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4962                                &code, charset);
4963
               /* CHARSET is null when the lookup found no charset for C.
                  In safe-encoding mode substitute
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII code;
                  otherwise look up CODING->default_char instead.  */
4964           if (!charset)
4965             {
4966               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4967                 {
4968                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4969                   charset = CHARSET_FROM_ID (charset_ascii);
4970                 }
4971               else
4972                 {
4973                   c = coding->default_char;
4974                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4975                                        charset_list, &code, charset);
4976                 }
4977             }
               /* An invalid code at this point is treated as a fatal
                  internal error.  */
4978           if (code == CHARSET_INVALID_CODE (charset))
4979             emacs_abort ();
4980           if (charset == charset_kanji)
4981             {
4982               int c1, c2;
4983               JIS_TO_SJIS (code);
4984               c1 = code >> 8, c2 = code & 0xFF;
4985               EMIT_TWO_BYTES (c1, c2);
4986             }
4987           else if (charset == charset_kana)
4988             EMIT_ONE_BYTE (code | 0x80);
4989           else if (charset_kanji2 && charset == charset_kanji2)
4990             {
4991               int c1, c2;
4992
                   /* Only codes whose high byte is one of the values
                      tested below are converted with JIS_TO_SJIS2; any
                      other code is emitted as a single 7-bit byte.  */
4993               c1 = code >> 8;
4994               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4995                   || c1 == 0x28
4996                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4997                 {
4998                   JIS_TO_SJIS2 (code);
4999                   c1 = code >> 8, c2 = code & 0xFF;
5000                   EMIT_TWO_BYTES (c1, c2);
5001                 }
5002               else
5003                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5004             }
5005           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5006             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5007         }
5008     }
5009   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5010   coding->produced_char += produced_chars;
5011   coding->produced = dst - coding->destination;
5012   return 0;
5013 }
5014
5015 static bool
5016 encode_coding_big5 (struct coding_system *coding)
5017 {
       /* Encode the character codes held in CODING->charbuf as BIG5 bytes
          written to CODING->destination from offset CODING->produced.
          When the loop runs to completion, record CODING_RESULT_SUCCESS,
          update CODING->produced_char and CODING->produced, and return 0.
          NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
          defined elsewhere; they presumably maintain DST and
          PRODUCED_CHARS and may leave the function early when the
          destination is too small -- confirm against the macros.  */
5018   bool multibytep = coding->dst_multibyte;
5019   int *charbuf = coding->charbuf;
5020   int *charbuf_end = charbuf + coding->charbuf_used;
5021   unsigned char *dst = coding->destination + coding->produced;
5022   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5023   int safe_room = 4;
5024   ptrdiff_t produced_chars = 0;
5025   Lisp_Object attrs, charset_list, val;
5026   bool ascii_compatible;
5027   struct charset *charset_big5;
5028   int c;
5029
5030   CODING_GET_INFO (coding, attrs, charset_list);
       /* The Big5 charset is the second entry of the charset list.  */
5031   val = XCDR (charset_list);
5032   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5033   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5034
5035   while (charbuf < charbuf_end)
5036     {
5037       ASSURE_DESTINATION (safe_room);
5038       c = *charbuf++;
5039       /* Now encode the character C.  */
5040       if (ASCII_CHAR_P (c) && ascii_compatible)
5041         EMIT_ONE_ASCII_BYTE (c);
5042       else if (CHAR_BYTE8_P (c))
5043         {
               /* Raw eight-bit character: emit the underlying byte.  */
5044           c = CHAR_TO_BYTE8 (c);
5045           EMIT_ONE_BYTE (c);
5046         }
5047       else
5048         {
5049           unsigned code;
5050           struct charset *charset;
5051           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5052                                &code, charset);
5053
               /* CHARSET is null when the lookup found no charset for C.
                  In safe-encoding mode substitute
                  CODING_INHIBIT_CHARACTER_SUBSTITUTION as an ASCII code;
                  otherwise look up CODING->default_char instead.  */
5054           if (! charset)
5055             {
5056               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5057                 {
5058                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5059                   charset = CHARSET_FROM_ID (charset_ascii);
5060                 }
5061               else
5062                 {
5063                   c = coding->default_char;
5064                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5065                                        charset_list, &code, charset);
5066                 }
5067             }
               /* An invalid code at this point is treated as a fatal
                  internal error.  */
5068           if (code == CHARSET_INVALID_CODE (charset))
5069             emacs_abort ();
5070           if (charset == charset_big5)
5071             {
5072               int c1, c2;
5073
                   /* CODE already holds the two BIG5 bytes: high byte
                      first, then low byte.  */
5074               c1 = code >> 8, c2 = code & 0xFF;
5075               EMIT_TWO_BYTES (c1, c2);
5076             }
5077           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5078             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5079         }
5080     }
5081   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5082   coding->produced_char += produced_chars;
5083   coding->produced = dst - coding->destination;
5084   return 0;
5085 }
5086
5087 \f
5088 /*** 9. CCL handlers ***/
5089
5090 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5091    Return true if a text is encoded in a coding system of which
5092    encoder/decoder are written in CCL program.  */
5093
5094 static bool
5095 detect_coding_ccl (struct coding_system *coding,
5096                    struct coding_detection_info *detect_info)
5097 {
       /* Check every source byte of CODING against the table of valid
          bytes of the CCL coding category.  CATEGORY_MASK_CCL is always
          added to DETECT_INFO->checked.  An invalid byte adds the mask to
          DETECT_INFO->rejected and returns 0; reaching no_more_source
          merges FOUND into DETECT_INFO->found and returns 1.  */
5098   const unsigned char *src = coding->source, *src_base;
5099   const unsigned char *src_end = coding->source + coding->src_bytes;
5100   bool multibytep = coding->src_multibyte;
5101   ptrdiff_t consumed_chars = 0;
5102   int found = 0;
5103   unsigned char *valids;
5104   ptrdiff_t head_ascii = coding->head_ascii;
5105   Lisp_Object attrs;
5106
5107   detect_info->checked |= CATEGORY_MASK_CCL;
5108
       /* The source pointers and HEAD_ASCII above came from the caller's
          CODING; from here on CODING refers to the CCL entry of
          coding_categories.  */
5109   coding = &coding_categories[coding_category_ccl];
5110   valids = CODING_CCL_VALIDS (coding);
5111   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII run only for an ASCII-compatible coding
          system.  */
5112   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5113     src += head_ascii;
5114
5115   while (1)
5116     {
5117       int c;
5118
5119       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source once SRC reaches SRC_END
              -- confirm against the macro.  */
5120       ONE_MORE_BYTE (c);
           /* A negative value or a byte whose VALIDS entry is zero
              rejects the category.  */
5121       if (c < 0 || ! valids[c])
5122         break;
           /* A VALIDS entry greater than 1 counts as positive
              evidence.  */
5123       if ((valids[c] > 1))
5124         found = CATEGORY_MASK_CCL;
5125     }
5126   detect_info->rejected |= CATEGORY_MASK_CCL;
5127   return 0;
5128
5129  no_more_source:
5130   detect_info->found |= found;
5131   return 1;
5132 }
5133
5134 static void
5135 decode_coding_ccl (struct coding_system *coding)
5136 {
       /* Decode CODING's source by feeding it, in chunks of at most 1024
          elements, to the CCL program in CODING->spec.ccl, which writes
          into CODING->charbuf.  The final CCL status is translated into a
          conversion result, and CODING->consumed_char, CODING->consumed
          and CODING->charbuf_used are updated.  */
5137   const unsigned char *src = coding->source + coding->consumed;
5138   const unsigned char *src_end = coding->source + coding->src_bytes;
5139   int *charbuf = coding->charbuf + coding->charbuf_used;
5140   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5141   ptrdiff_t consumed_chars = 0;
5142   bool multibytep = coding->src_multibyte;
5143   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5144   int source_charbuf[1024];
       /* One entry more than source_charbuf: entry I holds the byte
          offset from SRC of element I, and the entry after the last
          element holds the offset of the end of the chunk.  */
5145   int source_byteidx[1025];
5146   Lisp_Object attrs, charset_list;
5147
5148   CODING_GET_INFO (coding, attrs, charset_list);
5149
5150   while (1)
5151     {
5152       const unsigned char *p = src;
5153       ptrdiff_t offset;
5154       int i = 0;
5155
           /* Fill source_charbuf with the next chunk: whole characters
              for multibyte source, single bytes otherwise.  */
5156       if (multibytep)
5157         {
5158           while (i < 1024 && p < src_end)
5159             {
5160               source_byteidx[i] = p - src;
5161               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5162             }
5163           source_byteidx[i] = p - src;
5164         }
5165       else
5166         while (i < 1024 && p < src_end)
5167           source_charbuf[i++] = *p++;
5168
           /* Tell the CCL program this is the final chunk only when the
              chunk reaches the end of the last block.  */
5169       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5170         ccl->last_block = true;
5171       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5172       charset_map_loaded = 0;
5173       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5174                   charset_list);
           /* If a charset map was loaded during the call and
              coding_change_source reports a nonzero offset, shift the
              local source pointers by that offset.  */
5175       if (charset_map_loaded
5176           && (offset = coding_change_source (coding)))
5177         {
5178           p += offset;
5179           src += offset;
5180           src_end += offset;
5181         }
5182       charbuf += ccl->produced;
           /* Advance SRC by what the driver consumed; for multibyte
              source the element count is converted back to bytes through
              source_byteidx.  */
5183       if (multibytep)
5184         src += source_byteidx[ccl->consumed];
5185       else
5186         src += ccl->consumed;
5187       consumed_chars += ccl->consumed;
           /* Stop once the whole source has been handed to the driver,
              or the driver stopped for a reason other than running out
              of source.  */
5188       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5189         break;
5190     }
5191
       /* Translate the CCL status into a conversion result.  */
5192   switch (ccl->status)
5193     {
5194     case CCL_STAT_SUSPEND_BY_SRC:
5195       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5196       break;
5197     case CCL_STAT_SUSPEND_BY_DST:
5198       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5199       break;
5200     case CCL_STAT_QUIT:
5201     case CCL_STAT_INVALID_CMD:
5202       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5203       break;
5204     default:
5205       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5206       break;
5207     }
5208   coding->consumed_char += consumed_chars;
5209   coding->consumed = src - coding->source;
5210   coding->charbuf_used = charbuf - coding->charbuf;
5211 }
5212
5213 static bool
5214 encode_coding_ccl (struct coding_system *coding)
5215 {
       /* Encode the character codes in CODING->charbuf by running the CCL
          program in CODING->spec.ccl and copying the low byte of each
          element it produces to CODING->destination.  The final CCL
          status is translated into a conversion result,
          CODING->produced_char and CODING->produced are updated, and 0 is
          returned.
          NOTE(review): ASSURE_DESTINATION and EMIT_ONE_BYTE are defined
          elsewhere; they presumably maintain DST and PRODUCED_CHARS and
          may leave the function early when the destination is too small
          -- confirm against the macros.  */
5216   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5217   bool multibytep = coding->dst_multibyte;
5218   int *charbuf = coding->charbuf;
5219   int *charbuf_end = charbuf + coding->charbuf_used;
5220   unsigned char *dst = coding->destination + coding->produced;
5221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5222   int destination_charbuf[1024];
5223   ptrdiff_t produced_chars = 0;
5224   int i;
5225   Lisp_Object attrs, charset_list;
5226
5227   CODING_GET_INFO (coding, attrs, charset_list);
       /* Mark the final chunk only when every source character has been
          consumed and this is the last block.  */
5228   if (coding->consumed_char == coding->src_chars
5229       && coding->mode & CODING_MODE_LAST_BLOCK)
5230     ccl->last_block = true;
5231
5232   do
5233     {
5234       ptrdiff_t offset;
5235
5236       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5237       charset_map_loaded = 0;
5238       ccl_driver (ccl, charbuf, destination_charbuf,
5239                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded during the call and
              coding_change_destination reports a nonzero offset, shift
              DST by that offset.  */
5240       if (charset_map_loaded
5241           && (offset = coding_change_destination (coding)))
5242         dst += offset;
5243       if (multibytep)
5244         {
               /* Reserve two destination bytes per produced element.  */
5245           ASSURE_DESTINATION (ccl->produced * 2);
5246           for (i = 0; i < ccl->produced; i++)
5247             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5248         }
5249       else
5250         {
               /* Unibyte destination: one byte per produced element,
                  counted directly in PRODUCED_CHARS.  */
5251           ASSURE_DESTINATION (ccl->produced);
5252           for (i = 0; i < ccl->produced; i++)
5253             *dst++ = destination_charbuf[i] & 0xFF;
5254           produced_chars += ccl->produced;
5255         }
5256       charbuf += ccl->consumed;
           /* A quit or an invalid command ends the loop even if input
              remains.  */
5257       if (ccl->status == CCL_STAT_QUIT
5258           || ccl->status == CCL_STAT_INVALID_CMD)
5259         break;
5260     }
5261   while (charbuf < charbuf_end);
5262
       /* Translate the CCL status into a conversion result.  */
5263   switch (ccl->status)
5264     {
5265     case CCL_STAT_SUSPEND_BY_SRC:
5266       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5267       break;
5268     case CCL_STAT_SUSPEND_BY_DST:
5269       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5270       break;
5271     case CCL_STAT_QUIT:
5272     case CCL_STAT_INVALID_CMD:
5273       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5274       break;
5275     default:
5276       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5277       break;
5278     }
5279
5280   coding->produced_char += produced_chars;
5281   coding->produced = dst - coding->destination;
5282   return 0;
5283 }
5284
5285 \f
5286 /*** 10, 11. no-conversion handlers ***/
5287
5288 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5289
5290 static void
5291 decode_coding_raw_text (struct coding_system *coding)
5292 {
       /* "Decode" raw text: no byte is converted here.  Set
          CODING->chars_at_source and report the whole source as consumed.
          The one exception is DOS EOL conversion with a source ending in
          CR: that CR is left unconsumed and
          CODING_RESULT_INSUFFICIENT_SRC is recorded, presumably so that a
          CR LF pair split across blocks can be handled together later.  */
5293   bool eol_dos
5294     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5295
5296   coding->chars_at_source = 1;
5297   coding->consumed_char = coding->src_chars;
5298   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source, source[src_bytes - 1] would read before the start
          of the source.  */
5299   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5300     {
5301       coding->consumed_char--;
5302       coding->consumed--;
5303       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5304     }
5305   else
5306     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5307 }
5308
5309 static bool
5310 encode_coding_raw_text (struct coding_system *coding)
5311 {
       /* Copy the contents of CODING->charbuf to CODING->destination
          without charset conversion.  Four cases are handled, according
          to whether the destination and the source are multibyte.  When
          the copy runs to completion, record CODING_RESULT_SUCCESS,
          update CODING->produced_char and CODING->produced, and return 0.
          NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
          defined elsewhere; they presumably maintain DST and
          PRODUCED_CHARS and may leave the function early when the
          destination is too small -- confirm against the macros.  */
5312   bool multibytep = coding->dst_multibyte;
5313   int *charbuf = coding->charbuf;
5314   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5315   unsigned char *dst = coding->destination + coding->produced;
5316   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5317   ptrdiff_t produced_chars = 0;
5318   int c;
5319
5320   if (multibytep)
5321     {
5322       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5323
           /* Multibyte destination, multibyte source.  */
5324       if (coding->src_multibyte)
5325         while (charbuf < charbuf_end)
5326           {
5327             ASSURE_DESTINATION (safe_room);
5328             c = *charbuf++;
5329             if (ASCII_CHAR_P (c))
5330               EMIT_ONE_ASCII_BYTE (c);
5331             else if (CHAR_BYTE8_P (c))
5332               {
5333                 c = CHAR_TO_BYTE8 (c);
5334                 EMIT_ONE_BYTE (c);
5335               }
5336             else
5337               {
                     /* Write C's multibyte form into STR, then emit each
                        of its bytes individually.  */
5338                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5339
5340                 CHAR_STRING_ADVANCE (c, p1);
5341                 do
5342                   {
5343                     EMIT_ONE_BYTE (*p0);
5344                     p0++;
5345                   }
5346                 while (p0 < p1);
5347               }
5348           }
5349       else
             /* Multibyte destination, unibyte source: every element is
                emitted as one byte.  */
5350         while (charbuf < charbuf_end)
5351           {
5352             ASSURE_DESTINATION (safe_room);
5353             c = *charbuf++;
5354             EMIT_ONE_BYTE (c);
5355           }
5356     }
5357   else
5358     {
           /* Unibyte destination, multibyte source: bytes are stored
              directly through DST.  */
5359       if (coding->src_multibyte)
5360         {
5361           int safe_room = MAX_MULTIBYTE_LENGTH;
5362
5363           while (charbuf < charbuf_end)
5364             {
5365               ASSURE_DESTINATION (safe_room);
5366               c = *charbuf++;
5367               if (ASCII_CHAR_P (c))
5368                 *dst++ = c;
5369               else if (CHAR_BYTE8_P (c))
5370                 *dst++ = CHAR_TO_BYTE8 (c);
5371               else
5372                 CHAR_STRING_ADVANCE (c, dst);
5373             }
5374         }
5375       else
5376         {
               /* Unibyte destination, unibyte source: room for the whole
                  buffer is requested once, then elements are copied one
                  per byte.  */
5377           ASSURE_DESTINATION (charbuf_end - charbuf);
5378           while (charbuf < charbuf_end && dst < dst_end)
5379             *dst++ = *charbuf++;
5380         }
           /* For a unibyte destination the character count equals the
              number of bytes written by this call.  */
5381       produced_chars = dst - (coding->destination + coding->produced);
5382     }
5383   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5384   coding->produced_char += produced_chars;
5385   coding->produced = dst - coding->destination;
5386   return 0;
5387 }
5388
5389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5390    Return true if a text is encoded in a charset-based coding system.  */
5391
5392 static bool
5393 detect_coding_charset (struct coding_system *coding,
5394                        struct coding_detection_info *detect_info)
5395 {
       /* Check the source bytes of CODING against the valid-byte table of
          the charset coding category.  CATEGORY_MASK_CHARSET is always
          added to DETECT_INFO->checked.  An invalid or truncated sequence
          adds the mask to DETECT_INFO->rejected and returns 0; reaching
          no_more_source merges FOUND into DETECT_INFO->found and
          returns 1.  */
5396   const unsigned char *src = coding->source, *src_base;
5397   const unsigned char *src_end = coding->source + coding->src_bytes;
5398   bool multibytep = coding->src_multibyte;
5399   ptrdiff_t consumed_chars = 0;
5400   Lisp_Object attrs, valids, name;
5401   int found = 0;
5402   ptrdiff_t head_ascii = coding->head_ascii;
5403   bool check_latin_extra = 0;
5404
5405   detect_info->checked |= CATEGORY_MASK_CHARSET;
5406
       /* The source pointers and HEAD_ASCII above came from the caller's
          CODING; from here on CODING refers to the charset entry of
          coding_categories.  */
5407   coding = &coding_categories[coding_category_charset];
5408   attrs = CODING_ID_ATTRS (coding->id);
5409   valids = AREF (attrs, coding_attr_charset_valids);
5410   name = CODING_ID_NAME (coding->id);
       /* Coding systems whose name starts with "iso-8859-" or
          "iso-latin-" get the extra check on bytes 0x80..0x9F below.  */
5411   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5412                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5413       || strncmp (SSDATA (SYMBOL_NAME (name)),
5414                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5415     check_latin_extra = 1;
5416
       /* Skip the leading ASCII run only for an ASCII-compatible coding
          system.  */
5417   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5418     src += head_ascii;
5419
5420   while (1)
5421     {
5422       int c;
5423       Lisp_Object val;
5424       struct charset *charset;
5425       int dim, idx;
5426
5427       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
              presumably jumps to no_more_source once SRC reaches SRC_END
              -- confirm against the macro.  */
5428       ONE_MORE_BYTE (c);
           /* Negative values are skipped without being checked.  */
5429       if (c < 0)
5430         continue;
           /* VAL is nil for a byte that cannot start a character, an
              integer charset id, or a list of candidate charset ids.  */
5431       val = AREF (valids, c);
5432       if (NILP (val))
5433         break;
5434       if (c >= 0x80)
5435         {
               /* For the Latin coding systems flagged above, a byte in
                  0x80..0x9F is accepted only if Vlatin_extra_code_table
                  is a vector with a non-nil entry for it.  */
5436           if (c < 0xA0
5437               && check_latin_extra
5438               && (!VECTORP (Vlatin_extra_code_table)
5439                   || NILP (AREF (Vlatin_extra_code_table, c))))
5440             break;
5441           found = CATEGORY_MASK_CHARSET;
5442         }
5443       if (INTEGERP (val))
5444         {
               /* Single candidate charset: each remaining byte of the
                  character must fall inside the charset's code-space
                  range for its position.  */
5445           charset = CHARSET_FROM_ID (XFASTINT (val));
5446           dim = CHARSET_DIMENSION (charset);
5447           for (idx = 1; idx < dim; idx++)
5448             {
5449               if (src == src_end)
5450                 goto too_short;
5451               ONE_MORE_BYTE (c);
5452               if (c < charset->code_space[(dim - 1 - idx) * 4]
5453                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5454                 break;
5455             }
               /* The inner loop ended early: a byte was out of range.  */
5456           if (idx < dim)
5457             break;
5458         }
5459       else
5460         {
               /* Several candidate charsets: try each in turn until one
                  accepts all remaining bytes.
                  NOTE(review): IDX is not reset between candidates, so
                  bytes already matched for an earlier candidate are not
                  re-read or re-checked for a later one -- confirm that
                  this is intended.  */
5461           idx = 1;
5462           for (; CONSP (val); val = XCDR (val))
5463             {
5464               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5465               dim = CHARSET_DIMENSION (charset);
5466               while (idx < dim)
5467                 {
5468                   if (src == src_end)
5469                     goto too_short;
5470                   ONE_MORE_BYTE (c);
5471                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5472                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5473                     break;
5474                   idx++;
5475                 }
                   /* This candidate matched; VAL is set to nil to signal
                      success to the test after the loop.  */
5476               if (idx == dim)
5477                 {
5478                   val = Qnil;
5479                   break;
5480                 }
5481             }
               /* VAL is still a cons: a candidate failed on a byte
                  without being marked as matched.  */
5482           if (CONSP (val))
5483             break;
5484         }
5485     }
       /* Reached by `goto too_short' and by every `break' out of the
          main loop.  */
5486  too_short:
5487   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5488   return 0;
5489
5490  no_more_source:
5491   detect_info->found |= found;
5492   return 1;
5493 }
5494
/* Decode the source bytes of CODING as text in a charset-based coding
   system and append the decoded characters to CODING->charbuf.

   Each leading byte is looked up in the `coding_attr_charset_valids'
   attribute.  The entry is either a single charset ID or a list of
   candidate charset IDs; as many further bytes as the charset's
   dimension requires are read and the resulting code is decoded with
   CODING_DECODE_CHAR.  A sequence that cannot be decoded is rewound
   and emitted one byte at a time (see `invalid_code').

   On exit, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.

   NOTE(review): no goto to `no_more_source' is visible in this
   function, and `multibytep' is not referenced directly; presumably
   ONE_MORE_BYTE uses both -- confirm against the macro definition.  */
static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the byte sequence handled by the current iteration; used
     to rewind on failure and to report how much source was consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  /* Table indexed by the leading source byte (see AREF below).  */
  Lisp_Object valids;
  /* CHAR_OFFSET counts characters stored so far; LAST_ID and
     LAST_OFFSET remember the charset of the current run of characters
     and where that run started, for ADD_CHARSET_DATA.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when EOL_DOS is set; -1 means that no
     byte is pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The output buffer is full.  If a look-ahead byte is still
             pending, back up so that it is not reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS end-of-lines, also fetch the byte after a CR in
             the same iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      /* An entry that is neither an integer nor a cons means that C
         cannot start a valid sequence.  */
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* Exactly one candidate charset: collect the remaining
             bytes of the code, most significant byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE carry over between candidates, so only
                 the extra bytes needed by a larger dimension are
                 read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this candidate decoded the
                 code; otherwise try the next one.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the previous non-ASCII run
             and start a new one.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the failed sequence and emit only its
         first byte: a negative C is stored negated, an ASCII byte as
         is, and any other byte via BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
    }

 no_more_source:
  /* Close the last non-ASCII run, then publish the progress made.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5621
5622 static bool
5623 encode_coding_charset (struct coding_system *coding)
5624 {
5625   bool multibytep = coding->dst_multibyte;
5626   int *charbuf = coding->charbuf;
5627   int *charbuf_end = charbuf + coding->charbuf_used;
5628   unsigned char *dst = coding->destination + coding->produced;
5629   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5630   int safe_room = MAX_MULTIBYTE_LENGTH;
5631   ptrdiff_t produced_chars = 0;
5632   Lisp_Object attrs, charset_list;
5633   bool ascii_compatible;
5634   int c;
5635
5636   CODING_GET_INFO (coding, attrs, charset_list);
5637   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5638
5639   while (charbuf < charbuf_end)
5640     {
5641       struct charset *charset;
5642       unsigned code;
5643
5644       ASSURE_DESTINATION (safe_room);
5645       c = *charbuf++;
5646       if (ascii_compatible && ASCII_CHAR_P (c))
5647         EMIT_ONE_ASCII_BYTE (c);
5648       else if (CHAR_BYTE8_P (c))
5649         {
5650           c = CHAR_TO_BYTE8 (c);
5651           EMIT_ONE_BYTE (c);
5652         }
5653       else
5654         {
5655           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5656                                &code, charset);
5657
5658           if (charset)
5659             {
5660               if (CHARSET_DIMENSION (charset) == 1)
5661                 EMIT_ONE_BYTE (code);
5662               else if (CHARSET_DIMENSION (charset) == 2)
5663                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5664               else if (CHARSET_DIMENSION (charset) == 3)
5665                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5666               else
5667                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5668                                  (code >> 8) & 0xFF, code & 0xFF);
5669             }
5670           else
5671             {
5672               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5673                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5674               else
5675                 c = coding->default_char;
5676               EMIT_ONE_BYTE (c);
5677             }
5678         }
5679     }
5680
5681   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5682   coding->produced_char += produced_chars;
5683   coding->produced = dst - coding->destination;
5684   return 0;
5685 }
5686
5687 \f
/*** 10. C library functions ***/
5689
5690 /* Setup coding context CODING from information about CODING_SYSTEM.
5691    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5692    CODING_SYSTEM is invalid, signal an error.  */
5693
5694 void
5695 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5696 {
5697   Lisp_Object attrs;
5698   Lisp_Object eol_type;
5699   Lisp_Object coding_type;
5700   Lisp_Object val;
5701
5702   if (NILP (coding_system))
5703     coding_system = Qundecided;
5704
5705   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5706
5707   attrs = CODING_ID_ATTRS (coding->id);
5708   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5709
5710   coding->mode = 0;
5711   if (VECTORP (eol_type))
5712     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5713                             | CODING_REQUIRE_DETECTION_MASK);
5714   else if (! EQ (eol_type, Qunix))
5715     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5716                             | CODING_REQUIRE_ENCODING_MASK);
5717   else
5718     coding->common_flags = 0;
5719   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5720     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5721   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5723   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5724     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5725
5726   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5727   coding->max_charset_id = SCHARS (val) - 1;
5728   coding->safe_charsets = SDATA (val);
5729   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5730   coding->carryover_bytes = 0;
5731   coding->raw_destination = 0;
5732
5733   coding_type = CODING_ATTR_TYPE (attrs);
5734   if (EQ (coding_type, Qundecided))
5735     {
5736       coding->detector = NULL;
5737       coding->decoder = decode_coding_raw_text;
5738       coding->encoder = encode_coding_raw_text;
5739       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5740       coding->spec.undecided.inhibit_nbd
5741         = (encode_inhibit_flag
5742            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5743       coding->spec.undecided.inhibit_ied
5744         = (encode_inhibit_flag
5745            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5746       coding->spec.undecided.prefer_utf_8
5747         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5748     }
5749   else if (EQ (coding_type, Qiso_2022))
5750     {
5751       int i;
5752       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5753
5754       /* Invoke graphic register 0 to plane 0.  */
5755       CODING_ISO_INVOCATION (coding, 0) = 0;
5756       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5757       CODING_ISO_INVOCATION (coding, 1)
5758         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5759       /* Setup the initial status of designation.  */
5760       for (i = 0; i < 4; i++)
5761         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5762       /* Not single shifting initially.  */
5763       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5764       /* Beginning of buffer should also be regarded as bol. */
5765       CODING_ISO_BOL (coding) = 1;
5766       coding->detector = detect_coding_iso_2022;
5767       coding->decoder = decode_coding_iso_2022;
5768       coding->encoder = encode_coding_iso_2022;
5769       if (flags & CODING_ISO_FLAG_SAFE)
5770         coding->mode |= CODING_MODE_SAFE_ENCODING;
5771       coding->common_flags
5772         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5773             | CODING_REQUIRE_FLUSHING_MASK);
5774       if (flags & CODING_ISO_FLAG_COMPOSITION)
5775         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5776       if (flags & CODING_ISO_FLAG_DESIGNATION)
5777         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5778       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5779         {
5780           setup_iso_safe_charsets (attrs);
5781           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5782           coding->max_charset_id = SCHARS (val) - 1;
5783           coding->safe_charsets = SDATA (val);
5784         }
5785       CODING_ISO_FLAGS (coding) = flags;
5786       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5787       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5788       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5789       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5790     }
5791   else if (EQ (coding_type, Qcharset))
5792     {
5793       coding->detector = detect_coding_charset;
5794       coding->decoder = decode_coding_charset;
5795       coding->encoder = encode_coding_charset;
5796       coding->common_flags
5797         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5798     }
5799   else if (EQ (coding_type, Qutf_8))
5800     {
5801       val = AREF (attrs, coding_attr_utf_bom);
5802       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5803                                    : EQ (val, Qt) ? utf_with_bom
5804                                    : utf_without_bom);
5805       coding->detector = detect_coding_utf_8;
5806       coding->decoder = decode_coding_utf_8;
5807       coding->encoder = encode_coding_utf_8;
5808       coding->common_flags
5809         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5810       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5811         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5812     }
5813   else if (EQ (coding_type, Qutf_16))
5814     {
5815       val = AREF (attrs, coding_attr_utf_bom);
5816       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5817                                     : EQ (val, Qt) ? utf_with_bom
5818                                     : utf_without_bom);
5819       val = AREF (attrs, coding_attr_utf_16_endian);
5820       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5821                                        : utf_16_little_endian);
5822       CODING_UTF_16_SURROGATE (coding) = 0;
5823       coding->detector = detect_coding_utf_16;
5824       coding->decoder = decode_coding_utf_16;
5825       coding->encoder = encode_coding_utf_16;
5826       coding->common_flags
5827         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5828       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5829         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5830     }
5831   else if (EQ (coding_type, Qccl))
5832     {
5833       coding->detector = detect_coding_ccl;
5834       coding->decoder = decode_coding_ccl;
5835       coding->encoder = encode_coding_ccl;
5836       coding->common_flags
5837         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5838             | CODING_REQUIRE_FLUSHING_MASK);
5839     }
5840   else if (EQ (coding_type, Qemacs_mule))
5841     {
5842       coding->detector = detect_coding_emacs_mule;
5843       coding->decoder = decode_coding_emacs_mule;
5844       coding->encoder = encode_coding_emacs_mule;
5845       coding->common_flags
5846         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5847       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5848           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5849         {
5850           Lisp_Object tail, safe_charsets;
5851           int max_charset_id = 0;
5852
5853           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5854                tail = XCDR (tail))
5855             if (max_charset_id < XFASTINT (XCAR (tail)))
5856               max_charset_id = XFASTINT (XCAR (tail));
5857           safe_charsets = make_uninit_string (max_charset_id + 1);
5858           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5859           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5860                tail = XCDR (tail))
5861             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5862           coding->max_charset_id = max_charset_id;
5863           coding->safe_charsets = SDATA (safe_charsets);
5864         }
5865       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5866       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5867     }
5868   else if (EQ (coding_type, Qshift_jis))
5869     {
5870       coding->detector = detect_coding_sjis;
5871       coding->decoder = decode_coding_sjis;
5872       coding->encoder = encode_coding_sjis;
5873       coding->common_flags
5874         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5875     }
5876   else if (EQ (coding_type, Qbig5))
5877     {
5878       coding->detector = detect_coding_big5;
5879       coding->decoder = decode_coding_big5;
5880       coding->encoder = encode_coding_big5;
5881       coding->common_flags
5882         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5883     }
5884   else                          /* EQ (coding_type, Qraw_text) */
5885     {
5886       coding->detector = NULL;
5887       coding->decoder = decode_coding_raw_text;
5888       coding->encoder = encode_coding_raw_text;
5889       if (! EQ (eol_type, Qunix))
5890         {
5891           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5892           if (! VECTORP (eol_type))
5893             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5894         }
5895
5896     }
5897
5898   return;
5899 }
5900
5901 /* Return a list of charsets supported by CODING.  */
5902
5903 Lisp_Object
5904 coding_charset_list (struct coding_system *coding)
5905 {
5906   Lisp_Object attrs, charset_list;
5907
5908   CODING_GET_INFO (coding, attrs, charset_list);
5909   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5910     {
5911       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5912
5913       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5914         charset_list = Viso_2022_charset_list;
5915     }
5916   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5917     {
5918       charset_list = Vemacs_mule_charset_list;
5919     }
5920   return charset_list;
5921 }
5922
5923
5924 /* Return a list of charsets supported by CODING-SYSTEM.  */
5925
5926 Lisp_Object
5927 coding_system_charset_list (Lisp_Object coding_system)
5928 {
5929   ptrdiff_t id;
5930   Lisp_Object attrs, charset_list;
5931
5932   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5933   attrs = CODING_ID_ATTRS (id);
5934
5935   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5936     {
5937       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5938
5939       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5940         charset_list = Viso_2022_charset_list;
5941       else
5942         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5943     }
5944   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5945     {
5946       charset_list = Vemacs_mule_charset_list;
5947     }
5948   else
5949     {
5950       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5951     }
5952   return charset_list;
5953 }
5954
5955
5956 /* Return raw-text or one of its subsidiaries that has the same
5957    eol_type as CODING-SYSTEM.  */
5958
5959 Lisp_Object
5960 raw_text_coding_system (Lisp_Object coding_system)
5961 {
5962   Lisp_Object spec, attrs;
5963   Lisp_Object eol_type, raw_text_eol_type;
5964
5965   if (NILP (coding_system))
5966     return Qraw_text;
5967   spec = CODING_SYSTEM_SPEC (coding_system);
5968   attrs = AREF (spec, 0);
5969
5970   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5971     return coding_system;
5972
5973   eol_type = AREF (spec, 2);
5974   if (VECTORP (eol_type))
5975     return Qraw_text;
5976   spec = CODING_SYSTEM_SPEC (Qraw_text);
5977   raw_text_eol_type = AREF (spec, 2);
5978   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5979           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5980           : AREF (raw_text_eol_type, 2));
5981 }
5982
5983 /* Return true if CODING corresponds to raw-text coding-system.  */
5984
5985 bool
5986 raw_text_coding_system_p (struct coding_system *coding)
5987 {
5988   return (coding->decoder == decode_coding_raw_text
5989           && coding->encoder == encode_coding_raw_text) ? true : false;
5990 }
5991
5992
5993 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5994    the subsidiary that has the same eol-spec as PARENT (if it is not
5995    nil and specifies end-of-line format) or the system's setting
5996    (system_eol_type).  */
5997
5998 Lisp_Object
5999 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6000 {
6001   Lisp_Object spec, eol_type;
6002
6003   if (NILP (coding_system))
6004     coding_system = Qraw_text;
6005   else
6006     CHECK_CODING_SYSTEM (coding_system);
6007   spec = CODING_SYSTEM_SPEC (coding_system);
6008   eol_type = AREF (spec, 2);
6009   if (VECTORP (eol_type))
6010     {
6011       Lisp_Object parent_eol_type;
6012
6013       if (! NILP (parent))
6014         {
6015           Lisp_Object parent_spec;
6016
6017           CHECK_CODING_SYSTEM (parent);
6018           parent_spec = CODING_SYSTEM_SPEC (parent);
6019           parent_eol_type = AREF (parent_spec, 2);
6020           if (VECTORP (parent_eol_type))
6021             parent_eol_type = system_eol_type;
6022         }
6023       else
6024         parent_eol_type = system_eol_type;
6025       if (EQ (parent_eol_type, Qunix))
6026         coding_system = AREF (eol_type, 0);
6027       else if (EQ (parent_eol_type, Qdos))
6028         coding_system = AREF (eol_type, 1);
6029       else if (EQ (parent_eol_type, Qmac))
6030         coding_system = AREF (eol_type, 2);
6031     }
6032   return coding_system;
6033 }
6034
6035
6036 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6037    decided for writing to a process.  If not, complement them, and
6038    return a new coding system.  */
6039
6040 Lisp_Object
6041 complement_process_encoding_system (Lisp_Object coding_system)
6042 {
6043   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6044   Lisp_Object spec, attrs;
6045   int i;
6046
6047   for (i = 0; i < 3; i++)
6048     {
6049       if (i == 1)
6050         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6051       else if (i == 2)
6052         coding_system = preferred_coding_system ();
6053       spec = CODING_SYSTEM_SPEC (coding_system);
6054       if (NILP (spec))
6055         continue;
6056       attrs = AREF (spec, 0);
6057       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6058         coding_base = CODING_ATTR_BASE_NAME (attrs);
6059       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6060         eol_base = coding_system;
6061       if (! NILP (coding_base) && ! NILP (eol_base))
6062         break;
6063     }
6064
6065   if (i > 0)
6066     /* The original CODING_SYSTEM didn't specify text-conversion or
6067        eol-conversion.  Be sure that we return a fully complemented
6068        coding system.  */
6069     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6070   return coding_system;
6071 }
6072
6073
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So coding systems are
   first grouped into the following categories:
6079
6080    o coding-category-emacs-mule
6081
6082         The category for a coding system which has the same code range
6083         as Emacs' internal format.  Assigned the coding-system (Lisp
6084         symbol) `emacs-mule' by default.
6085
6086    o coding-category-sjis
6087
6088         The category for a coding system which has the same code range
6089         as SJIS.  Assigned the coding-system (Lisp
6090         symbol) `japanese-shift-jis' by default.
6091
6092    o coding-category-iso-7
6093
6094         The category for a coding system which has the same code range
6095         as ISO2022 of 7-bit environment.  This doesn't use any locking
6096         shift and single shift functions.  This can encode/decode all
6097         charsets.  Assigned the coding-system (Lisp symbol)
6098         `iso-2022-7bit' by default.
6099
6100    o coding-category-iso-7-tight
6101
6102         Same as coding-category-iso-7 except that this can
6103         encode/decode only the specified charsets.
6104
6105    o coding-category-iso-8-1
6106
6107         The category for a coding system which has the same code range
6108         as ISO2022 of 8-bit environment and graphic plane 1 used only
6109         for DIMENSION1 charset.  This doesn't use any locking shift
6110         and single shift functions.  Assigned the coding-system (Lisp
6111         symbol) `iso-latin-1' by default.
6112
6113    o coding-category-iso-8-2
6114
6115         The category for a coding system which has the same code range
6116         as ISO2022 of 8-bit environment and graphic plane 1 used only
6117         for DIMENSION2 charset.  This doesn't use any locking shift
6118         and single shift functions.  Assigned the coding-system (Lisp
6119         symbol) `japanese-iso-8bit' by default.
6120
6121    o coding-category-iso-7-else
6122
6123         The category for a coding system which has the same code range
6124         as ISO2022 of 7-bit environment but uses locking shift or
6125         single shift functions.  Assigned the coding-system (Lisp
6126         symbol) `iso-2022-7bit-lock' by default.
6127
6128    o coding-category-iso-8-else
6129
6130         The category for a coding system which has the same code range
6131         as ISO2022 of 8-bit environment but uses locking shift or
6132         single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-2022-8bit-ss2' by default.
6134
6135    o coding-category-big5
6136
6137         The category for a coding system which has the same code range
6138         as BIG5.  Assigned the coding-system (Lisp symbol)
6139         `cn-big5' by default.
6140
6141    o coding-category-utf-8
6142
6143         The category for a coding system which has the same code range
6144         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6145         symbol) `utf-8' by default.
6146
6147    o coding-category-utf-16-be
6148
6149         The category for a coding system in which a text has an
6150         Unicode signature (cf. Unicode Standard) in the order of BIG
6151         endian at the head.  Assigned the coding-system (Lisp symbol)
6152         `utf-16-be' by default.
6153
6154    o coding-category-utf-16-le
6155
6156         The category for a coding system in which a text has an
6157         Unicode signature (cf. Unicode Standard) in the order of
6158         LITTLE endian at the head.  Assigned the coding-system (Lisp
6159         symbol) `utf-16-le' by default.
6160
6161    o coding-category-ccl
6162
6163         The category for a coding system of which encoder/decoder is
6164         written in CCL programs.  The default value is nil, i.e., no
6165         coding system is assigned.
6166
6167    o coding-category-binary
6168
6169         The category for a coding system not categorized in any of the
6170         above.  Assigned the coding-system (Lisp symbol)
6171         `no-conversion' by default.
6172
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually detects is the category of a coding system; it then
   uses the `coding-system' assigned to that category.  If Emacs
   cannot narrow the choice down to a single category, it selects the
   category of the highest priority.  Priorities of categories are
   also specified by a user in the Lisp variable `coding-category-list'.
6180
6181 */
6182
6183 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6184                                            int eol_seen);
6185
6186
6187 /* Return the number of ASCII characters at the head of the source.
6188    By side effects, set coding->head_ascii and update
6189    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6190    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6191    reliable only when all the source bytes are ASCII.  */
6192
6193 static ptrdiff_t
6194 check_ascii (struct coding_system *coding)
6195 {
6196   const unsigned char *src, *end;
6197   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6198   int eol_seen = coding->eol_seen;
6199
6200   coding_set_source (coding);
6201   src = coding->source;
6202   end = src + coding->src_bytes;
6203
6204   if (inhibit_eol_conversion
6205       || SYMBOLP (eol_type))
6206     {
6207       /* We don't have to check EOL format.  */
6208       while (src < end && !( *src & 0x80))
6209         {
6210           if (*src++ == '\n')
6211             eol_seen |= EOL_SEEN_LF;
6212         }
6213     }
6214   else
6215     {
6216       end--;                /* We look ahead one byte for "CR LF".  */
6217       while (src < end)
6218         {
6219           int c = *src;
6220
6221           if (c & 0x80)
6222             break;
6223           src++;
6224           if (c == '\r')
6225             {
6226               if (*src == '\n')
6227                 {
6228                   eol_seen |= EOL_SEEN_CRLF;
6229                   src++;
6230                 }
6231               else
6232                 eol_seen |= EOL_SEEN_CR;
6233             }
6234           else if (c == '\n')
6235             eol_seen |= EOL_SEEN_LF;
6236         }
6237       if (src == end)
6238         {
6239           int c = *src;
6240
6241           /* All bytes but the last one C are ASCII.  */
6242           if (! (c & 0x80))
6243             {
6244               if (c == '\r')
6245                 eol_seen |= EOL_SEEN_CR;
6246               else if (c  == '\n')
6247                 eol_seen |= EOL_SEEN_LF;
6248               src++;
6249             }
6250         }
6251     }
6252   coding->head_ascii = src - coding->source;
6253   coding->eol_seen = eol_seen;
6254   return (coding->head_ascii);
6255 }
6256
6257
6258 /* Return the number of characters at the source if all the bytes are
6259    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6260    effects, update coding->eol_seen.  The value of coding->eol_seen is
6261    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6262    the value is reliable only when all the source bytes are valid
6263    UTF-8.  */
6264
6265 static ptrdiff_t
6266 check_utf_8 (struct coding_system *coding)
6267 {
6268   const unsigned char *src, *end;
6269   int eol_seen;
6270   ptrdiff_t nchars = coding->head_ascii;
6271
6272   if (coding->head_ascii < 0)
6273     check_ascii (coding);
6274   else
6275     coding_set_source (coding);
6276   src = coding->source + coding->head_ascii;
6277   /* We look ahead one byte for CR LF.  */
6278   end = coding->source + coding->src_bytes - 1;
6279   eol_seen = coding->eol_seen;
6280   while (src < end)
6281     {
6282       int c = *src;
6283
6284       if (UTF_8_1_OCTET_P (*src))
6285         {
6286           src++;
6287           if (c < 0x20)
6288             {
6289               if (c == '\r')
6290                 {
6291                   if (*src == '\n')
6292                     {
6293                       eol_seen |= EOL_SEEN_CRLF;
6294                       src++;
6295                       nchars++;
6296                     }
6297                   else
6298                     eol_seen |= EOL_SEEN_CR;
6299                 }
6300               else if (c == '\n')
6301                 eol_seen |= EOL_SEEN_LF;
6302             }
6303         }
6304       else if (UTF_8_2_OCTET_LEADING_P (c))
6305         {
6306           if (c < 0xC2          /* overlong sequence */
6307               || src + 1 >= end
6308               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6309             return -1;
6310           src += 2;
6311         }
6312       else if (UTF_8_3_OCTET_LEADING_P (c))
6313         {
6314           if (src + 2 >= end
6315               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6316                     && UTF_8_EXTRA_OCTET_P (src[2])))
6317             return -1;
6318           c = (((c & 0xF) << 12)
6319                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6320           if (c < 0x800                       /* overlong sequence */
6321               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6322             return -1;
6323           src += 3;
6324         }
6325       else if (UTF_8_4_OCTET_LEADING_P (c))
6326         {
6327           if (src + 3 >= end
6328               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6329                     && UTF_8_EXTRA_OCTET_P (src[2])
6330                     && UTF_8_EXTRA_OCTET_P (src[3])))
6331             return -1;
6332           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6333                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6334           if (c < 0x10000       /* overlong sequence */
6335               || c >= 0x110000) /* non-Unicode character  */
6336             return -1;
6337           src += 4;
6338         }
6339       else
6340         return -1;
6341       nchars++;
6342     }
6343
6344   if (src == end)
6345     {
6346       if (! UTF_8_1_OCTET_P (*src))
6347         return -1;
6348       nchars++;
6349       if (*src == '\r')
6350         eol_seen |= EOL_SEEN_CR;
6351       else if (*src  == '\n')
6352         eol_seen |= EOL_SEEN_LF;
6353     }
6354   coding->eol_seen = eol_seen;
6355   return nchars;
6356 }
6357
6358
6359 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6360    SOURCE is encoded.  If CATEGORY is one of
6361    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6362    two-byte, else they are encoded by one-byte.
6363
6364    Return one of EOL_SEEN_XXX.  */
6365
6366 #define MAX_EOL_CHECK_COUNT 3
6367
6368 static int
6369 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6370             enum coding_category category)
6371 {
6372   const unsigned char *src = source, *src_end = src + src_bytes;
6373   unsigned char c;
6374   int total  = 0;
6375   int eol_seen = EOL_SEEN_NONE;
6376
6377   if ((1 << category) & CATEGORY_MASK_UTF_16)
6378     {
6379       bool msb = category == (coding_category_utf_16_le
6380                               | coding_category_utf_16_le_nosig);
6381       bool lsb = !msb;
6382
6383       while (src + 1 < src_end)
6384         {
6385           c = src[lsb];
6386           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6387             {
6388               int this_eol;
6389
6390               if (c == '\n')
6391                 this_eol = EOL_SEEN_LF;
6392               else if (src + 3 >= src_end
6393                        || src[msb + 2] != 0
6394                        || src[lsb + 2] != '\n')
6395                 this_eol = EOL_SEEN_CR;
6396               else
6397                 {
6398                   this_eol = EOL_SEEN_CRLF;
6399                   src += 2;
6400                 }
6401
6402               if (eol_seen == EOL_SEEN_NONE)
6403                 /* This is the first end-of-line.  */
6404                 eol_seen = this_eol;
6405               else if (eol_seen != this_eol)
6406                 {
6407                   /* The found type is different from what found before.
6408                      Allow for stray ^M characters in DOS EOL files.  */
6409                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6410                       || (eol_seen == EOL_SEEN_CRLF
6411                           && this_eol == EOL_SEEN_CR))
6412                     eol_seen = EOL_SEEN_CRLF;
6413                   else
6414                     {
6415                       eol_seen = EOL_SEEN_LF;
6416                       break;
6417                     }
6418                 }
6419               if (++total == MAX_EOL_CHECK_COUNT)
6420                 break;
6421             }
6422           src += 2;
6423         }
6424     }
6425   else
6426     while (src < src_end)
6427       {
6428         c = *src++;
6429         if (c == '\n' || c == '\r')
6430           {
6431             int this_eol;
6432
6433             if (c == '\n')
6434               this_eol = EOL_SEEN_LF;
6435             else if (src >= src_end || *src != '\n')
6436               this_eol = EOL_SEEN_CR;
6437             else
6438               this_eol = EOL_SEEN_CRLF, src++;
6439
6440             if (eol_seen == EOL_SEEN_NONE)
6441               /* This is the first end-of-line.  */
6442               eol_seen = this_eol;
6443             else if (eol_seen != this_eol)
6444               {
6445                 /* The found type is different from what found before.
6446                    Allow for stray ^M characters in DOS EOL files.  */
6447                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6448                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6449                   eol_seen = EOL_SEEN_CRLF;
6450                 else
6451                   {
6452                     eol_seen = EOL_SEEN_LF;
6453                     break;
6454                   }
6455               }
6456             if (++total == MAX_EOL_CHECK_COUNT)
6457               break;
6458           }
6459       }
6460   return eol_seen;
6461 }
6462
6463
6464 static Lisp_Object
6465 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6466 {
6467   Lisp_Object eol_type;
6468
6469   eol_type = CODING_ID_EOL_TYPE (coding->id);
6470   if (! VECTORP (eol_type))
6471     /* Already adjusted.  */
6472     return eol_type;
6473   if (eol_seen & EOL_SEEN_LF)
6474     {
6475       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6476       eol_type = Qunix;
6477     }
6478   else if (eol_seen & EOL_SEEN_CRLF)
6479     {
6480       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6481       eol_type = Qdos;
6482     }
6483   else if (eol_seen & EOL_SEEN_CR)
6484     {
6485       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6486       eol_type = Qmac;
6487     }
6488   return eol_type;
6489 }
6490
6491 /* Detect how a text specified in CODING is encoded.  If a coding
6492    system is detected, update fields of CODING by the detected coding
6493    system.  */
6494
6495 static void
6496 detect_coding (struct coding_system *coding)
6497 {
6498   const unsigned char *src, *src_end;
6499   unsigned int saved_mode = coding->mode;
6500   Lisp_Object found = Qnil;
6501   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6502
6503   coding->consumed = coding->consumed_char = 0;
6504   coding->produced = coding->produced_char = 0;
6505   coding_set_source (coding);
6506
6507   src_end = coding->source + coding->src_bytes;
6508
6509   coding->eol_seen = EOL_SEEN_NONE;
6510   /* If we have not yet decided the text encoding type, detect it
6511      now.  */
6512   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6513     {
6514       int c, i;
6515       struct coding_detection_info detect_info;
6516       bool null_byte_found = 0, eight_bit_found = 0;
6517       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6518                                        inhibit_null_byte_detection);
6519       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6520                                        inhibit_iso_escape_detection);
6521       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6522
6523       coding->head_ascii = 0;
6524       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6525       for (src = coding->source; src < src_end; src++)
6526         {
6527           c = *src;
6528           if (c & 0x80)
6529             {
6530               eight_bit_found = 1;
6531               if (null_byte_found)
6532                 break;
6533             }
6534           else if (c < 0x20)
6535             {
6536               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6537                   && ! inhibit_ied
6538                   && ! detect_info.checked)
6539                 {
6540                   if (detect_coding_iso_2022 (coding, &detect_info))
6541                     {
6542                       /* We have scanned the whole data.  */
6543                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6544                         {
6545                           /* We didn't find an 8-bit code.  We may
6546                              have found a null-byte, but it's very
6547                              rare that a binary file conforms to
6548                              ISO-2022.  */
6549                           src = src_end;
6550                           coding->head_ascii = src - coding->source;
6551                         }
6552                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6553                       break;
6554                     }
6555                 }
6556               else if (! c && !inhibit_nbd)
6557                 {
6558                   null_byte_found = 1;
6559                   if (eight_bit_found)
6560                     break;
6561                 }
6562               else if (! disable_ascii_optimization
6563                        && ! inhibit_eol_conversion)
6564                 {
6565                   if (c == '\r')
6566                     {
6567                       if (src < src_end && src[1] == '\n')
6568                         {
6569                           coding->eol_seen |= EOL_SEEN_CRLF;
6570                           src++;
6571                           if (! eight_bit_found)
6572                             coding->head_ascii++;
6573                         }
6574                       else
6575                         coding->eol_seen |= EOL_SEEN_CR;
6576                     }
6577                   else if (c == '\n')
6578                     {
6579                       coding->eol_seen |= EOL_SEEN_LF;
6580                     }
6581                 }
6582
6583               if (! eight_bit_found)
6584                 coding->head_ascii++;
6585             }
6586           else if (! eight_bit_found)
6587             coding->head_ascii++;
6588         }
6589
6590       if (null_byte_found || eight_bit_found
6591           || coding->head_ascii < coding->src_bytes
6592           || detect_info.found)
6593         {
6594           enum coding_category category;
6595           struct coding_system *this;
6596
6597           if (coding->head_ascii == coding->src_bytes)
6598             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6599             for (i = 0; i < coding_category_raw_text; i++)
6600               {
6601                 category = coding_priorities[i];
6602                 this = coding_categories + category;
6603                 if (detect_info.found & (1 << category))
6604                   break;
6605               }
6606           else
6607             {
6608               if (null_byte_found)
6609                 {
6610                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6611                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6612                 }
6613               else if (prefer_utf_8
6614                        && detect_coding_utf_8 (coding, &detect_info))
6615                 {
6616                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6617                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6618                 }
6619               for (i = 0; i < coding_category_raw_text; i++)
6620                 {
6621                   category = coding_priorities[i];
6622                   this = coding_categories + category;
6623                   /* Some of this->detector (e.g. detect_coding_sjis)
6624                      require this information.  */
6625                   coding->id = this->id;
6626                   if (this->id < 0)
6627                     {
6628                       /* No coding system of this category is defined.  */
6629                       detect_info.rejected |= (1 << category);
6630                     }
6631                   else if (category >= coding_category_raw_text)
6632                     continue;
6633                   else if (detect_info.checked & (1 << category))
6634                     {
6635                       if (detect_info.found & (1 << category))
6636                         break;
6637                     }
6638                   else if ((*(this->detector)) (coding, &detect_info)
6639                            && detect_info.found & (1 << category))
6640                     break;
6641                 }
6642             }
6643
6644           if (i < coding_category_raw_text)
6645             {
6646               if (category == coding_category_utf_8_auto)
6647                 {
6648                   Lisp_Object coding_systems;
6649
6650                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6651                                          coding_attr_utf_bom);
6652                   if (CONSP (coding_systems))
6653                     {
6654                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6655                         found = XCAR (coding_systems);
6656                       else
6657                         found = XCDR (coding_systems);
6658                     }
6659                   else
6660                     found = CODING_ID_NAME (this->id);
6661                 }
6662               else if (category == coding_category_utf_16_auto)
6663                 {
6664                   Lisp_Object coding_systems;
6665
6666                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6667                                          coding_attr_utf_bom);
6668                   if (CONSP (coding_systems))
6669                     {
6670                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6671                         found = XCAR (coding_systems);
6672                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6673                         found = XCDR (coding_systems);
6674                     }
6675                   else
6676                     found = CODING_ID_NAME (this->id);
6677                 }
6678               else
6679                 found = CODING_ID_NAME (this->id);
6680             }
6681           else if (null_byte_found)
6682             found = Qno_conversion;
6683           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6684                    == CATEGORY_MASK_ANY)
6685             found = Qraw_text;
6686           else if (detect_info.rejected)
6687             for (i = 0; i < coding_category_raw_text; i++)
6688               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6689                 {
6690                   this = coding_categories + coding_priorities[i];
6691                   found = CODING_ID_NAME (this->id);
6692                   break;
6693                 }
6694         }
6695     }
6696   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6697            == coding_category_utf_8_auto)
6698     {
6699       Lisp_Object coding_systems;
6700       struct coding_detection_info detect_info;
6701
6702       coding_systems
6703         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6704       detect_info.found = detect_info.rejected = 0;
6705       if (check_ascii (coding) == coding->src_bytes)
6706         {
6707           if (CONSP (coding_systems))
6708             found = XCDR (coding_systems);
6709         }
6710       else
6711         {
6712           if (CONSP (coding_systems)
6713               && detect_coding_utf_8 (coding, &detect_info))
6714             {
6715               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6716                 found = XCAR (coding_systems);
6717               else
6718                 found = XCDR (coding_systems);
6719             }
6720         }
6721     }
6722   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6723            == coding_category_utf_16_auto)
6724     {
6725       Lisp_Object coding_systems;
6726       struct coding_detection_info detect_info;
6727
6728       coding_systems
6729         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6730       detect_info.found = detect_info.rejected = 0;
6731       coding->head_ascii = 0;
6732       if (CONSP (coding_systems)
6733           && detect_coding_utf_16 (coding, &detect_info))
6734         {
6735           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6736             found = XCAR (coding_systems);
6737           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6738             found = XCDR (coding_systems);
6739         }
6740     }
6741
6742   if (! NILP (found))
6743     {
6744       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6745                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6746                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6747                            : EOL_SEEN_LF);
6748
6749       setup_coding_system (found, coding);
6750       if (specified_eol != EOL_SEEN_NONE)
6751         adjust_coding_eol_type (coding, specified_eol);
6752     }
6753
6754   coding->mode = saved_mode;
6755 }
6756
6757
6758 static void
6759 decode_eol (struct coding_system *coding)
6760 {
6761   Lisp_Object eol_type;
6762   unsigned char *p, *pbeg, *pend;
6763
6764   eol_type = CODING_ID_EOL_TYPE (coding->id);
6765   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6766     return;
6767
6768   if (NILP (coding->dst_object))
6769     pbeg = coding->destination;
6770   else
6771     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6772   pend = pbeg + coding->produced;
6773
6774   if (VECTORP (eol_type))
6775     {
6776       int eol_seen = EOL_SEEN_NONE;
6777
6778       for (p = pbeg; p < pend; p++)
6779         {
6780           if (*p == '\n')
6781             eol_seen |= EOL_SEEN_LF;
6782           else if (*p == '\r')
6783             {
6784               if (p + 1 < pend && *(p + 1) == '\n')
6785                 {
6786                   eol_seen |= EOL_SEEN_CRLF;
6787                   p++;
6788                 }
6789               else
6790                 eol_seen |= EOL_SEEN_CR;
6791             }
6792         }
6793       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6794       if ((eol_seen & EOL_SEEN_CRLF) != 0
6795           && (eol_seen & EOL_SEEN_CR) != 0
6796           && (eol_seen & EOL_SEEN_LF) == 0)
6797         eol_seen = EOL_SEEN_CRLF;
6798       else if (eol_seen != EOL_SEEN_NONE
6799           && eol_seen != EOL_SEEN_LF
6800           && eol_seen != EOL_SEEN_CRLF
6801           && eol_seen != EOL_SEEN_CR)
6802         eol_seen = EOL_SEEN_LF;
6803       if (eol_seen != EOL_SEEN_NONE)
6804         eol_type = adjust_coding_eol_type (coding, eol_seen);
6805     }
6806
6807   if (EQ (eol_type, Qmac))
6808     {
6809       for (p = pbeg; p < pend; p++)
6810         if (*p == '\r')
6811           *p = '\n';
6812     }
6813   else if (EQ (eol_type, Qdos))
6814     {
6815       ptrdiff_t n = 0;
6816
6817       if (NILP (coding->dst_object))
6818         {
6819           /* Start deleting '\r' from the tail to minimize the memory
6820              movement.  */
6821           for (p = pend - 2; p >= pbeg; p--)
6822             if (*p == '\r')
6823               {
6824                 memmove (p, p + 1, pend-- - p - 1);
6825                 n++;
6826               }
6827         }
6828       else
6829         {
6830           ptrdiff_t pos_byte = coding->dst_pos_byte;
6831           ptrdiff_t pos = coding->dst_pos;
6832           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6833
6834           while (pos < pos_end)
6835             {
6836               p = BYTE_POS_ADDR (pos_byte);
6837               if (*p == '\r' && p[1] == '\n')
6838                 {
6839                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6840                   n++;
6841                   pos_end--;
6842                 }
6843               pos++;
6844               if (coding->dst_multibyte)
6845                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6846               else
6847                 pos_byte++;
6848             }
6849         }
6850       coding->produced -= n;
6851       coding->produced_char -= n;
6852     }
6853 }
6854
6855
/* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
   exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
   alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).
   get_translation_table clamps the value it stores through its
   MAX_LOOKUP argument to this bound.  */
enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6860
6861 /* Return a translation table (or list of them) from coding system
6862    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6863    not ENCODEP). */
6864
6865 static Lisp_Object
6866 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6867 {
6868   Lisp_Object standard, translation_table;
6869   Lisp_Object val;
6870
6871   if (NILP (Venable_character_translation))
6872     {
6873       if (max_lookup)
6874         *max_lookup = 0;
6875       return Qnil;
6876     }
6877   if (encodep)
6878     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6879       standard = Vstandard_translation_table_for_encode;
6880   else
6881     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6882       standard = Vstandard_translation_table_for_decode;
6883   if (NILP (translation_table))
6884     translation_table = standard;
6885   else
6886     {
6887       if (SYMBOLP (translation_table))
6888         translation_table = Fget (translation_table, Qtranslation_table);
6889       else if (CONSP (translation_table))
6890         {
6891           translation_table = Fcopy_sequence (translation_table);
6892           for (val = translation_table; CONSP (val); val = XCDR (val))
6893             if (SYMBOLP (XCAR (val)))
6894               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6895         }
6896       if (CHAR_TABLE_P (standard))
6897         {
6898           if (CONSP (translation_table))
6899             translation_table = nconc2 (translation_table, list1 (standard));
6900           else
6901             translation_table = list2 (translation_table, standard);
6902         }
6903     }
6904
6905   if (max_lookup)
6906     {
6907       *max_lookup = 1;
6908       if (CHAR_TABLE_P (translation_table)
6909           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6910         {
6911           val = XCHAR_TABLE (translation_table)->extras[1];
6912           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6913             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6914         }
6915       else if (CONSP (translation_table))
6916         {
6917           Lisp_Object tail;
6918
6919           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6920             if (CHAR_TABLE_P (XCAR (tail))
6921                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6922               {
6923                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6924                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6925                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6926               }
6927         }
6928     }
6929   return translation_table;
6930 }
6931
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order (other elements
   are skipped).

   When an entry is a character, C is replaced by it and TRANS is
   set to Qnil; in a list, the lookup then continues in the next
   table with the new C.  When an entry is non-nil but not a
   character, TRANS is set to it and, in a list, the search stops.
   If nothing is found, or TABLE is neither kind of object, TRANS is
   Qnil and C is unchanged.

   C and TRANS are assigned to, so both must be lvalues; TABLE and C
   may be evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6956
6957
6958 /* Return a translation of character(s) at BUF according to TRANS.
6959    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6960    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6961    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6962    translation is found, and Qnil if not found..
6963    If BUF is too short to lookup characters in FROM, return Qt.  */
6964
6965 static Lisp_Object
6966 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6967 {
6968
6969   if (INTEGERP (trans))
6970     return trans;
6971   for (; CONSP (trans); trans = XCDR (trans))
6972     {
6973       Lisp_Object val = XCAR (trans);
6974       Lisp_Object from = XCAR (val);
6975       ptrdiff_t len = ASIZE (from);
6976       ptrdiff_t i;
6977
6978       for (i = 0; i < len; i++)
6979         {
6980           if (buf + i == buf_end)
6981             return Qt;
6982           if (XINT (AREF (from, i)) != buf[i])
6983             break;
6984         }
6985       if (i == len)
6986         return val;
6987     }
6988   return Qnil;
6989 }
6990
6991
6992 static int
6993 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6994                bool last_block)
6995 {
6996   unsigned char *dst = coding->destination + coding->produced;
6997   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6998   ptrdiff_t produced;
6999   ptrdiff_t produced_chars = 0;
7000   int carryover = 0;
7001
7002   if (! coding->chars_at_source)
7003     {
7004       /* Source characters are in coding->charbuf.  */
7005       int *buf = coding->charbuf;
7006       int *buf_end = buf + coding->charbuf_used;
7007
7008       if (EQ (coding->src_object, coding->dst_object)
7009           && ! NILP (coding->dst_object))
7010         {
7011           eassert (growable_destination (coding));
7012           coding_set_source (coding);
7013           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7014         }
7015
7016       while (buf < buf_end)
7017         {
7018           int c = *buf;
7019           ptrdiff_t i;
7020
7021           if (c >= 0)
7022             {
7023               ptrdiff_t from_nchars = 1, to_nchars = 1;
7024               Lisp_Object trans = Qnil;
7025
7026               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7027               if (! NILP (trans))
7028                 {
7029                   trans = get_translation (trans, buf, buf_end);
7030                   if (INTEGERP (trans))
7031                     c = XINT (trans);
7032                   else if (CONSP (trans))
7033                     {
7034                       from_nchars = ASIZE (XCAR (trans));
7035                       trans = XCDR (trans);
7036                       if (INTEGERP (trans))
7037                         c = XINT (trans);
7038                       else
7039                         {
7040                           to_nchars = ASIZE (trans);
7041                           c = XINT (AREF (trans, 0));
7042                         }
7043                     }
7044                   else if (EQ (trans, Qt) && ! last_block)
7045                     break;
7046                 }
7047
7048               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7049                 {
7050                   eassert (growable_destination (coding));
7051                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7052                        / MAX_MULTIBYTE_LENGTH)
7053                       < to_nchars)
7054                     memory_full (SIZE_MAX);
7055                   dst = alloc_destination (coding,
7056                                            buf_end - buf
7057                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7058                                            dst);
7059                   if (EQ (coding->src_object, coding->dst_object))
7060                     {
7061                       coding_set_source (coding);
7062                       dst_end = (((unsigned char *) coding->source)
7063                                  + coding->consumed);
7064                     }
7065                   else
7066                     dst_end = coding->destination + coding->dst_bytes;
7067                 }
7068
7069               for (i = 0; i < to_nchars; i++)
7070                 {
7071                   if (i > 0)
7072                     c = XINT (AREF (trans, i));
7073                   if (coding->dst_multibyte
7074                       || ! CHAR_BYTE8_P (c))
7075                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7076                   else
7077                     *dst++ = CHAR_TO_BYTE8 (c);
7078                 }
7079               produced_chars += to_nchars;
7080               buf += from_nchars;
7081             }
7082           else
7083             /* This is an annotation datum.  (-C) is the length.  */
7084             buf += -c;
7085         }
7086       carryover = buf_end - buf;
7087     }
7088   else
7089     {
7090       /* Source characters are at coding->source.  */
7091       const unsigned char *src = coding->source;
7092       const unsigned char *src_end = src + coding->consumed;
7093
7094       if (EQ (coding->dst_object, coding->src_object))
7095         {
7096           eassert (growable_destination (coding));
7097           dst_end = (unsigned char *) src;
7098         }
7099       if (coding->src_multibyte != coding->dst_multibyte)
7100         {
7101           if (coding->src_multibyte)
7102             {
7103               bool multibytep = 1;
7104               ptrdiff_t consumed_chars = 0;
7105
7106               while (1)
7107                 {
7108                   const unsigned char *src_base = src;
7109                   int c;
7110
7111                   ONE_MORE_BYTE (c);
7112                   if (dst == dst_end)
7113                     {
7114                       eassert (growable_destination (coding));
7115                       if (EQ (coding->src_object, coding->dst_object))
7116                         dst_end = (unsigned char *) src;
7117                       if (dst == dst_end)
7118                         {
7119                           ptrdiff_t offset = src - coding->source;
7120
7121                           dst = alloc_destination (coding, src_end - src + 1,
7122                                                    dst);
7123                           dst_end = coding->destination + coding->dst_bytes;
7124                           coding_set_source (coding);
7125                           src = coding->source + offset;
7126                           src_end = coding->source + coding->consumed;
7127                           if (EQ (coding->src_object, coding->dst_object))
7128                             dst_end = (unsigned char *) src;
7129                         }
7130                     }
7131                   *dst++ = c;
7132                   produced_chars++;
7133                 }
7134             no_more_source:
7135               ;
7136             }
7137           else
7138             while (src < src_end)
7139               {
7140                 bool multibytep = 1;
7141                 int c = *src++;
7142
7143                 if (dst >= dst_end - 1)
7144                   {
7145                     eassert (growable_destination (coding));
7146                     if (EQ (coding->src_object, coding->dst_object))
7147                       dst_end = (unsigned char *) src;
7148                     if (dst >= dst_end - 1)
7149                       {
7150                         ptrdiff_t offset = src - coding->source;
7151                         ptrdiff_t more_bytes;
7152
7153                         if (EQ (coding->src_object, coding->dst_object))
7154                           more_bytes = ((src_end - src) / 2) + 2;
7155                         else
7156                           more_bytes = src_end - src + 2;
7157                         dst = alloc_destination (coding, more_bytes, dst);
7158                         dst_end = coding->destination + coding->dst_bytes;
7159                         coding_set_source (coding);
7160                         src = coding->source + offset;
7161                         src_end = coding->source + coding->consumed;
7162                         if (EQ (coding->src_object, coding->dst_object))
7163                           dst_end = (unsigned char *) src;
7164                       }
7165                   }
7166                 EMIT_ONE_BYTE (c);
7167               }
7168         }
7169       else
7170         {
7171           if (!EQ (coding->src_object, coding->dst_object))
7172             {
7173               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7174
7175               if (require > 0)
7176                 {
7177                   ptrdiff_t offset = src - coding->source;
7178
7179                   dst = alloc_destination (coding, require, dst);
7180                   coding_set_source (coding);
7181                   src = coding->source + offset;
7182                   src_end = coding->source + coding->consumed;
7183                 }
7184             }
7185           produced_chars = coding->consumed_char;
7186           while (src < src_end)
7187             *dst++ = *src++;
7188         }
7189     }
7190
7191   produced = dst - (coding->destination + coding->produced);
7192   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7193     insert_from_gap (produced_chars, produced, 0);
7194   coding->produced += produced;
7195   coding->produced_char += produced_chars;
7196   return carryover;
7197 }
7198
7199 /* Compose text in CODING->object according to the annotation data at
7200    CHARBUF.  CHARBUF is an array:
7201      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7202  */
7203
7204 static void
7205 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7206 {
7207   int len;
7208   ptrdiff_t to;
7209   enum composition_method method;
7210   Lisp_Object components;
7211
7212   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7213   to = pos + charbuf[2];
7214   method = (enum composition_method) (charbuf[4]);
7215
7216   if (method == COMPOSITION_RELATIVE)
7217     components = Qnil;
7218   else
7219     {
7220       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7221       int i, j;
7222
7223       if (method == COMPOSITION_WITH_RULE)
7224         len = charbuf[2] * 3 - 2;
7225       charbuf += MAX_ANNOTATION_LENGTH;
7226       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7227       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7228         {
7229           if (charbuf[i] >= 0)
7230             args[j] = make_number (charbuf[i]);
7231           else
7232             {
7233               i++;
7234               args[j] = make_number (charbuf[i] % 0x100);
7235             }
7236         }
7237       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7238     }
7239   compose_text (pos, to, components, Qnil, coding->dst_object);
7240 }
7241
7242
7243 /* Put `charset' property on text in CODING->object according to
7244    the annotation data at CHARBUF.  CHARBUF is an array:
7245      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7246  */
7247
7248 static void
7249 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7250 {
7251   ptrdiff_t from = pos - charbuf[2];
7252   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7253
7254   Fput_text_property (make_number (from), make_number (pos),
7255                       Qcharset, CHARSET_NAME (charset),
7256                       coding->dst_object);
7257 }
7258
/* Upper bound on the number of ints allocated for coding->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING (a pointer to struct
   coding_system): SIZE + MAX_CHARBUF_EXTRA_SIZE ints, but never more
   than MAX_CHARBUF_SIZE.  The array is stored in CODING->charbuf and
   its length in CODING->charbuf_size.

   The memory comes from SAFE_ALLOCA, so the calling function must
   have executed USE_SAFE_ALLOCA beforehand.

   CODING is parenthesized in the expansion so that any pointer
   expression (e.g. `&cs' or `p + 1') can be passed safely.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7272
7273 static void
7274 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7275 {
7276   int *charbuf = coding->charbuf;
7277   int *charbuf_end = charbuf + coding->charbuf_used;
7278
7279   if (NILP (coding->dst_object))
7280     return;
7281
7282   while (charbuf < charbuf_end)
7283     {
7284       if (*charbuf >= 0)
7285         pos++, charbuf++;
7286       else
7287         {
7288           int len = -*charbuf;
7289
7290           if (len > 2)
7291             switch (charbuf[1])
7292               {
7293               case CODING_ANNOTATE_COMPOSITION_MASK:
7294                 produce_composition (coding, charbuf, pos);
7295                 break;
7296               case CODING_ANNOTATE_CHARSET_MASK:
7297                 produce_charset (coding, charbuf, pos);
7298                 break;
7299               default:
7300                 break;
7301               }
7302           charbuf += len;
7303         }
7304     }
7305 }
7306
7307 /* Decode the data at CODING->src_object into CODING->dst_object.
7308    CODING->src_object is a buffer, a string, or nil.
7309    CODING->dst_object is a buffer.
7310
7311    If CODING->src_object is a buffer, it must be the current buffer.
7312    In this case, if CODING->src_pos is positive, it is a position of
7313    the source text in the buffer, otherwise, the source text is in the
7314    gap area of the buffer, and CODING->src_pos specifies the offset of
7315    the text from GPT (which must be the same as PT).  If this is the
7316    same buffer as CODING->dst_object, CODING->src_pos must be
7317    negative.
7318
7319    If CODING->src_object is a string, CODING->src_pos is an index to
7320    that string.
7321
7322    If CODING->src_object is nil, CODING->source must already point to
7323    the non-relocatable memory area.  In this case, CODING->src_pos is
7324    an offset from CODING->source.
7325
7326    The decoded data is inserted at the current point of the buffer
7327    CODING->dst_object.

        Decoding proceeds in rounds: the decoder fills CODING->charbuf,
        produce_chars writes the result to the destination, and
        produce_annotation applies any annotations found in the work
        area.  Source bytes left unconsumed at the end are either
        emitted as raw bytes (when CODING_MODE_LAST_BLOCK is set) or
        saved in CODING->carryover.
7328 */
7329
7330 static void
7331 decode_coding (struct coding_system *coding)
7332 {
7333   Lisp_Object attrs;
7334   Lisp_Object undo_list;
7335   Lisp_Object translation_table;
7336   struct ccl_spec cclspec;
7337   int carryover;
7338   int i;
7339
7340   USE_SAFE_ALLOCA;
7341
       /* The source region starts before the gap and ends after it:
          move the gap to the start of the source region.  */
7342   if (BUFFERP (coding->src_object)
7343       && coding->src_pos > 0
7344       && coding->src_pos < GPT
7345       && coding->src_pos + coding->src_chars > GPT)
7346     move_gap_both (coding->src_pos, coding->src_pos_byte);
7347
       /* UNDO_LIST holds the destination buffer's undo list while
          recording is disabled below; it stays Qt when the destination
          is not a buffer.  */
7348   undo_list = Qt;
7349   if (BUFFERP (coding->dst_object))
7350     {
7351       set_buffer_internal (XBUFFER (coding->dst_object));
7352       if (GPT != PT)
7353         move_gap_both (PT, PT_BYTE);
7354
7355       /* We must disable undo_list in order to record the whole insert
7356          transaction via record_insert at the end.  But doing so also
7357          disables the recording of the first change to the undo_list.
7358          Therefore we check for first change here and record it via
7359          record_first_change if needed.  */
7360       if (MODIFF <= SAVE_MODIFF)
7361         record_first_change ();
7362
7363       undo_list = BVAR (current_buffer, undo_list);
7364       bset_undo_list (current_buffer, Qt);
7365     }
7366
       /* Reset the progress counters and the result code.  */
7367   coding->consumed = coding->consumed_char = 0;
7368   coding->produced = coding->produced_char = 0;
7369   coding->chars_at_source = 0;
7370   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7371
       /* Allocate coding->charbuf, the int work area shared by the
          decoder and produce_chars (at most MAX_CHARBUF_SIZE
          elements).  */
7372   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7373
7374   attrs = CODING_ID_ATTRS (coding->id);
7375   translation_table = get_translation_table (attrs, 0, NULL);
7376
7377   carryover = 0;
       /* A CCL decoder gets its program set up in CCLSPEC, which lives
          on this stack frame.  */
7378   if (coding->decoder == decode_coding_ccl)
7379     {
7380       coding->spec.ccl = &cclspec;
7381       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7382     }
       /* Decode round by round.  Each round runs the decoder, then
          produce_chars, which returns the number of elements at the end
          of coding->charbuf that it did not process; those are moved to
          the front of the array and counted as already present in the
          next round (charbuf_used = carryover).  */
7383   do
7384     {
           /* Destination position of the first character produced in
              this round; used to place annotations.  */
7385       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7386
7387       coding_set_source (coding);
7388       coding->annotated = 0;
7389       coding->charbuf_used = carryover;
7390       (*(coding->decoder)) (coding);
7391       coding_set_destination (coding);
7392       carryover = produce_chars (coding, translation_table, 0);
7393       if (coding->annotated)
7394         produce_annotation (coding, pos);
7395       for (i = 0; i < carryover; i++)
7396         coding->charbuf[i]
7397           = coding->charbuf[coding->charbuf_used - carryover + i];
7398     }
       /* Go on while the result is CODING_RESULT_INSUFFICIENT_DST, or
          while source bytes remain and the result is either success or
          CODING_RESULT_INVALID_SRC.  */
7399   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7400          || (coding->consumed < coding->src_bytes
7401              && (coding->result == CODING_RESULT_SUCCESS
7402                  || coding->result == CODING_RESULT_INVALID_SRC)));
7403
       /* Flush elements still carried over, this time passing 1 as the
          last argument of produce_chars.  */
7404   if (carryover > 0)
7405     {
7406       coding_set_destination (coding);
7407       coding->charbuf_used = carryover;
7408       produce_chars (coding, translation_table, 1);
7409     }
7410
       /* Deal with source bytes the decoder did not consume.  */
7411   coding->carryover_bytes = 0;
7412   if (coding->consumed < coding->src_bytes)
7413     {
7414       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7415       const unsigned char *src;
7416
7417       coding_set_source (coding);
7418       coding_set_destination (coding);
7419       src = coding->source + coding->consumed;
7420
7421       if (coding->mode & CODING_MODE_LAST_BLOCK)
7422         {
7423           /* Flush out unprocessed data as binary chars.  We are sure
7424              that the number of data is less than the size of
7425              coding->charbuf.  */
7426           coding->charbuf_used = 0;
7427           coding->chars_at_source = 0;
7428
7429           while (nbytes-- > 0)
7430             {
7431               int c = *src++;
7432
                   /* Bytes with the high bit set become eight-bit
                      characters; ASCII bytes are kept as they are.  */
7433               if (c & 0x80)
7434                 c = BYTE8_TO_CHAR (c);
7435               coding->charbuf[coding->charbuf_used++] = c;
7436             }
               /* No translation table is applied to these bytes.  */
7437           produce_chars (coding, Qnil, 1);
7438         }
7439       else
7440         {
7441           /* Record unprocessed bytes in coding->carryover.  We are
7442              sure that the number of data is less than the size of
7443              coding->carryover.  */
7444           unsigned char *p = coding->carryover;
7445
               /* NBYTES is nevertheless clamped to the size of
                  coding->carryover; bytes beyond that are not saved.  */
7446           if (nbytes > sizeof coding->carryover)
7447             nbytes = sizeof coding->carryover;
7448           coding->carryover_bytes = nbytes;
7449           while (nbytes-- > 0)
7450             *p++ = *src++;
7451         }
           /* Either way, all source bytes now count as consumed.  */
7452       coding->consumed = coding->src_bytes;
7453     }
7454
       /* Convert end-of-line format unless the coding system's eol-type
          is `unix' or EOL conversion is inhibited.  */
7455   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7456       && !inhibit_eol_conversion)
7457     decode_eol (coding);
       /* Restore the undo list saved above and record the whole
          insertion as one change.  */
7458   if (BUFFERP (coding->dst_object))
7459     {
7460       bset_undo_list (current_buffer, undo_list);
7461       record_insert (coding->dst_pos, coding->produced_char);
7462     }
7463
7464   SAFE_FREE ();
7465 }
7466
7467
7468 /* Extract an annotation datum from a composition starting at POS and
7469    ending before LIMIT of CODING->src_object (buffer or string), store
7470    the data in BUF, set *STOP to a starting position of the next
7471    composition (if any) or to LIMIT, and return the address of the
7472    next element of BUF.
7473
7474    If such an annotation is not found, set *STOP to a starting
7475    position of a composition after POS (if any) or to LIMIT, and
7476    return BUF.  */
7477
7478 static int *
7479 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7480                                struct coding_system *coding, int *buf,
7481                                ptrdiff_t *stop)
7482 {
7483   ptrdiff_t start, end;
7484   Lisp_Object prop;
7485
7486   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7487       || end > limit)
7488     *stop = limit;
7489   else if (start > pos)
7490     *stop = start;
7491   else
7492     {
7493       if (start == pos)
7494         {
7495           /* We found a composition.  Store the corresponding
7496              annotation data in BUF.  */
7497           int *head = buf;
7498           enum composition_method method = composition_method (prop);
7499           int nchars = COMPOSITION_LENGTH (prop);
7500
7501           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7502           if (method != COMPOSITION_RELATIVE)
7503             {
7504               Lisp_Object components;
7505               ptrdiff_t i, len, i_byte;
7506
7507               components = COMPOSITION_COMPONENTS (prop);
7508               if (VECTORP (components))
7509                 {
7510                   len = ASIZE (components);
7511                   for (i = 0; i < len; i++)
7512                     *buf++ = XINT (AREF (components, i));
7513                 }
7514               else if (STRINGP (components))
7515                 {
7516                   len = SCHARS (components);
7517                   i = i_byte = 0;
7518                   while (i < len)
7519                     {
7520                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7521                       buf++;
7522                     }
7523                 }
7524               else if (INTEGERP (components))
7525                 {
7526                   len = 1;
7527                   *buf++ = XINT (components);
7528                 }
7529               else if (CONSP (components))
7530                 {
7531                   for (len = 0; CONSP (components);
7532                        len++, components = XCDR (components))
7533                     *buf++ = XINT (XCAR (components));
7534                 }
7535               else
7536                 emacs_abort ();
7537               *head -= len;
7538             }
7539         }
7540
7541       if (find_composition (end, limit, &start, &end, &prop,
7542                             coding->src_object)
7543           && end <= limit)
7544         *stop = start;
7545       else
7546         *stop = limit;
7547     }
7548   return buf;
7549 }
7550
7551
7552 /* Extract an annotation datum from a text property `charset' at POS of
7553    CODING->src_object (buffer of string), store the data in BUF, set
7554    *STOP to the position where the value of `charset' property changes
7555    (limiting by LIMIT), and return the address of the next element of
7556    BUF.
7557
7558    If the property value is nil, set *STOP to the position where the
7559    property value is non-nil (limiting by LIMIT), and return BUF.  */
7560
7561 static int *
7562 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7563                            struct coding_system *coding, int *buf,
7564                            ptrdiff_t *stop)
7565 {
7566   Lisp_Object val, next;
7567   int id;
7568
7569   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7570   if (! NILP (val) && CHARSETP (val))
7571     id = XINT (CHARSET_SYMBOL_ID (val));
7572   else
7573     id = -1;
7574   ADD_CHARSET_DATA (buf, 0, id);
7575   next = Fnext_single_property_change (make_number (pos), Qcharset,
7576                                        coding->src_object,
7577                                        make_number (limit));
7578   *stop = XINT (next);
7579   return buf;
7580 }
7581
7582
     /* Read characters from CODING->source, starting at byte offset
        CODING->consumed, into CODING->charbuf until the work area is
        nearly full or the end of the source text is reached.  Newlines
        are converted according to the coding system's eol-type, and
        each character is mapped through TRANSLATION_TABLE unless that
        is nil.  MAX_LOOKUP is the number of ints allocated for the
        buffer of source characters handed to get_translation.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used reflect what was read and stored, and
        CODING->chars_at_source is 0.  */
7583 static void
7584 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7585                int max_lookup)
7586 {
7587   int *buf = coding->charbuf;
7588   int *buf_end = coding->charbuf + coding->charbuf_size;
7589   const unsigned char *src = coding->source + coding->consumed;
7590   const unsigned char *src_end = coding->source + coding->src_bytes;
7591   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7592   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7593   bool multibytep = coding->src_multibyte;
7594   Lisp_Object eol_type;
7595   int c;
7596   ptrdiff_t stop, stop_composition, stop_charset;
7597   int *lookup_buf = NULL;
7598
       /* LOOKUP_BUF is needed only when a translation table is used.  */
7599   if (! NILP (translation_table))
7600     lookup_buf = alloca (sizeof (int) * max_lookup);
7601
       /* A vector eol-type is treated like `unix', i.e. no EOL
          conversion is done.  */
7602   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7603   if (VECTORP (eol_type))
7604     eol_type = Qunix;
7605
7606   /* Note: composition handling is not yet implemented.  */
7607   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7608
       /* STOP is the next position at which annotations must be
          examined.  Because the composition flag was cleared just
          above, the composition test below is always false here.  */
7609   if (NILP (coding->src_object))
7610     stop = stop_composition = stop_charset = end_pos;
7611   else
7612     {
7613       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7614         stop = stop_composition = pos;
7615       else
7616         stop = stop_composition = end_pos;
7617       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7618         stop = stop_charset = pos;
7619       else
7620         stop_charset = end_pos;
7621     }
7622
7623   /* Compensate for CRLF and conversion.  */
7624   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7625   while (buf < buf_end)
7626     {
7627       Lisp_Object trans;
7628
7629       if (pos == stop)
7630         {
               /* End of the source text.  */
7631           if (pos == end_pos)
7632             break;
               /* Collect the annotations due at this position and
                  compute the next stop as the nearer of the two.  */
7633           if (pos == stop_composition)
7634             buf = handle_composition_annotation (pos, end_pos, coding,
7635                                                  buf, &stop_composition);
7636           if (pos == stop_charset)
7637             buf = handle_charset_annotation (pos, end_pos, coding,
7638                                              buf, &stop_charset);
7639           stop = (stop_composition < stop_charset
7640                   ? stop_composition : stop_charset);
7641         }
7642
           /* Fetch the next source character into C.  For a unibyte
              source, the raw-text and CCL encoders take each byte as it
              is; otherwise, if MULTIBYTE_LENGTH reports a positive
              length the sequence is read as one character (POS advances
              by that many bytes), and any other byte goes through
              BYTE8_TO_CHAR.  */
7643       if (! multibytep)
7644         {
7645           int bytes;
7646
7647           if (coding->encoder == encode_coding_raw_text
7648               || coding->encoder == encode_coding_ccl)
7649             c = *src++, pos++;
7650           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7651             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7652           else
7653             c = BYTE8_TO_CHAR (*src), src++, pos++;
7654         }
7655       else
7656         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
           /* Under selective display a CR in the source is handled as a
              newline.  */
7657       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7658         c = '\n';
           /* EOL conversion: for `dos' a CR is stored before the
              newline; for any other non-unix type the newline itself
              becomes CR.  */
7659       if (! EQ (eol_type, Qunix))
7660         {
7661           if (c == '\n')
7662             {
7663               if (EQ (eol_type, Qdos))
7664                 *buf++ = '\r';
7665               else
7666                 c = '\r';
7667             }
7668         }
7669
7670       trans = Qnil;
7671       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7672       if (NILP (trans))
7673         *buf++ = c;
7674       else
7675         {
               /* C has a translation entry.  Gather up to MAX_LOOKUP
                  characters (C plus the ones that follow it) so that
                  get_translation can match a multi-character source
                  sequence.  */
7676           ptrdiff_t from_nchars = 1, to_nchars = 1;
7677           int *lookup_buf_end;
7678           const unsigned char *p = src;
7679           int i;
7680
7681           lookup_buf[0] = c;
7682           for (i = 1; i < max_lookup && p < src_end; i++)
7683             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7684           lookup_buf_end = lookup_buf + i;
7685           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* An integer result replaces C alone.  A cons is
                  (FROM . TO): FROM is a vector whose length is the
                  number of source characters matched, and TO is either
                  one character or a vector of characters.  Any other
                  result ends the loop.  */
7686           if (INTEGERP (trans))
7687             c = XINT (trans);
7688           else if (CONSP (trans))
7689             {
7690               from_nchars = ASIZE (XCAR (trans));
7691               trans = XCDR (trans);
7692               if (INTEGERP (trans))
7693                 c = XINT (trans);
7694               else
7695                 {
7696                   to_nchars = ASIZE (trans);
                       /* Not enough room left for the replacement.  */
7697                   if (buf_end - buf < to_nchars)
7698                     break;
7699                   c = XINT (AREF (trans, 0));
7700                 }
7701             }
7702           else
7703             break;
7704           *buf++ = c;
7705           for (i = 1; i < to_nchars; i++)
7706             *buf++ = XINT (AREF (trans, i));
               /* Skip the remaining source characters of the matched
                  sequence.  */
7707           for (i = 1; i < from_nchars; i++, pos++)
7708             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7709         }
7710     }
7711
7712   coding->consumed = src - coding->source;
7713   coding->consumed_char = pos - coding->src_pos;
7714   coding->charbuf_used = buf - coding->charbuf;
7715   coding->chars_at_source = 0;
7716 }
7717
7718
7719 /* Encode the text at CODING->src_object into CODING->dst_object.
7720    CODING->src_object is a buffer or a string.
7721    CODING->dst_object is a buffer or nil.
7722
7723    If CODING->src_object is a buffer, it must be the current buffer.
7724    In this case, if CODING->src_pos is positive, it is a position of
7725    the source text in the buffer, otherwise. the source text is in the
7726    gap area of the buffer, and coding->src_pos specifies the offset of
7727    the text from GPT (which must be the same as PT).  If this is the
7728    same buffer as CODING->dst_object, CODING->src_pos must be
7729    negative and CODING should not have `pre-write-conversion'.
7730
7731    If CODING->src_object is a string, CODING should not have
7732    `pre-write-conversion'.
7733
7734    If CODING->dst_object is a buffer, the encoded data is inserted at
7735    the current point of that buffer.
7736
7737    If CODING->dst_object is nil, the encoded data is placed at the
7738    memory area specified by CODING->destination.  */
7739
7740 static void
7741 encode_coding (struct coding_system *coding)
7742 {
7743   Lisp_Object attrs;
7744   Lisp_Object translation_table;
7745   int max_lookup;
7746   struct ccl_spec cclspec;
7747
7748   USE_SAFE_ALLOCA;
7749
7750   attrs = CODING_ID_ATTRS (coding->id);
7751   if (coding->encoder == encode_coding_raw_text)
7752     translation_table = Qnil, max_lookup = 0;
7753   else
7754     translation_table = get_translation_table (attrs, 1, &max_lookup);
7755
7756   if (BUFFERP (coding->dst_object))
7757     {
7758       set_buffer_internal (XBUFFER (coding->dst_object));
7759       coding->dst_multibyte
7760         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7761     }
7762
7763   coding->consumed = coding->consumed_char = 0;
7764   coding->produced = coding->produced_char = 0;
7765   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7766
7767   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7768
7769   if (coding->encoder == encode_coding_ccl)
7770     {
7771       coding->spec.ccl = &cclspec;
7772       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7773     }
7774   do {
7775     coding_set_source (coding);
7776     consume_chars (coding, translation_table, max_lookup);
7777     coding_set_destination (coding);
7778     (*(coding->encoder)) (coding);
7779   } while (coding->consumed_char < coding->src_chars);
7780
7781   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7782     insert_from_gap (coding->produced_char, coding->produced, 0);
7783
7784   SAFE_FREE ();
7785 }
7786
7787
7788 /* Name (or base name) of work buffer for code conversion.
        make_conversion_work_buffer uses it unchanged for the reused
        work buffer and as the base for the generated names of the
        others.  */
7789 static Lisp_Object Vcode_conversion_workbuf_name;
7790
7791 /* A working buffer used by the top level conversion.  Once it is
7792    created, it is never destroyed.  It has the name
7793    Vcode_conversion_workbuf_name.  The other working buffers are
7794    destroyed after the use is finished, and their names are modified
7795    versions of Vcode_conversion_workbuf_name.
        (make_conversion_work_buffer creates it again if it finds that
        the buffer is no longer live.)  */
7796 static Lisp_Object Vcode_conversion_reused_workbuf;
7797
7798 /* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
        make_conversion_work_buffer when it hands out the reused buffer,
        and cleared by code_conversion_restore.  */
7799 static bool reused_workbuf_in_use;
7800
7801
7802 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7803    multibyteness of returning buffer.  */
7804
7805 static Lisp_Object
7806 make_conversion_work_buffer (bool multibyte)
7807 {
7808   Lisp_Object name, workbuf;
7809   struct buffer *current;
7810
7811   if (reused_workbuf_in_use)
7812     {
7813       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7814       workbuf = Fget_buffer_create (name);
7815     }
7816   else
7817     {
7818       reused_workbuf_in_use = 1;
7819       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7820         Vcode_conversion_reused_workbuf
7821           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7822       workbuf = Vcode_conversion_reused_workbuf;
7823     }
7824   current = current_buffer;
7825   set_buffer_internal (XBUFFER (workbuf));
7826   /* We can't allow modification hooks to run in the work buffer.  For
7827      instance, directory_files_internal assumes that file decoding
7828      doesn't compile new regexps.  */
7829   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7830   Ferase_buffer ();
7831   bset_undo_list (current_buffer, Qt);
7832   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7833   set_buffer_internal (current);
7834   return workbuf;
7835 }
7836
7837
7838 static void
7839 code_conversion_restore (Lisp_Object arg)
7840 {
7841   Lisp_Object current, workbuf;
7842
7843   current = XCAR (arg);
7844   workbuf = XCDR (arg);
7845   if (! NILP (workbuf))
7846     {
7847       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7848         reused_workbuf_in_use = 0;
7849       else
7850         Fkill_buffer (workbuf);
7851     }
7852   set_buffer_internal (XBUFFER (current));
7853 }
7854
7855 Lisp_Object
7856 code_conversion_save (bool with_work_buf, bool multibyte)
7857 {
7858   Lisp_Object workbuf = Qnil;
7859
7860   if (with_work_buf)
7861     workbuf = make_conversion_work_buffer (multibyte);
7862   record_unwind_protect (code_conversion_restore,
7863                          Fcons (Fcurrent_buffer (), workbuf));
7864   return workbuf;
7865 }
7866
     /* Decode CHARS characters (BYTES bytes) of text with CODING, using
        the current buffer as both source and destination.  The source
        position is set to -CHARS / -BYTES and the result is inserted at
        point.

        When the coding system allows it (see the conditions below), the
        text is not run through the decoder at all: the bytes ending at
        GAP_END_ADDR are EOL-converted in place if needed and inserted
        directly with insert_from_gap.  Otherwise decode_coding is
        called, followed by the coding system's post-read function if
        it has one.  */
7867 void
7868 decode_coding_gap (struct coding_system *coding,
7869                    ptrdiff_t chars, ptrdiff_t bytes)
7870 {
7871   ptrdiff_t count = SPECPDL_INDEX ();
7872   Lisp_Object attrs;
7873
7874   coding->src_object = Fcurrent_buffer ();
7875   coding->src_chars = chars;
7876   coding->src_bytes = bytes;
7877   coding->src_pos = -chars;
7878   coding->src_pos_byte = -bytes;
       /* More bytes than characters means the source is multibyte.  */
7879   coding->src_multibyte = chars < bytes;
7880   coding->dst_object = coding->src_object;
7881   coding->dst_pos = PT;
7882   coding->dst_pos_byte = PT_BYTE;
7883   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7884
       /* -1 / EOL_SEEN_NONE mean "not determined yet"; detect_coding
          may fill these in.  */
7885   coding->head_ascii = -1;
7886   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7887   coding->eol_seen = EOL_SEEN_NONE;
7888   if (CODING_REQUIRE_DETECTION (coding))
7889     detect_coding (coding);
7890   attrs = CODING_ID_ATTRS (coding->id);
       /* Fast path: unibyte source, ASCII-compatible coding system, no
          post-read function and no decoding translation table.  */
7891   if (! disable_ascii_optimization
7892       && ! coding->src_multibyte
7893       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7894       && NILP (CODING_ATTR_POST_READ (attrs))
7895       && NILP (get_translation_table (attrs, 0, NULL)))
7896     {
           /* From here on CHARS is the number of characters the text
              will have once inserted, or -1 if the fast path cannot be
              used.  */
7897       chars = coding->head_ascii;
7898       if (chars < 0)
7899         chars = check_ascii (coding);
7900       if (chars != bytes)
7901         {
7902           /* There exists a non-ASCII byte.  */
7903           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7904               && coding->detected_utf8_bytes == coding->src_bytes)
7905             {
7906               if (coding->detected_utf8_chars >= 0)
7907                 chars = coding->detected_utf8_chars;
7908               else
7909                 chars = check_utf_8 (coding);
                   /* Unless the coding system is "without BOM", a
                      leading UTF-8 BOM is left out of the counts: one
                      character, three bytes.  */
7910               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7911                   && coding->head_ascii == 0
7912                   && coding->source[0] == UTF_8_BOM_1
7913                   && coding->source[1] == UTF_8_BOM_2
7914                   && coding->source[2] == UTF_8_BOM_3)
7915                 {
7916                   chars--;
7917                   bytes -= 3;
7918                   coding->src_bytes -= 3;
7919                 }
7920             }
7921           else
7922             chars = -1;
7923         }
7924       if (chars >= 0)
7925         {
7926           Lisp_Object eol_type;
7927
7928           eol_type = CODING_ID_EOL_TYPE (coding->id);
               /* A vector eol-type is resolved from the EOL format seen
                  in the text, if any was seen.  */
7929           if (VECTORP (eol_type))
7930             {
7931               if (coding->eol_seen != EOL_SEEN_NONE)
7932                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7933             }
7934           if (EQ (eol_type, Qmac))
7935             {
                   /* Replace every CR with LF in place.  */
7936               unsigned char *src_end = GAP_END_ADDR;
7937               unsigned char *src = src_end - coding->src_bytes;
7938
7939               while (src < src_end)
7940                 {
7941                   if (*src++ == '\r')
7942                     src[-1] = '\n';
7943                 }
7944             }
7945           else if (EQ (eol_type, Qdos))
7946             {
                   /* Copy the text backward onto itself, starting at the
                      end, dropping the CR of each CR LF pair.  DIFF ends
                      up as the number of bytes dropped.  */
7947               unsigned char *src = GAP_END_ADDR;
7948               unsigned char *src_beg = src - coding->src_bytes;
7949               unsigned char *dst = src;
7950               ptrdiff_t diff;
7951
7952               while (src_beg < src)
7953                 {
7954                   *--dst = *--src;
7955                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7956                     src--;
7957                 }
7958               diff = dst - src;
7959               bytes -= diff;
7960               chars -= diff;
7961             }
7962           coding->produced = bytes;
7963           coding->produced_char = chars;
7964           insert_from_gap (chars, bytes, 1);
7965           return;
7966         }
7967     }
       /* General path.  Save the current buffer for restoration by
          unbind_to below; no work buffer is requested.  */
7968   code_conversion_save (0, 0);
7969
       /* All of the text is given at once, so this is the last block.
          Buffer shrinking is inhibited while decode_coding runs.  */
7970   coding->mode |= CODING_MODE_LAST_BLOCK;
7971   current_buffer->text->inhibit_shrinking = 1;
7972   decode_coding (coding);
7973   current_buffer->text->inhibit_shrinking = 0;
7974
7975   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7976     {
           /* Call the post-read function with point at the start of
              the decoded text and the number of decoded characters as
              argument, then adjust the produced counts by however much
              the buffer size changed.  */
7977       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7978       Lisp_Object val;
7979
7980       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7981       val = call1 (CODING_ATTR_POST_READ (attrs),
7982                    make_number (coding->produced_char));
7983       CHECK_NATNUM (val);
7984       coding->produced_char += Z - prev_Z;
7985       coding->produced += Z_BYTE - prev_Z_BYTE;
7986     }
7987
7988   unbind_to (count, Qnil);
7989 }
7990
7991
7992 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7993    SRC_OBJECT into DST_OBJECT by coding context CODING.
7994
7995    SRC_OBJECT is a buffer, a string, or Qnil.
7996
7997    If it is a buffer, the text is at point of the buffer.  FROM and TO
7998    are positions in the buffer.
7999
8000    If it is a string, the text is at the beginning of the string.
8001    FROM and TO are indices to the string.
8002
8003    If it is nil, the text is at coding->source.  FROM and TO are
8004    indices to coding->source.
8005
8006    DST_OBJECT is a buffer, Qt, or Qnil.
8007
8008    If it is a buffer, the decoded text is inserted at point of the
8009    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8010    is deleted.
8011
8012    If it is Qt, a string is made from the decoded text, and
8013    set in CODING->dst_object.
8014
8015    If it is Qnil, the decoded text is stored at CODING->destination.
8016    The caller must allocate CODING->dst_bytes bytes at
8017    CODING->destination by xmalloc.  If the decoded text is longer than
8018    CODING->dst_bytes, CODING->destination is relocated by xrealloc.

        After decoding, if the coding system has a post-read function
        (CODING_ATTR_POST_READ) it is called on the decoded text, and
        CODING->produced and CODING->produced_char are corrected by the
        resulting change in buffer size.

        When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
        markers that were sitting on the edges of the region are
        repositioned around the decoded text.  The value Vdeactivate_mark
        had on entry is restored before returning.
8019  */
8020
8021 void
8022 decode_coding_object (struct coding_system *coding,
8023                       Lisp_Object src_object,
8024                       ptrdiff_t from, ptrdiff_t from_byte,
8025                       ptrdiff_t to, ptrdiff_t to_byte,
8026                       Lisp_Object dst_object)
8027 {
8028   ptrdiff_t count = SPECPDL_INDEX ();
8029   unsigned char *destination IF_LINT (= NULL);
8030   ptrdiff_t dst_bytes IF_LINT (= 0);
8031   ptrdiff_t chars = to - from;
8032   ptrdiff_t bytes = to_byte - from_byte;
8033   Lisp_Object attrs;
       /* SAVED_PT stays negative unless we are converting a buffer in
          place; it doubles as the "must restore point" flag below.  */
8034   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8035   bool need_marker_adjustment = 0;
8036   Lisp_Object old_deactivate_mark;
8037
       /* Remembered so that it can be put back just before returning.  */
8038   old_deactivate_mark = Vdeactivate_mark;
8039
       /* With a nil DST_OBJECT the caller supplies the output area; keep
          its address and size for the copy-out step after decoding.  */
8040   if (NILP (dst_object))
8041     {
8042       destination = coding->destination;
8043       dst_bytes = coding->dst_bytes;
8044     }
8045
8046   coding->src_object = src_object;
8047   coding->src_chars = chars;
8048   coding->src_bytes = bytes;
       /* The source is treated as multibyte iff it has fewer characters
          than bytes.  */
8049   coding->src_multibyte = chars < bytes;
8050
8051   if (STRINGP (src_object))
8052     {
8053       coding->src_pos = from;
8054       coding->src_pos_byte = from_byte;
8055     }
8056   else if (BUFFERP (src_object))
8057     {
           /* Make the source buffer current and put the gap at FROM.  */
8058       set_buffer_internal (XBUFFER (src_object));
8059       if (from != GPT)
8060         move_gap_both (from, from_byte);
8061       if (EQ (src_object, dst_object))
8062         {
8063           struct Lisp_Marker *tail;
8064
               /* In-place conversion: flag the markers sitting exactly on
                  an edge of the region (FROM for insertion-type markers,
                  TO for the others) so that they can be repositioned
                  once the text has been replaced.  */
8065           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8066             {
8067               tail->need_adjustment
8068                 = tail->charpos == (tail->insertion_type ? from : to);
8069               need_marker_adjustment |= tail->need_adjustment;
8070             }
               /* Remember point, then delete the source text with buffer
                  shrinking inhibited.  The source position is recorded
                  as -CHARS/-BYTES.
                  NOTE(review): presumably a negative position tells the
                  decoder to read the just-deleted text from the gap
                  area; confirm against the source-setup code.  */
8071           saved_pt = PT, saved_pt_byte = PT_BYTE;
8072           TEMP_SET_PT_BOTH (from, from_byte);
8073           current_buffer->text->inhibit_shrinking = 1;
8074           del_range_both (from, from_byte, to, to_byte, 1);
8075           coding->src_pos = -chars;
8076           coding->src_pos_byte = -bytes;
8077         }
8078       else
8079         {
8080           coding->src_pos = from;
8081           coding->src_pos_byte = from_byte;
8082         }
8083     }
8084
       /* ATTRS is fetched only after any required detection has run.  */
8085   if (CODING_REQUIRE_DETECTION (coding))
8086     detect_coding (coding);
8087   attrs = CODING_ID_ATTRS (coding->id);
8088
       /* Decode into the object returned by code_conversion_save (used
          as a buffer below) when a string result is requested, or when a
          post-read function has to run but the caller gave no buffer.  */
8089   if (EQ (dst_object, Qt)
8090       || (! NILP (CODING_ATTR_POST_READ (attrs))
8091           && NILP (dst_object)))
8092     {
8093       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8094       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8095       coding->dst_pos = BEG;
8096       coding->dst_pos_byte = BEG_BYTE;
8097     }
8098   else if (BUFFERP (dst_object))
8099     {
           /* Decode at point of the destination buffer, following that
              buffer's own multibyte setting.  */
8100       code_conversion_save (0, 0);
8101       coding->dst_object = dst_object;
8102       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8103       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8104       coding->dst_multibyte
8105         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8106     }
8107   else
8108     {
8109       code_conversion_save (0, 0);
8110       coding->dst_object = Qnil;
8111       /* Most callers presume this will return a multibyte result, and they
8112          won't use `binary' or `raw-text' anyway, so let's not worry about
8113          CODING_FOR_UNIBYTE.  */
8114       coding->dst_multibyte = 1;
8115     }
8116
8117   decode_coding (coding);
8118
       /* From here on work in the buffer that received the decoded
          text, if there is one.  */
8119   if (BUFFERP (coding->dst_object))
8120     set_buffer_internal (XBUFFER (coding->dst_object));
8121
8122   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8123     {
8124       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8125       Lisp_Object val;
8126
           /* Call the post-read function with point at the start of the
              decoded text and the number of decoded characters as its
              argument.  Its value must be a non-negative integer.  The
              produced counts are then corrected by however much the
              buffer grew or shrank during the call.  */
8127       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8128       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8129                         make_number (coding->produced_char));
8130       CHECK_NATNUM (val);
8131       coding->produced_char += Z - prev_Z;
8132       coding->produced += Z_BYTE - prev_Z_BYTE;
8133     }
8134
8135   if (EQ (dst_object, Qt))
8136     {
           /* The result is the text of the current buffer as a string.  */
8137       coding->dst_object = Fbuffer_string ();
8138     }
8139   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8140     {
           /* The caller wanted the result at CODING->destination but the
              text was decoded into a buffer: grow the caller's area and
              copy the text out, closing the gap first if it lies inside
              the text to be copied.
              NOTE(review): nothing is copied in this branch when
              DST_BYTES >= CODING->produced; confirm that no caller
              relies on that case.  */
8141       set_buffer_internal (XBUFFER (coding->dst_object));
8142       if (dst_bytes < coding->produced)
8143         {
8144           eassert (coding->produced > 0);
8145           destination = xrealloc (destination, coding->produced);
8146           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8147             move_gap_both (BEGV, BEGV_BYTE);
8148           memcpy (destination, BEGV_ADDR, coding->produced);
8149           coding->destination = destination;
8150         }
8151     }
8152
8153   if (saved_pt >= 0)
8154     {
8155       /* This is the case of:
8156          (BUFFERP (src_object) && EQ (src_object, dst_object))
8157          As we have moved PT while replacing the original buffer
8158          contents, we must recover it now.  */
8159       set_buffer_internal (XBUFFER (src_object));
8160       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is unchanged; point inside the
              region goes to FROM; point after the region is shifted by
              the change in size.  In a unibyte buffer the byte delta is
              used for both the character and the byte position.  */
8161       if (saved_pt < from)
8162         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8163       else if (saved_pt < from + chars)
8164         TEMP_SET_PT_BOTH (from, from_byte);
8165       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8166         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8167                           saved_pt_byte + (coding->produced - bytes));
8168       else
8169         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8170                           saved_pt_byte + (coding->produced - bytes));
8171
8172       if (need_marker_adjustment)
8173         {
8174           struct Lisp_Marker *tail;
8175
               /* Flagged insertion-type markers are put at FROM; the
                  other flagged markers are put at the end of the decoded
                  text.  */
8176           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8177             if (tail->need_adjustment)
8178               {
8179                 tail->need_adjustment = 0;
8180                 if (tail->insertion_type)
8181                   {
8182                     tail->bytepos = from_byte;
8183                     tail->charpos = from;
8184                   }
8185                 else
8186                   {
8187                     tail->bytepos = from_byte + coding->produced;
8188                     tail->charpos
8189                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8190                          ? tail->bytepos : from + coding->produced_char);
8191                   }
8192               }
8193         }
8194     }
8195
8196   Vdeactivate_mark = old_deactivate_mark;
       /* Unwind back to the specpdl depth recorded on entry.  */
8197   unbind_to (count, coding->dst_object);
8198 }
8199
8200
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is handled as a string, as a buffer, or otherwise as raw
   text at CODING->source.  If the coding system has a pre-write
   function (CODING_ATTR_PRE_WRITE), the source text is first copied
   into the buffer returned by code_conversion_save and the function is
   called on it; the buffer that is current afterwards becomes the
   actual source.

   If DST_OBJECT is a buffer, the output position is FROM when it is
   the same object as SRC_OBJECT (the source text is then deleted
   before encoding), otherwise point of that buffer.  If DST_OBJECT is
   Qt, the result is left in CODING->dst_object, normally as a unibyte
   string.  Otherwise CODING->dst_object is set to Qnil before
   encoding.

   The value Vdeactivate_mark had on entry is restored before
   returning.  */
8201 void
8202 encode_coding_object (struct coding_system *coding,
8203                       Lisp_Object src_object,
8204                       ptrdiff_t from, ptrdiff_t from_byte,
8205                       ptrdiff_t to, ptrdiff_t to_byte,
8206                       Lisp_Object dst_object)
8207 {
8208   ptrdiff_t count = SPECPDL_INDEX ();
8209   ptrdiff_t chars = to - from;
8210   ptrdiff_t bytes = to_byte - from_byte;
8211   Lisp_Object attrs;
       /* SAVED_PT stays negative unless the source text is deleted from
          its own buffer; it doubles as the "must restore point" flag.  */
8212   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8213   bool need_marker_adjustment = 0;
       /* Set when the pre-write function left a different buffer
          current; that buffer is killed at the end.  */
8214   bool kill_src_buffer = 0;
8215   Lisp_Object old_deactivate_mark;
8216
8217   old_deactivate_mark = Vdeactivate_mark;
8218
8219   coding->src_object = src_object;
8220   coding->src_chars = chars;
8221   coding->src_bytes = bytes;
       /* The source is treated as multibyte iff it has fewer characters
          than bytes.  */
8222   coding->src_multibyte = chars < bytes;
8223
8224   attrs = CODING_ID_ATTRS (coding->id);
8225
       /* In-place conversion: flag the markers sitting exactly on an
          edge of the region (FROM for insertion-type markers, TO for the
          others) so that they can be repositioned afterwards.
          NOTE(review): this test does not check BUFFERP, and the loop
          walks the markers of the current buffer, which is presumably
          expected to be SRC_OBJECT here; confirm with the callers.  */
8226   if (EQ (src_object, dst_object))
8227     {
8228       struct Lisp_Marker *tail;
8229
8230       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8231         {
8232           tail->need_adjustment
8233             = tail->charpos == (tail->insertion_type ? from : to);
8234           need_marker_adjustment |= tail->need_adjustment;
8235         }
8236     }
8237
8238   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8239     {
           /* Copy the source text into the buffer returned by
              code_conversion_save so the pre-write function can edit
              it.  */
8240       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8241       set_buffer_internal (XBUFFER (coding->src_object));
8242       if (STRINGP (src_object))
8243         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8244       else if (BUFFERP (src_object))
8245         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8246       else
8247         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8248
8249       if (EQ (src_object, dst_object))
8250         {
               /* The copy is taken, so the original text can be removed
                  from the source buffer now; then return to the copy.  */
8251           set_buffer_internal (XBUFFER (src_object));
8252           saved_pt = PT, saved_pt_byte = PT_BYTE;
8253           del_range_both (from, from_byte, to, to_byte, 1);
8254           set_buffer_internal (XBUFFER (coding->src_object));
8255         }
8256
           /* Run the pre-write function over the whole copy.  Whatever
              buffer is current afterwards is encoded from its beginning
              to its end, with the gap moved to the beginning.  */
8257       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8258                   make_number (BEG), make_number (Z));
8259       if (XBUFFER (coding->src_object) != current_buffer)
8260         kill_src_buffer = 1;
8261       coding->src_object = Fcurrent_buffer ();
8262       if (BEG != GPT)
8263         move_gap_both (BEG, BEG_BYTE);
8264       coding->src_chars = Z - BEG;
8265       coding->src_bytes = Z_BYTE - BEG_BYTE;
8266       coding->src_pos = BEG;
8267       coding->src_pos_byte = BEG_BYTE;
8268       coding->src_multibyte = Z < Z_BYTE;
8269     }
8270   else if (STRINGP (src_object))
8271     {
8272       code_conversion_save (0, 0);
8273       coding->src_pos = from;
8274       coding->src_pos_byte = from_byte;
8275     }
8276   else if (BUFFERP (src_object))
8277     {
8278       code_conversion_save (0, 0);
8279       set_buffer_internal (XBUFFER (src_object));
8280       if (EQ (src_object, dst_object))
8281         {
               /* In place: the value returned by del_range_1 becomes the
                  source, read from position 0.
                  NOTE(review): presumably that value is the deleted text
                  as a string; confirm against del_range_1.  */
8282           saved_pt = PT, saved_pt_byte = PT_BYTE;
8283           coding->src_object = del_range_1 (from, to, 1, 1);
8284           coding->src_pos = 0;
8285           coding->src_pos_byte = 0;
8286         }
8287       else
8288         {
               /* Move the gap to FROM only when it lies inside the
                  region.  */
8289           if (from < GPT && to >= GPT)
8290             move_gap_both (from, from_byte);
8291           coding->src_pos = from;
8292           coding->src_pos_byte = from_byte;
8293         }
8294     }
8295   else
8296     {
8297       code_conversion_save (0, 0);
8298       coding->src_pos = from;
8299       coding->src_pos_byte = from_byte;
8300     }
8301
8302   if (BUFFERP (dst_object))
8303     {
8304       coding->dst_object = dst_object;
8305       if (EQ (src_object, dst_object))
8306         {
8307           coding->dst_pos = from;
8308           coding->dst_pos_byte = from_byte;
8309         }
8310       else
8311         {
8312           struct buffer *current = current_buffer;
8313
               /* Briefly switch to the destination buffer to read its
                  point and to put its gap there.  */
8314           set_buffer_temp (XBUFFER (dst_object));
8315           coding->dst_pos = PT;
8316           coding->dst_pos_byte = PT_BYTE;
8317           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8318           set_buffer_temp (current);
8319         }
8320       coding->dst_multibyte
8321         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8322     }
8323   else if (EQ (dst_object, Qt))
8324     {
           /* Start with one byte per source character (at least one
              byte).
              NOTE(review): presumably the encoder enlarges the area as
              needed; confirm in encode_coding.  */
8325       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8326       coding->dst_object = Qnil;
8327       coding->destination = xmalloc (dst_bytes);
8328       coding->dst_bytes = dst_bytes;
8329       coding->dst_multibyte = 0;
8330     }
8331   else
8332     {
8333       coding->dst_object = Qnil;
8334       coding->dst_multibyte = 0;
8335     }
8336
8337   encode_coding (coding);
8338
8339   if (EQ (dst_object, Qt))
8340     {
8341       if (BUFFERP (coding->dst_object))
8342         coding->dst_object = Fbuffer_string ();
8343       else if (coding->raw_destination)
8344         /* This is used to avoid creating huge Lisp string.
8345            NOTE: caller who sets `raw_destination' is also
8346            responsible for freeing `destination' buffer.  */
8347         coding->dst_object = Qnil;
8348       else
8349         {
               /* Wrap the produced bytes in a unibyte string and release
                  the malloc'ed area.  */
8350           coding->dst_object
8351             = make_unibyte_string ((char *) coding->destination,
8352                                    coding->produced);
8353           xfree (coding->destination);
8354         }
8355     }
8356
8357   if (saved_pt >= 0)
8358     {
8359       /* This is the case of:
8360          (BUFFERP (src_object) && EQ (src_object, dst_object))
8361          As we have moved PT while replacing the original buffer
8362          contents, we must recover it now.  */
8363       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is unchanged; point inside the
              region goes to FROM; point after the region is shifted by
              the change in size.  In a unibyte buffer the byte delta is
              used for both the character and the byte position.  */
8364       if (saved_pt < from)
8365         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8366       else if (saved_pt < from + chars)
8367         TEMP_SET_PT_BOTH (from, from_byte);
8368       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8369         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8370                           saved_pt_byte + (coding->produced - bytes));
8371       else
8372         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8373                           saved_pt_byte + (coding->produced - bytes));
8374
8375       if (need_marker_adjustment)
8376         {
8377           struct Lisp_Marker *tail;
8378
               /* Flagged insertion-type markers are put at FROM; the
                  other flagged markers are put at the end of the encoded
                  text.  */
8379           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8380             if (tail->need_adjustment)
8381               {
8382                 tail->need_adjustment = 0;
8383                 if (tail->insertion_type)
8384                   {
8385                     tail->bytepos = from_byte;
8386                     tail->charpos = from;
8387                   }
8388                 else
8389                   {
8390                     tail->bytepos = from_byte + coding->produced;
8391                     tail->charpos
8392                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8393                          ? tail->bytepos : from + coding->produced_char);
8394                   }
8395               }
8396         }
8397     }
8398
8399   if (kill_src_buffer)
8400     Fkill_buffer (coding->src_object);
8401
8402   Vdeactivate_mark = old_deactivate_mark;
       /* Unwind back to the specpdl depth recorded on entry.  */
8403   unbind_to (count, Qnil);
8404 }
8405
8406
8407 Lisp_Object
8408 preferred_coding_system (void)
8409 {
8410   int id = coding_categories[coding_priorities[0]].id;
8411
8412   return CODING_ID_NAME (id);
8413 }
8414
8415 #if defined (WINDOWSNT) || defined (CYGWIN)
8416
8417 Lisp_Object
8418 from_unicode (Lisp_Object str)
8419 {
8420   CHECK_STRING (str);
8421   if (!STRING_MULTIBYTE (str) &&
8422       SBYTES (str) & 1)
8423     {
8424       str = Fsubstring (str, make_number (0), make_number (-1));
8425     }
8426
8427   return code_convert_string_norecord (str, Qutf_16le, 0);
8428 }
8429
8430 Lisp_Object
8431 from_unicode_buffer (const wchar_t *wstr)
8432 {
8433     return from_unicode (
8434         make_unibyte_string (
8435             (char *) wstr,
8436             /* we get one of the two final 0 bytes for free. */
8437             1 + sizeof (wchar_t) * wcslen (wstr)));
8438 }
8439
8440 wchar_t *
8441 to_unicode (Lisp_Object str, Lisp_Object *buf)
8442 {
8443   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8444   /* We need to make another copy (in addition to the one made by
8445      code_convert_string_norecord) to ensure that the final string is
8446      _doubly_ zero terminated --- that is, that the string is
8447      terminated by two zero bytes and one utf-16le null character.
8448      Because strings are already terminated with a single zero byte,
8449      we just add one additional zero. */
8450   str = make_uninit_string (SBYTES (*buf) + 1);
8451   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8452   SDATA (str) [SBYTES (*buf)] = '\0';
8453   *buf = str;
8454   return WCSDATA (*buf);
8455 }
8456
8457 #endif /* WINDOWSNT || CYGWIN */
8458
8459 \f
8460 #ifdef emacs
8461 /*** 11. Emacs Lisp library functions ***/
8462
8463 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8464        doc: /* Return t if OBJECT is nil or a coding-system.
8465 See the documentation of `define-coding-system' for information
8466 about coding-system objects.  */)
8467   (Lisp_Object object)
8468 {
8469   if (NILP (object)
8470       || CODING_SYSTEM_ID (object) >= 0)
8471     return Qt;
8472   if (! SYMBOLP (object)
8473       || NILP (Fget (object, Qcoding_system_define_form)))
8474     return Qnil;
8475   return Qt;
8476 }
8477
8478 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8479        Sread_non_nil_coding_system, 1, 1, 0,
8480        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8481   (Lisp_Object prompt)
8482 {
8483   Lisp_Object val;
8484   do
8485     {
8486       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8487                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8488     }
8489   while (SCHARS (val) == 0);
8490   return (Fintern (val, Qnil));
8491 }
8492
8493 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8494        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8495 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8496 Ignores case when completing coding systems (all Emacs coding systems
8497 are lower-case).  */)
8498   (Lisp_Object prompt, Lisp_Object default_coding_system)
8499 {
8500   Lisp_Object val;
8501   ptrdiff_t count = SPECPDL_INDEX ();
8502
8503   if (SYMBOLP (default_coding_system))
8504     default_coding_system = SYMBOL_NAME (default_coding_system);
8505   specbind (Qcompletion_ignore_case, Qt);
8506   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8507                           Qt, Qnil, Qcoding_system_history,
8508                           default_coding_system, Qnil);
8509   unbind_to (count, Qnil);
8510   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8511 }
8512
8513 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8514        1, 1, 0,
8515        doc: /* Check validity of CODING-SYSTEM.
8516 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8517 It is valid if it is nil or a symbol defined as a coding system by the
8518 function `define-coding-system'.  */)
8519   (Lisp_Object coding_system)
8520 {
8521   Lisp_Object define_form;
8522
8523   define_form = Fget (coding_system, Qcoding_system_define_form);
8524   if (! NILP (define_form))
8525     {
8526       Fput (coding_system, Qcoding_system_define_form, Qnil);
8527       safe_eval (define_form);
8528     }
8529   if (!NILP (Fcoding_system_p (coding_system)))
8530     return coding_system;
8531   xsignal1 (Qcoding_system_error, coding_system);
8532 }
8533
8534 \f
8535 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8536    HIGHEST, return the coding system of the highest
8537    priority among the detected coding systems.  Otherwise return a
8538    list of detected coding systems sorted by their priorities.  If
8539    MULTIBYTEP, it is assumed that the bytes are in correct
8540    multibyte form but contain only ASCII and eight-bit chars.
8541    Otherwise, the bytes are raw bytes.
8542
8543    CODING-SYSTEM controls the detection as below:
8544
8545    If it is nil, detect both text-format and eol-format.  If the
8546    text-format part of CODING-SYSTEM is already specified
8547    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8548    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8549    detect only text-format.

        When HIGHEST is set and nothing was detected, the value is nil.  */
8550
8551 Lisp_Object
8552 detect_coding_system (const unsigned char *src,
8553                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8554                       bool highest, bool multibytep,
8555                       Lisp_Object coding_system)
8556 {
8557   const unsigned char *src_end = src + src_bytes;
8558   Lisp_Object attrs, eol_type;
       /* VAL first holds a list of coding system IDs; the final loop
          replaces each ID by a coding system name.  */
8559   Lisp_Object val = Qnil;
8560   struct coding_system coding;
8561   ptrdiff_t id;
8562   struct coding_detection_info detect_info;
8563   enum coding_category base_category;
8564   bool null_byte_found = 0, eight_bit_found = 0;
8565
       /* A nil CODING_SYSTEM means `undecided'.  After the setup,
          CODING_SYSTEM is replaced by the base name taken from the
          attributes.  */
8566   if (NILP (coding_system))
8567     coding_system = Qundecided;
8568   setup_coding_system (coding_system, &coding);
8569   attrs = CODING_ID_ATTRS (coding.id);
8570   eol_type = CODING_ID_EOL_TYPE (coding.id);
8571   coding_system = CODING_ATTR_BASE_NAME (attrs);
8572
8573   coding.source = src;
8574   coding.src_chars = src_chars;
8575   coding.src_bytes = src_bytes;
8576   coding.src_multibyte = multibytep;
8577   coding.consumed = 0;
8578   coding.mode |= CODING_MODE_LAST_BLOCK;
       /* Counts the bytes below 0x80 seen before the first byte with the
          high bit set (see the scanning loop below).  */
8579   coding.head_ascii = 0;
8580
8581   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8582
8583   /* At first, detect text-format if necessary.  */
8584   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8585   if (base_category == coding_category_undecided)
8586     {
8587       enum coding_category category IF_LINT (= 0);
8588       struct coding_system *this IF_LINT (= NULL);
8589       int c, i;
8590       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8591                                        inhibit_null_byte_detection);
8592       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8593                                        inhibit_iso_escape_detection);
8594       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8595
8596       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* The scan stops early once both a null byte and an
              eight-bit byte have been seen, or when the ISO-2022
              detector returns non-zero.  */
8597       for (; src < src_end; src++)
8598         {
8599           c = *src;
8600           if (c & 0x80)
8601             {
8602               eight_bit_found = 1;
8603               if (null_byte_found)
8604                 break;
8605             }
8606           else if (c < 0x20)
8607             {
                   /* ESC, SI and SO trigger the ISO-2022 detector, but
                      only while no detector has run yet and escape
                      detection is not inhibited.  */
8608               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8609                   && ! inhibit_ied
8610                   && ! detect_info.checked)
8611                 {
8612                   if (detect_coding_iso_2022 (&coding, &detect_info))
8613                     {
8614                       /* We have scanned the whole data.  */
8615                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8616                         {
8617                           /* We didn't find an 8-bit code.  We may
8618                              have found a null-byte, but it's very
8619                              rare that a binary file conforms to
8620                              ISO-2022.  */
8621                           src = src_end;
8622                           coding.head_ascii = src - coding.source;
8623                         }
8624                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8625                       break;
8626                     }
8627                 }
8628               else if (! c && !inhibit_nbd)
8629                 {
8630                   null_byte_found = 1;
8631                   if (eight_bit_found)
8632                     break;
8633                 }
8634               if (! eight_bit_found)
8635                 coding.head_ascii++;
8636             }
8637           else if (! eight_bit_found)
8638             coding.head_ascii++;
8639         }
8640
           /* Detectors are consulted only if the scan saw something
              other than plain ASCII, or the ISO-2022 detector already
              found a match.  */
8641       if (null_byte_found || eight_bit_found
8642           || coding.head_ascii < coding.src_bytes
8643           || detect_info.found)
8644         {
8645           if (coding.head_ascii == coding.src_bytes)
8646             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8647             for (i = 0; i < coding_category_raw_text; i++)
8648               {
8649                 category = coding_priorities[i];
8650                 this = coding_categories + category;
8651                 if (detect_info.found & (1 << category))
8652                   break;
8653               }
8654           else
8655             {
                   /* A null byte leaves only the UTF-16 categories
                      open.  Otherwise, if UTF-8 is preferred and the
                      UTF-8 detector accepts the data, only the UTF-8
                      categories stay open.  */
8656               if (null_byte_found)
8657                 {
8658                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8659                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8660                 }
8661               else if (prefer_utf_8
8662                        && detect_coding_utf_8 (&coding, &detect_info))
8663                 {
8664                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8665                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8666                 }
                   /* Walk the categories in priority order, running the
                      detector of each category that has not been checked
                      yet.  With HIGHEST, stop at the first category that
                      is found.  */
8667               for (i = 0; i < coding_category_raw_text; i++)
8668                 {
8669                   category = coding_priorities[i];
8670                   this = coding_categories + category;
8671
8672                   if (this->id < 0)
8673                     {
8674                       /* No coding system of this category is defined.  */
8675                       detect_info.rejected |= (1 << category);
8676                     }
8677                   else if (category >= coding_category_raw_text)
8678                     continue;
8679                   else if (detect_info.checked & (1 << category))
8680                     {
8681                       if (highest
8682                           && (detect_info.found & (1 << category)))
8683                         break;
8684                     }
8685                   else if ((*(this->detector)) (&coding, &detect_info)
8686                            && highest
8687                            && (detect_info.found & (1 << category)))
8688                     {
                           /* NOTE(review): CATEGORY is narrowed to the
                              LE or BE variant here but THIS still points
                              at the utf-16-auto entry, and THIS->id is
                              what the HIGHEST branch below puts in VAL;
                              confirm this is intended.  */
8689                       if (category == coding_category_utf_16_auto)
8690                         {
8691                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8692                             category = coding_category_utf_16_le;
8693                           else
8694                             category = coding_category_utf_16_be;
8695                         }
8696                       break;
8697                     }
8698                 }
8699             }
8700         }
8701
           /* Build VAL as a list of coding system IDs.  */
8702       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8703           || null_byte_found)
8704         {
               /* Everything was rejected, or a null byte was seen: report
                  no-conversion.  */
8705           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8706           id = CODING_SYSTEM_ID (Qno_conversion);
8707           val = list1 (make_number (id));
8708         }
8709       else if (! detect_info.rejected && ! detect_info.found)
8710         {
               /* Nothing was rejected and nothing was found: report the
                  coding system of the `undecided' category.  */
8711           detect_info.found = CATEGORY_MASK_ANY;
8712           id = coding_categories[coding_category_undecided].id;
8713           val = list1 (make_number (id));
8714         }
8715       else if (highest)
8716         {
               /* One answer only: the category the loops above stopped
                  at, or else the first category in priority order that
                  was not rejected.  */
8717           if (detect_info.found)
8718             {
8719               detect_info.found = 1 << category;
8720               val = list1 (make_number (this->id));
8721             }
8722           else
8723             for (i = 0; i < coding_category_raw_text; i++)
8724               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8725                 {
8726                   detect_info.found = 1 << coding_priorities[i];
8727                   id = coding_categories[coding_priorities[i]].id;
8728                   val = list1 (make_number (id));
8729                   break;
8730                 }
8731         }
8732       else
8733         {
8734           int mask = detect_info.rejected | detect_info.found;
8735           int found = 0;
8736
               /* First pass, from lowest to highest priority, over the
                  categories that were neither rejected nor found.  VAL
                  is overwritten on each match, so it ends up holding
                  only the last such category that has a valid ID.  */
8737           for (i = coding_category_raw_text - 1; i >= 0; i--)
8738             {
8739               category = coding_priorities[i];
8740               if (! (mask & (1 << category)))
8741                 {
8742                   found |= 1 << category;
8743                   id = coding_categories[category].id;
8744                   if (id >= 0)
8745                     val = list1 (make_number (id));
8746                 }
8747             }
               /* Second pass, again from lowest to highest priority:
                  push every found category on the front, so the highest
                  priority one ends up first.  */
8748           for (i = coding_category_raw_text - 1; i >= 0; i--)
8749             {
8750               category = coding_priorities[i];
8751               if (detect_info.found & (1 << category))
8752                 {
8753                   id = coding_categories[category].id;
8754                   val = Fcons (make_number (id), val);
8755                 }
8756             }
8757           detect_info.found |= found;
8758         }
8759     }
8760   else if (base_category == coding_category_utf_8_auto)
8761     {
           /* Only choose between the UTF-8 variants with and without a
              signature.  */
8762       if (detect_coding_utf_8 (&coding, &detect_info))
8763         {
8764           struct coding_system *this;
8765
8766           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8767             this = coding_categories + coding_category_utf_8_sig;
8768           else
8769             this = coding_categories + coding_category_utf_8_nosig;
8770           val = list1 (make_number (this->id));
8771         }
8772     }
8773   else if (base_category == coding_category_utf_16_auto)
8774     {
           /* Only choose between the UTF-16 variants.  */
8775       if (detect_coding_utf_16 (&coding, &detect_info))
8776         {
8777           struct coding_system *this;
8778
8779           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8780             this = coding_categories + coding_category_utf_16_le;
8781           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8782             this = coding_categories + coding_category_utf_16_be;
8783           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8784             this = coding_categories + coding_category_utf_16_be_nosig;
8785           else
8786             this = coding_categories + coding_category_utf_16_le_nosig;
8787           val = list1 (make_number (this->id));
8788         }
8789     }
8790   else
8791     {
           /* The text format is already decided: report the given coding
              system itself.  */
8792       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8793       val = list1 (make_number (coding.id));
8794     }
8795
8796   /* Then, detect eol-format if necessary.  */
8797   {
         /* -1 means that no end-of-line format was determined for that
            family of categories.  */
8798     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8799     Lisp_Object tail;
8800
         /* A vector EOL_TYPE means the end-of-line format still has to be
            detected from the data; otherwise it is fixed by the coding
            system.  */
8801     if (VECTORP (eol_type))
8802       {
8803         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8804           {
8805             if (null_byte_found)
8806               normal_eol = EOL_SEEN_LF;
8807             else
8808               normal_eol = detect_eol (coding.source, src_bytes,
8809                                        coding_category_raw_text);
8810           }
8811         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8812                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8813           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8814                                       coding_category_utf_16_be);
8815         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8816                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8817           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8818                                       coding_category_utf_16_le);
8819       }
8820     else
8821       {
8822         if (EQ (eol_type, Qunix))
8823           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8824         else if (EQ (eol_type, Qdos))
8825           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8826         else
8827           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8828       }
8829
         /* Replace each ID in VAL by a coding system name.  When the
            coding system's own eol type is a vector, element 0, 1 or 2
            of that vector is used for LF, CRLF or CR respectively;
            otherwise the plain name of the ID is used.  */
8830     for (tail = val; CONSP (tail); tail = XCDR (tail))
8831       {
8832         enum coding_category category;
8833         int this_eol;
8834
8835         id = XINT (XCAR (tail));
8836         attrs = CODING_ID_ATTRS (id);
8837         category = XINT (CODING_ATTR_CATEGORY (attrs));
8838         eol_type = CODING_ID_EOL_TYPE (id);
8839         if (VECTORP (eol_type))
8840           {
8841             if (category == coding_category_utf_16_be
8842                 || category == coding_category_utf_16_be_nosig)
8843               this_eol = utf_16_be_eol;
8844             else if (category == coding_category_utf_16_le
8845                      || category == coding_category_utf_16_le_nosig)
8846               this_eol = utf_16_le_eol;
8847             else
8848               this_eol = normal_eol;
8849
8850             if (this_eol == EOL_SEEN_LF)
8851               XSETCAR (tail, AREF (eol_type, 0));
8852             else if (this_eol == EOL_SEEN_CRLF)
8853               XSETCAR (tail, AREF (eol_type, 1));
8854             else if (this_eol == EOL_SEEN_CR)
8855               XSETCAR (tail, AREF (eol_type, 2));
8856             else
8857               XSETCAR (tail, CODING_ID_NAME (id));
8858           }
8859         else
8860           XSETCAR (tail, CODING_ID_NAME (id));
8861       }
8862   }
8863
       /* With HIGHEST only the first element is returned, or nil if the
          list is empty.  */
8864   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8865 }
8866
8867
8868 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8869        2, 3, 0,
8870        doc: /* Detect coding system of the text in the region between START and END.
8871 Return a list of possible coding systems ordered by priority.
8872 The coding systems to try and their priorities follows what
8873 the function `coding-system-priority-list' (which see) returns.
8874
8875 If only ASCII characters are found (except for such ISO-2022 control
8876 characters as ESC), it returns a list of single element `undecided'
8877 or its subsidiary coding system according to a detected end-of-line
8878 format.
8879
8880 If optional argument HIGHEST is non-nil, return the coding system of
8881 highest priority.  */)
8882   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8883 {
8884   ptrdiff_t from, to;
8885   ptrdiff_t from_byte, to_byte;
8886
8887   validate_region (&start, &end);
8888   from = XINT (start), to = XINT (end);
8889   from_byte = CHAR_TO_BYTE (from);
8890   to_byte = CHAR_TO_BYTE (to);
8891
8892   if (from < GPT && to >= GPT)
8893     move_gap_both (to, to_byte);
8894
8895   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8896                                to - from, to_byte - from_byte,
8897                                !NILP (highest),
8898                                !NILP (BVAR (current_buffer
8899                                       , enable_multibyte_characters)),
8900                                Qnil);
8901 }
8902
8903 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8904        1, 2, 0,
8905        doc: /* Detect coding system of the text in STRING.
8906 Return a list of possible coding systems ordered by priority.
8907 The coding systems to try and their priorities follows what
8908 the function `coding-system-priority-list' (which see) returns.
8909
8910 If only ASCII characters are found (except for such ISO-2022 control
8911 characters as ESC), it returns a list of single element `undecided'
8912 or its subsidiary coding system according to a detected end-of-line
8913 format.
8914
8915 If optional argument HIGHEST is non-nil, return the coding system of
8916 highest priority.  */)
8917   (Lisp_Object string, Lisp_Object highest)
8918 {
8919   CHECK_STRING (string);
8920
8921   return detect_coding_system (SDATA (string),
8922                                SCHARS (string), SBYTES (string),
8923                                !NILP (highest), STRING_MULTIBYTE (string),
8924                                Qnil);
8925 }
8926
8927
8928 static bool
8929 char_encodable_p (int c, Lisp_Object attrs)
8930 {
8931   Lisp_Object tail;
8932   struct charset *charset;
8933   Lisp_Object translation_table;
8934
8935   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8936   if (! NILP (translation_table))
8937     c = translate_char (translation_table, c);
8938   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8939        CONSP (tail); tail = XCDR (tail))
8940     {
8941       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8942       if (CHAR_CHARSET_P (c, charset))
8943         break;
8944     }
8945   return (! NILP (tail));
8946 }
8947
8948
8949 /* Return a list of coding systems that safely encode the text between
8950    START and END.  If EXCLUDE is non-nil, it is a list of coding
8951    systems not to check.  The returned list doesn't contain any such
8952    coding systems.  In any case, if the text contains only ASCII or is
8953    unibyte, return t.  */
8954
8955 DEFUN ("find-coding-systems-region-internal",
8956        Ffind_coding_systems_region_internal,
8957        Sfind_coding_systems_region_internal, 2, 3, 0,
8958        doc: /* Internal use only.  */)
8959   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8960 {
8961   Lisp_Object coding_attrs_list, safe_codings;
8962   ptrdiff_t start_byte, end_byte;
8963   const unsigned char *p, *pbeg, *pend;
8964   int c;
8965   Lisp_Object tail, elt, work_table;
8966
8967   if (STRINGP (start))
8968     {
8969       if (!STRING_MULTIBYTE (start)
8970           || SCHARS (start) == SBYTES (start))
8971         return Qt;
8972       start_byte = 0;
8973       end_byte = SBYTES (start);
8974     }
8975   else
8976     {
8977       CHECK_NUMBER_COERCE_MARKER (start);
8978       CHECK_NUMBER_COERCE_MARKER (end);
8979       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8980         args_out_of_range (start, end);
8981       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8982         return Qt;
8983       start_byte = CHAR_TO_BYTE (XINT (start));
8984       end_byte = CHAR_TO_BYTE (XINT (end));
8985       if (XINT (end) - XINT (start) == end_byte - start_byte)
8986         return Qt;
8987
8988       if (XINT (start) < GPT && XINT (end) > GPT)
8989         {
8990           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8991             move_gap_both (XINT (start), start_byte);
8992           else
8993             move_gap_both (XINT (end), end_byte);
8994         }
8995     }
8996
8997   coding_attrs_list = Qnil;
8998   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8999     if (NILP (exclude)
9000         || NILP (Fmemq (XCAR (tail), exclude)))
9001       {
9002         Lisp_Object attrs;
9003
9004         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9005         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9006           {
9007             ASET (attrs, coding_attr_trans_tbl,
9008                   get_translation_table (attrs, 1, NULL));
9009             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9010           }
9011       }
9012
9013   if (STRINGP (start))
9014     p = pbeg = SDATA (start);
9015   else
9016     p = pbeg = BYTE_POS_ADDR (start_byte);
9017   pend = p + (end_byte - start_byte);
9018
9019   while (p < pend && ASCII_CHAR_P (*p)) p++;
9020   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9021
9022   work_table = Fmake_char_table (Qnil, Qnil);
9023   while (p < pend)
9024     {
9025       if (ASCII_CHAR_P (*p))
9026         p++;
9027       else
9028         {
9029           c = STRING_CHAR_ADVANCE (p);
9030           if (!NILP (char_table_ref (work_table, c)))
9031             /* This character was already checked.  Ignore it.  */
9032             continue;
9033
9034           charset_map_loaded = 0;
9035           for (tail = coding_attrs_list; CONSP (tail);)
9036             {
9037               elt = XCAR (tail);
9038               if (NILP (elt))
9039                 tail = XCDR (tail);
9040               else if (char_encodable_p (c, elt))
9041                 tail = XCDR (tail);
9042               else if (CONSP (XCDR (tail)))
9043                 {
9044                   XSETCAR (tail, XCAR (XCDR (tail)));
9045                   XSETCDR (tail, XCDR (XCDR (tail)));
9046                 }
9047               else
9048                 {
9049                   XSETCAR (tail, Qnil);
9050                   tail = XCDR (tail);
9051                 }
9052             }
9053           if (charset_map_loaded)
9054             {
9055               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9056
9057               if (STRINGP (start))
9058                 pbeg = SDATA (start);
9059               else
9060                 pbeg = BYTE_POS_ADDR (start_byte);
9061               p = pbeg + p_offset;
9062               pend = pbeg + pend_offset;
9063             }
9064           char_table_set (work_table, c, Qt);
9065         }
9066     }
9067
9068   safe_codings = list2 (Qraw_text, Qno_conversion);
9069   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9070     if (! NILP (XCAR (tail)))
9071       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9072
9073   return safe_codings;
9074 }
9075
9076
9077 DEFUN ("unencodable-char-position", Funencodable_char_position,
9078        Sunencodable_char_position, 3, 5, 0,
9079        doc: /* Return position of first un-encodable character in a region.
9080 START and END specify the region and CODING-SYSTEM specifies the
9081 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9082
9083 If optional 4th argument COUNT is non-nil, it specifies at most how
9084 many un-encodable characters to search.  In this case, the value is a
9085 list of positions.
9086
9087 If optional 5th argument STRING is non-nil, it is a string to search
9088 for un-encodable characters.  In that case, START and END are indexes
9089 to the string and treated as in `substring'.  */)
9090   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9091    Lisp_Object count, Lisp_Object string)
9092 {
9093   EMACS_INT n;
9094   struct coding_system coding;
9095   Lisp_Object attrs, charset_list, translation_table;
9096   Lisp_Object positions;
9097   ptrdiff_t from, to;
9098   const unsigned char *p, *stop, *pend;
9099   bool ascii_compatible;
9100
9101   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9102   attrs = CODING_ID_ATTRS (coding.id);
9103   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9104     return Qnil;
9105   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9106   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9107   translation_table = get_translation_table (attrs, 1, NULL);
9108
9109   if (NILP (string))
9110     {
9111       validate_region (&start, &end);
9112       from = XINT (start);
9113       to = XINT (end);
9114       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9115           || (ascii_compatible
9116               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9117         return Qnil;
9118       p = CHAR_POS_ADDR (from);
9119       pend = CHAR_POS_ADDR (to);
9120       if (from < GPT && to >= GPT)
9121         stop = GPT_ADDR;
9122       else
9123         stop = pend;
9124     }
9125   else
9126     {
9127       CHECK_STRING (string);
9128       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9129       if (! STRING_MULTIBYTE (string))
9130         return Qnil;
9131       p = SDATA (string) + string_char_to_byte (string, from);
9132       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9133       if (ascii_compatible && (to - from) == (pend - p))
9134         return Qnil;
9135     }
9136
9137   if (NILP (count))
9138     n = 1;
9139   else
9140     {
9141       CHECK_NATNUM (count);
9142       n = XINT (count);
9143     }
9144
9145   positions = Qnil;
9146   charset_map_loaded = 0;
9147   while (1)
9148     {
9149       int c;
9150
9151       if (ascii_compatible)
9152         while (p < stop && ASCII_CHAR_P (*p))
9153           p++, from++;
9154       if (p >= stop)
9155         {
9156           if (p >= pend)
9157             break;
9158           stop = pend;
9159           p = GAP_END_ADDR;
9160         }
9161
9162       c = STRING_CHAR_ADVANCE (p);
9163       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9164           && ! char_charset (translate_char (translation_table, c),
9165                              charset_list, NULL))
9166         {
9167           positions = Fcons (make_number (from), positions);
9168           n--;
9169           if (n == 0)
9170             break;
9171         }
9172
9173       from++;
9174       if (charset_map_loaded && NILP (string))
9175         {
9176           p = CHAR_POS_ADDR (from);
9177           pend = CHAR_POS_ADDR (to);
9178           if (from < GPT && to >= GPT)
9179             stop = GPT_ADDR;
9180           else
9181             stop = pend;
9182           charset_map_loaded = 0;
9183         }
9184     }
9185
9186   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9187 }
9188
9189
9190 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9191        Scheck_coding_systems_region, 3, 3, 0,
9192        doc: /* Check if the region is encodable by coding systems.
9193
9194 START and END are buffer positions specifying the region.
9195 CODING-SYSTEM-LIST is a list of coding systems to check.
9196
9197 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9198 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9199 whole region, POS0, POS1, ... are buffer positions where non-encodable
9200 characters are found.
9201
9202 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9203 value is nil.
9204
9205 START may be a string.  In that case, check if the string is
9206 encodable, and the value contains indices to the string instead of
9207 buffer positions.  END is ignored.
9208
9209 If the current buffer (or START if it is a string) is unibyte, the value
9210 is nil.  */)
9211   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9212 {
9213   Lisp_Object list;
9214   ptrdiff_t start_byte, end_byte;
9215   ptrdiff_t pos;
9216   const unsigned char *p, *pbeg, *pend;
9217   int c;
9218   Lisp_Object tail, elt, attrs;
9219
9220   if (STRINGP (start))
9221     {
9222       if (!STRING_MULTIBYTE (start)
9223           || SCHARS (start) == SBYTES (start))
9224         return Qnil;
9225       start_byte = 0;
9226       end_byte = SBYTES (start);
9227       pos = 0;
9228     }
9229   else
9230     {
9231       CHECK_NUMBER_COERCE_MARKER (start);
9232       CHECK_NUMBER_COERCE_MARKER (end);
9233       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9234         args_out_of_range (start, end);
9235       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9236         return Qnil;
9237       start_byte = CHAR_TO_BYTE (XINT (start));
9238       end_byte = CHAR_TO_BYTE (XINT (end));
9239       if (XINT (end) - XINT (start) == end_byte - start_byte)
9240         return Qnil;
9241
9242       if (XINT (start) < GPT && XINT (end) > GPT)
9243         {
9244           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9245             move_gap_both (XINT (start), start_byte);
9246           else
9247             move_gap_both (XINT (end), end_byte);
9248         }
9249       pos = XINT (start);
9250     }
9251
9252   list = Qnil;
9253   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9254     {
9255       elt = XCAR (tail);
9256       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9257       ASET (attrs, coding_attr_trans_tbl,
9258             get_translation_table (attrs, 1, NULL));
9259       list = Fcons (list2 (elt, attrs), list);
9260     }
9261
9262   if (STRINGP (start))
9263     p = pbeg = SDATA (start);
9264   else
9265     p = pbeg = BYTE_POS_ADDR (start_byte);
9266   pend = p + (end_byte - start_byte);
9267
9268   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9269   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9270
9271   while (p < pend)
9272     {
9273       if (ASCII_CHAR_P (*p))
9274         p++;
9275       else
9276         {
9277           c = STRING_CHAR_ADVANCE (p);
9278
9279           charset_map_loaded = 0;
9280           for (tail = list; CONSP (tail); tail = XCDR (tail))
9281             {
9282               elt = XCDR (XCAR (tail));
9283               if (! char_encodable_p (c, XCAR (elt)))
9284                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9285             }
9286           if (charset_map_loaded)
9287             {
9288               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9289
9290               if (STRINGP (start))
9291                 pbeg = SDATA (start);
9292               else
9293                 pbeg = BYTE_POS_ADDR (start_byte);
9294               p = pbeg + p_offset;
9295               pend = pbeg + pend_offset;
9296             }
9297         }
9298       pos++;
9299     }
9300
9301   tail = list;
9302   list = Qnil;
9303   for (; CONSP (tail); tail = XCDR (tail))
9304     {
9305       elt = XCAR (tail);
9306       if (CONSP (XCDR (XCDR (elt))))
9307         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9308                       list);
9309     }
9310
9311   return list;
9312 }
9313
9314
9315 static Lisp_Object
9316 code_convert_region (Lisp_Object start, Lisp_Object end,
9317                      Lisp_Object coding_system, Lisp_Object dst_object,
9318                      bool encodep, bool norecord)
9319 {
9320   struct coding_system coding;
9321   ptrdiff_t from, from_byte, to, to_byte;
9322   Lisp_Object src_object;
9323
9324   if (NILP (coding_system))
9325     coding_system = Qno_conversion;
9326   else
9327     CHECK_CODING_SYSTEM (coding_system);
9328   src_object = Fcurrent_buffer ();
9329   if (NILP (dst_object))
9330     dst_object = src_object;
9331   else if (! EQ (dst_object, Qt))
9332     CHECK_BUFFER (dst_object);
9333
9334   validate_region (&start, &end);
9335   from = XFASTINT (start);
9336   from_byte = CHAR_TO_BYTE (from);
9337   to = XFASTINT (end);
9338   to_byte = CHAR_TO_BYTE (to);
9339
9340   setup_coding_system (coding_system, &coding);
9341   coding.mode |= CODING_MODE_LAST_BLOCK;
9342
9343   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9344     {
9345       struct buffer *buf = XBUFFER (dst_object);
9346       ptrdiff_t buf_pt = BUF_PT (buf);
9347
9348       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9349     }
9350
9351   if (encodep)
9352     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9353                           dst_object);
9354   else
9355     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9356                           dst_object);
9357   if (! norecord)
9358     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9359
9360   return (BUFFERP (dst_object)
9361           ? make_number (coding.produced_char)
9362           : coding.dst_object);
9363 }
9364
9365
9366 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9367        3, 4, "r\nzCoding system: ",
9368        doc: /* Decode the current region from the specified coding system.
9369 When called from a program, takes four arguments:
9370         START, END, CODING-SYSTEM, and DESTINATION.
9371 START and END are buffer positions.
9372
9373 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9374 If nil, the region between START and END is replaced by the decoded text.
9375 If buffer, the decoded text is inserted in that buffer after point (point
9376 does not move).
9377 In those cases, the length of the decoded text is returned.
9378 If DESTINATION is t, the decoded text is returned.
9379
9380 This function sets `last-coding-system-used' to the precise coding system
9381 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9382 not fully specified.)  */)
9383   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9384 {
9385   return code_convert_region (start, end, coding_system, destination, 0, 0);
9386 }
9387
9388 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9389        3, 4, "r\nzCoding system: ",
9390        doc: /* Encode the current region by specified coding system.
9391 When called from a program, takes four arguments:
9392         START, END, CODING-SYSTEM and DESTINATION.
9393 START and END are buffer positions.
9394
9395 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9396 If nil, the region between START and END is replace by the encoded text.
9397 If buffer, the encoded text is inserted in that buffer after point (point
9398 does not move).
9399 In those cases, the length of the encoded text is returned.
9400 If DESTINATION is t, the encoded text is returned.
9401
9402 This function sets `last-coding-system-used' to the precise coding system
9403 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9404 not fully specified.)  */)
9405   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9406 {
9407   return code_convert_region (start, end, coding_system, destination, 1, 0);
9408 }
9409
9410 Lisp_Object
9411 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9412                      Lisp_Object dst_object, bool encodep, bool nocopy,
9413                      bool norecord)
9414 {
9415   struct coding_system coding;
9416   ptrdiff_t chars, bytes;
9417
9418   CHECK_STRING (string);
9419   if (NILP (coding_system))
9420     {
9421       if (! norecord)
9422         Vlast_coding_system_used = Qno_conversion;
9423       if (NILP (dst_object))
9424         return (nocopy ? Fcopy_sequence (string) : string);
9425     }
9426
9427   if (NILP (coding_system))
9428     coding_system = Qno_conversion;
9429   else
9430     CHECK_CODING_SYSTEM (coding_system);
9431   if (NILP (dst_object))
9432     dst_object = Qt;
9433   else if (! EQ (dst_object, Qt))
9434     CHECK_BUFFER (dst_object);
9435
9436   setup_coding_system (coding_system, &coding);
9437   coding.mode |= CODING_MODE_LAST_BLOCK;
9438   chars = SCHARS (string);
9439   bytes = SBYTES (string);
9440
9441   if (BUFFERP (dst_object))
9442     {
9443       struct buffer *buf = XBUFFER (dst_object);
9444       ptrdiff_t buf_pt = BUF_PT (buf);
9445
9446       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9447     }
9448
9449   if (encodep)
9450     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9451   else
9452     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9453   if (! norecord)
9454     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9455
9456   return (BUFFERP (dst_object)
9457           ? make_number (coding.produced_char)
9458           : coding.dst_object);
9459 }
9460
9461
9462 /* Encode or decode STRING according to CODING_SYSTEM.
9463    Do not set Vlast_coding_system_used.
9464
9465    This function is called only from macros DECODE_FILE and
9466    ENCODE_FILE, thus we ignore character composition.  */
9467
9468 Lisp_Object
9469 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9470                               bool encodep)
9471 {
9472   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9473 }
9474
9475 /* Encode or decode a file name, to or from a unibyte string suitable
9476    for passing to C library functions.  */
9477 Lisp_Object
9478 decode_file_name (Lisp_Object fname)
9479 {
9480 #ifdef WINDOWSNT
9481   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9482      converts the file names either to UTF-16LE or to the system ANSI
9483      codepage internally, depending on the underlying OS; see w32.c.  */
9484   if (! NILP (Fcoding_system_p (Qutf_8)))
9485     return code_convert_string_norecord (fname, Qutf_8, 0);
9486   return fname;
9487 #else  /* !WINDOWSNT */
9488   if (! NILP (Vfile_name_coding_system))
9489     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9490   else if (! NILP (Vdefault_file_name_coding_system))
9491     return code_convert_string_norecord (fname,
9492                                          Vdefault_file_name_coding_system, 0);
9493   else
9494     return fname;
9495 #endif
9496 }
9497
9498 Lisp_Object
9499 encode_file_name (Lisp_Object fname)
9500 {
9501   /* This is especially important during bootstrap and dumping, when
9502      file-name encoding is not yet known, and therefore any non-ASCII
9503      file names are unibyte strings, and could only be thrashed if we
9504      try to encode them.  */
9505   if (!STRING_MULTIBYTE (fname))
9506     return fname;
9507 #ifdef WINDOWSNT
9508   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9509      converts the file names either to UTF-16LE or to the system ANSI
9510      codepage internally, depending on the underlying OS; see w32.c.  */
9511   if (! NILP (Fcoding_system_p (Qutf_8)))
9512     return code_convert_string_norecord (fname, Qutf_8, 1);
9513   return fname;
9514 #else  /* !WINDOWSNT */
9515   if (! NILP (Vfile_name_coding_system))
9516     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9517   else if (! NILP (Vdefault_file_name_coding_system))
9518     return code_convert_string_norecord (fname,
9519                                          Vdefault_file_name_coding_system, 1);
9520   else
9521     return fname;
9522 #endif
9523 }
9524
9525 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9526        2, 4, 0,
9527        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9528
9529 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9530 if the decoding operation is trivial.
9531
9532 Optional fourth arg BUFFER non-nil means that the decoded text is
9533 inserted in that buffer after point (point does not move).  In this
9534 case, the return value is the length of the decoded text.
9535
9536 This function sets `last-coding-system-used' to the precise coding system
9537 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9538 not fully specified.)  */)
9539   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9540 {
9541   return code_convert_string (string, coding_system, buffer,
9542                               0, ! NILP (nocopy), 0);
9543 }
9544
9545 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9546        2, 4, 0,
9547        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9548
9549 Optional third arg NOCOPY non-nil means it is OK to return STRING
9550 itself if the encoding operation is trivial.
9551
9552 Optional fourth arg BUFFER non-nil means that the encoded text is
9553 inserted in that buffer after point (point does not move).  In this
9554 case, the return value is the length of the encoded text.
9555
9556 This function sets `last-coding-system-used' to the precise coding system
9557 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9558 not fully specified.)  */)
9559   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9560 {
9561   return code_convert_string (string, coding_system, buffer,
9562                               1, ! NILP (nocopy), 0);
9563 }
9564
9565 \f
9566 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9567        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9568 Return the corresponding character.  */)
9569   (Lisp_Object code)
9570 {
9571   Lisp_Object spec, attrs, val;
9572   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9573   EMACS_INT ch;
9574   int c;
9575
9576   CHECK_NATNUM (code);
9577   ch = XFASTINT (code);
9578   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9579   attrs = AREF (spec, 0);
9580
9581   if (ASCII_CHAR_P (ch)
9582       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9583     return code;
9584
9585   val = CODING_ATTR_CHARSET_LIST (attrs);
9586   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9587   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9588   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9589
9590   if (ch <= 0x7F)
9591     {
9592       c = ch;
9593       charset = charset_roman;
9594     }
9595   else if (ch >= 0xA0 && ch < 0xDF)
9596     {
9597       c = ch - 0x80;
9598       charset = charset_kana;
9599     }
9600   else
9601     {
9602       EMACS_INT c1 = ch >> 8;
9603       int c2 = ch & 0xFF;
9604
9605       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9606           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9607         error ("Invalid code: %"pI"d", ch);
9608       c = ch;
9609       SJIS_TO_JIS (c);
9610       charset = charset_kanji;
9611     }
9612   c = DECODE_CHAR (charset, c);
9613   if (c < 0)
9614     error ("Invalid code: %"pI"d", ch);
9615   return make_number (c);
9616 }
9617
9618
9619 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9620        doc: /* Encode a Japanese character CH to shift_jis encoding.
9621 Return the corresponding code in SJIS.  */)
9622   (Lisp_Object ch)
9623 {
9624   Lisp_Object spec, attrs, charset_list;
9625   int c;
9626   struct charset *charset;
9627   unsigned code;
9628
9629   CHECK_CHARACTER (ch);
9630   c = XFASTINT (ch);
9631   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9632   attrs = AREF (spec, 0);
9633
9634   if (ASCII_CHAR_P (c)
9635       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9636     return ch;
9637
9638   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9639   charset = char_charset (c, charset_list, &code);
9640   if (code == CHARSET_INVALID_CODE (charset))
9641     error ("Can't encode by shift_jis encoding: %c", c);
9642   JIS_TO_SJIS (code);
9643
9644   return make_number (code);
9645 }
9646
9647 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9648        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9649 Return the corresponding character.  */)
9650   (Lisp_Object code)
9651 {
9652   Lisp_Object spec, attrs, val;
9653   struct charset *charset_roman, *charset_big5, *charset;
9654   EMACS_INT ch;
9655   int c;
9656
9657   CHECK_NATNUM (code);
9658   ch = XFASTINT (code);
9659   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9660   attrs = AREF (spec, 0);
9661
9662   if (ASCII_CHAR_P (ch)
9663       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9664     return code;
9665
9666   val = CODING_ATTR_CHARSET_LIST (attrs);
9667   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9668   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9669
9670   if (ch <= 0x7F)
9671     {
9672       c = ch;
9673       charset = charset_roman;
9674     }
9675   else
9676     {
9677       EMACS_INT b1 = ch >> 8;
9678       int b2 = ch & 0x7F;
9679       if (b1 < 0xA1 || b1 > 0xFE
9680           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9681         error ("Invalid code: %"pI"d", ch);
9682       c = ch;
9683       charset = charset_big5;
9684     }
9685   c = DECODE_CHAR (charset, c);
9686   if (c < 0)
9687     error ("Invalid code: %"pI"d", ch);
9688   return make_number (c);
9689 }
9690
9691 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9692        doc: /* Encode the Big5 character CH to BIG5 coding system.
9693 Return the corresponding character code in Big5.  */)
9694   (Lisp_Object ch)
9695 {
9696   Lisp_Object spec, attrs, charset_list;
9697   struct charset *charset;
9698   int c;
9699   unsigned code;
9700
9701   CHECK_CHARACTER (ch);
9702   c = XFASTINT (ch);
9703   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9704   attrs = AREF (spec, 0);
9705   if (ASCII_CHAR_P (c)
9706       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9707     return ch;
9708
9709   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9710   charset = char_charset (c, charset_list, &code);
9711   if (code == CHARSET_INVALID_CODE (charset))
9712     error ("Can't encode by Big5 encoding: %c", c);
9713
9714   return make_number (code);
9715 }
9716
9717 \f
9718 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9719        Sset_terminal_coding_system_internal, 1, 2, 0,
9720        doc: /* Internal use only.  */)
9721   (Lisp_Object coding_system, Lisp_Object terminal)
9722 {
9723   struct terminal *term = decode_live_terminal (terminal);
9724   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9725   CHECK_SYMBOL (coding_system);
9726   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9727   /* We had better not send unsafe characters to terminal.  */
9728   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9729   /* Character composition should be disabled.  */
9730   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9731   terminal_coding->src_multibyte = 1;
9732   terminal_coding->dst_multibyte = 0;
9733   tset_charset_list
9734     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9735             ? coding_charset_list (terminal_coding)
9736             : list1 (make_number (charset_ascii))));
9737   return Qnil;
9738 }
9739
9740 DEFUN ("set-safe-terminal-coding-system-internal",
9741        Fset_safe_terminal_coding_system_internal,
9742        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9743        doc: /* Internal use only.  */)
9744   (Lisp_Object coding_system)
9745 {
9746   CHECK_SYMBOL (coding_system);
9747   setup_coding_system (Fcheck_coding_system (coding_system),
9748                        &safe_terminal_coding);
9749   /* Character composition should be disabled.  */
9750   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9751   safe_terminal_coding.src_multibyte = 1;
9752   safe_terminal_coding.dst_multibyte = 0;
9753   return Qnil;
9754 }
9755
9756 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9757        Sterminal_coding_system, 0, 1, 0,
9758        doc: /* Return coding system specified for terminal output on the given terminal.
9759 TERMINAL may be a terminal object, a frame, or nil for the selected
9760 frame's terminal device.  */)
9761   (Lisp_Object terminal)
9762 {
9763   struct coding_system *terminal_coding
9764     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9765   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9766
9767   /* For backward compatibility, return nil if it is `undecided'.  */
9768   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9769 }
9770
9771 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9772        Sset_keyboard_coding_system_internal, 1, 2, 0,
9773        doc: /* Internal use only.  */)
9774   (Lisp_Object coding_system, Lisp_Object terminal)
9775 {
9776   struct terminal *t = decode_live_terminal (terminal);
9777   CHECK_SYMBOL (coding_system);
9778   if (NILP (coding_system))
9779     coding_system = Qno_conversion;
9780   else
9781     Fcheck_coding_system (coding_system);
9782   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9783   /* Character composition should be disabled.  */
9784   TERMINAL_KEYBOARD_CODING (t)->common_flags
9785     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9786   return Qnil;
9787 }
9788
9789 DEFUN ("keyboard-coding-system",
9790        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9791        doc: /* Return coding system specified for decoding keyboard input.  */)
9792   (Lisp_Object terminal)
9793 {
9794   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9795                          (decode_live_terminal (terminal))->id);
9796 }
9797
9798 \f
9799 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9800        Sfind_operation_coding_system,  1, MANY, 0,
9801        doc: /* Choose a coding system for an operation based on the target name.
9802 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9803 DECODING-SYSTEM is the coding system to use for decoding
9804 (in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9805 for encoding (in case OPERATION does encoding).
9806
9807 The first argument OPERATION specifies an I/O primitive:
9808   For file I/O, `insert-file-contents' or `write-region'.
9809   For process I/O, `call-process', `call-process-region', or `start-process'.
9810   For network I/O, `open-network-stream'.
9811
9812 The remaining arguments should be the same arguments that were passed
9813 to the primitive.  Depending on which primitive, one of those arguments
9814 is selected as the TARGET.  For example, if OPERATION does file I/O,
9815 whichever argument specifies the file name is TARGET.
9816
9817 TARGET has a meaning which depends on OPERATION:
9818   For file I/O, TARGET is a file name (except for the special case below).
9819   For process I/O, TARGET is a process name.
9820   For network I/O, TARGET is a service name or a port number.
9821
9822 This function looks up what is specified for TARGET in
9823 `file-coding-system-alist', `process-coding-system-alist',
9824 or `network-coding-system-alist' depending on OPERATION.
9825 They may specify a coding system, a cons of coding systems,
9826 or a function symbol to call.
9827 In the last case, we call the function with one argument,
9828 which is a list of all the arguments given to this function.
9829 If the function can't decide a coding system, it can return
9830 `undecided' so that the normal code-detection is performed.
9831
9832 If OPERATION is `insert-file-contents', the argument corresponding to
9833 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9834 file name to look up, and BUFFER is a buffer that contains the file's
9835 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9836 function to call for FILENAME, that function should examine the
9837 contents of BUFFER instead of reading the file.
9838
9839 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9840   (ptrdiff_t nargs, Lisp_Object *args)
9841 {
9842   Lisp_Object operation, target_idx, target, val;
9843   register Lisp_Object chain;
9844
9845   if (nargs < 2)
9846     error ("Too few arguments");
9847   operation = args[0];
9848   if (!SYMBOLP (operation)
9849       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9850     error ("Invalid first argument");
9851   if (nargs <= 1 + XFASTINT (target_idx))
9852     error ("Too few arguments for operation `%s'",
9853            SDATA (SYMBOL_NAME (operation)));
9854   target = args[XFASTINT (target_idx) + 1];
9855   if (!(STRINGP (target)
9856         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9857             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9858         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9859     error ("Invalid argument %"pI"d of operation `%s'",
9860            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9861   if (CONSP (target))
9862     target = XCAR (target);
9863
9864   chain = ((EQ (operation, Qinsert_file_contents)
9865             || EQ (operation, Qwrite_region))
9866            ? Vfile_coding_system_alist
9867            : (EQ (operation, Qopen_network_stream)
9868               ? Vnetwork_coding_system_alist
9869               : Vprocess_coding_system_alist));
9870   if (NILP (chain))
9871     return Qnil;
9872
9873   for (; CONSP (chain); chain = XCDR (chain))
9874     {
9875       Lisp_Object elt;
9876
9877       elt = XCAR (chain);
9878       if (CONSP (elt)
9879           && ((STRINGP (target)
9880                && STRINGP (XCAR (elt))
9881                && fast_string_match (XCAR (elt), target) >= 0)
9882               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9883         {
9884           val = XCDR (elt);
9885           /* Here, if VAL is both a valid coding system and a valid
9886              function symbol, we return VAL as a coding system.  */
9887           if (CONSP (val))
9888             return val;
9889           if (! SYMBOLP (val))
9890             return Qnil;
9891           if (! NILP (Fcoding_system_p (val)))
9892             return Fcons (val, val);
9893           if (! NILP (Ffboundp (val)))
9894             {
9895               /* We use call1 rather than safe_call1
9896                  so as to get bug reports about functions called here
9897                  which don't handle the current interface.  */
9898               val = call1 (val, Flist (nargs, args));
9899               if (CONSP (val))
9900                 return val;
9901               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9902                 return Fcons (val, val);
9903             }
9904           return Qnil;
9905         }
9906     }
9907   return Qnil;
9908 }
9909
9910 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9911        Sset_coding_system_priority, 0, MANY, 0,
9912        doc: /* Assign higher priority to the coding systems given as arguments.
9913 If multiple coding systems belong to the same category,
9914 all but the first one are ignored.
9915
9916 usage: (set-coding-system-priority &rest coding-systems)  */)
9917   (ptrdiff_t nargs, Lisp_Object *args)
9918 {
9919   ptrdiff_t i, j;
9920   bool changed[coding_category_max];
9921   enum coding_category priorities[coding_category_max];
9922
9923   memset (changed, 0, sizeof changed);
9924
9925   for (i = j = 0; i < nargs; i++)
9926     {
9927       enum coding_category category;
9928       Lisp_Object spec, attrs;
9929
9930       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9931       attrs = AREF (spec, 0);
9932       category = XINT (CODING_ATTR_CATEGORY (attrs));
9933       if (changed[category])
9934         /* Ignore this coding system because a coding system of the
9935            same category already had a higher priority.  */
9936         continue;
9937       changed[category] = 1;
9938       priorities[j++] = category;
9939       if (coding_categories[category].id >= 0
9940           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9941         setup_coding_system (args[i], &coding_categories[category]);
9942       Fset (AREF (Vcoding_category_table, category), args[i]);
9943     }
9944
9945   /* Now we have decided top J priorities.  Reflect the order of the
9946      original priorities to the remaining priorities.  */
9947
9948   for (i = j, j = 0; i < coding_category_max; i++, j++)
9949     {
9950       while (j < coding_category_max
9951              && changed[coding_priorities[j]])
9952         j++;
9953       if (j == coding_category_max)
9954         emacs_abort ();
9955       priorities[i] = coding_priorities[j];
9956     }
9957
9958   memcpy (coding_priorities, priorities, sizeof priorities);
9959
9960   /* Update `coding-category-list'.  */
9961   Vcoding_category_list = Qnil;
9962   for (i = coding_category_max; i-- > 0; )
9963     Vcoding_category_list
9964       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9965                Vcoding_category_list);
9966
9967   return Qnil;
9968 }
9969
9970 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9971        Scoding_system_priority_list, 0, 1, 0,
9972        doc: /* Return a list of coding systems ordered by their priorities.
9973 The list contains a subset of coding systems; i.e. coding systems
9974 assigned to each coding category (see `coding-category-list').
9975
9976 HIGHESTP non-nil means just return the highest priority one.  */)
9977   (Lisp_Object highestp)
9978 {
9979   int i;
9980   Lisp_Object val;
9981
9982   for (i = 0, val = Qnil; i < coding_category_max; i++)
9983     {
9984       enum coding_category category = coding_priorities[i];
9985       int id = coding_categories[category].id;
9986       Lisp_Object attrs;
9987
9988       if (id < 0)
9989         continue;
9990       attrs = CODING_ID_ATTRS (id);
9991       if (! NILP (highestp))
9992         return CODING_ATTR_BASE_NAME (attrs);
9993       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9994     }
9995   return Fnreverse (val);
9996 }
9997
9998 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9999
10000 static Lisp_Object
10001 make_subsidiaries (Lisp_Object base)
10002 {
10003   Lisp_Object subsidiaries;
10004   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10005   USE_SAFE_ALLOCA;
10006   char *buf = SAFE_ALLOCA (base_name_len + 6);
10007   int i;
10008
10009   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10010   subsidiaries = make_uninit_vector (3);
10011   for (i = 0; i < 3; i++)
10012     {
10013       strcpy (buf + base_name_len, suffixes[i]);
10014       ASET (subsidiaries, i, intern (buf));
10015     }
10016   SAFE_FREE ();
10017   return subsidiaries;
10018 }
10019
10020
10021 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10022        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10023        doc: /* For internal use only.
10024 usage: (define-coding-system-internal ...)  */)
10025   (ptrdiff_t nargs, Lisp_Object *args)
10026 {
10027   Lisp_Object name;
10028   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10029   Lisp_Object attrs;            /* Vector of attributes.  */
10030   Lisp_Object eol_type;
10031   Lisp_Object aliases;
10032   Lisp_Object coding_type, charset_list, safe_charsets;
10033   enum coding_category category;
10034   Lisp_Object tail, val;
10035   int max_charset_id = 0;
10036   int i;
10037
10038   if (nargs < coding_arg_max)
10039     goto short_args;
10040
10041   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10042
10043   name = args[coding_arg_name];
10044   CHECK_SYMBOL (name);
10045   ASET (attrs, coding_attr_base_name, name);
10046
10047   val = args[coding_arg_mnemonic];
10048   if (! STRINGP (val))
10049     CHECK_CHARACTER (val);
10050   ASET (attrs, coding_attr_mnemonic, val);
10051
10052   coding_type = args[coding_arg_coding_type];
10053   CHECK_SYMBOL (coding_type);
10054   ASET (attrs, coding_attr_type, coding_type);
10055
10056   charset_list = args[coding_arg_charset_list];
10057   if (SYMBOLP (charset_list))
10058     {
10059       if (EQ (charset_list, Qiso_2022))
10060         {
10061           if (! EQ (coding_type, Qiso_2022))
10062             error ("Invalid charset-list");
10063           charset_list = Viso_2022_charset_list;
10064         }
10065       else if (EQ (charset_list, Qemacs_mule))
10066         {
10067           if (! EQ (coding_type, Qemacs_mule))
10068             error ("Invalid charset-list");
10069           charset_list = Vemacs_mule_charset_list;
10070         }
10071       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10072         {
10073           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10074             error ("Invalid charset-list");
10075           if (max_charset_id < XFASTINT (XCAR (tail)))
10076             max_charset_id = XFASTINT (XCAR (tail));
10077         }
10078     }
10079   else
10080     {
10081       charset_list = Fcopy_sequence (charset_list);
10082       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10083         {
10084           struct charset *charset;
10085
10086           val = XCAR (tail);
10087           CHECK_CHARSET_GET_CHARSET (val, charset);
10088           if (EQ (coding_type, Qiso_2022)
10089               ? CHARSET_ISO_FINAL (charset) < 0
10090               : EQ (coding_type, Qemacs_mule)
10091               ? CHARSET_EMACS_MULE_ID (charset) < 0
10092               : 0)
10093             error ("Can't handle charset `%s'",
10094                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10095
10096           XSETCAR (tail, make_number (charset->id));
10097           if (max_charset_id < charset->id)
10098             max_charset_id = charset->id;
10099         }
10100     }
10101   ASET (attrs, coding_attr_charset_list, charset_list);
10102
10103   safe_charsets = make_uninit_string (max_charset_id + 1);
10104   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10105   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10106     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10107   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10108
10109   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10110
10111   val = args[coding_arg_decode_translation_table];
10112   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10113     CHECK_SYMBOL (val);
10114   ASET (attrs, coding_attr_decode_tbl, val);
10115
10116   val = args[coding_arg_encode_translation_table];
10117   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10118     CHECK_SYMBOL (val);
10119   ASET (attrs, coding_attr_encode_tbl, val);
10120
10121   val = args[coding_arg_post_read_conversion];
10122   CHECK_SYMBOL (val);
10123   ASET (attrs, coding_attr_post_read, val);
10124
10125   val = args[coding_arg_pre_write_conversion];
10126   CHECK_SYMBOL (val);
10127   ASET (attrs, coding_attr_pre_write, val);
10128
10129   val = args[coding_arg_default_char];
10130   if (NILP (val))
10131     ASET (attrs, coding_attr_default_char, make_number (' '));
10132   else
10133     {
10134       CHECK_CHARACTER (val);
10135       ASET (attrs, coding_attr_default_char, val);
10136     }
10137
10138   val = args[coding_arg_for_unibyte];
10139   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10140
10141   val = args[coding_arg_plist];
10142   CHECK_LIST (val);
10143   ASET (attrs, coding_attr_plist, val);
10144
10145   if (EQ (coding_type, Qcharset))
10146     {
10147       /* Generate a lisp vector of 256 elements.  Each element is nil,
10148          integer, or a list of charset IDs.
10149
10150          If Nth element is nil, the byte code N is invalid in this
10151          coding system.
10152
10153          If Nth element is a number NUM, N is the first byte of a
10154          charset whose ID is NUM.
10155
10156          If Nth element is a list of charset IDs, N is the first byte
10157          of one of them.  The list is sorted by dimensions of the
10158          charsets.  A charset of smaller dimension comes first. */
10159       val = Fmake_vector (make_number (256), Qnil);
10160
10161       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10162         {
10163           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10164           int dim = CHARSET_DIMENSION (charset);
10165           int idx = (dim - 1) * 4;
10166
10167           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10168             ASET (attrs, coding_attr_ascii_compat, Qt);
10169
10170           for (i = charset->code_space[idx];
10171                i <= charset->code_space[idx + 1]; i++)
10172             {
10173               Lisp_Object tmp, tmp2;
10174               int dim2;
10175
10176               tmp = AREF (val, i);
10177               if (NILP (tmp))
10178                 tmp = XCAR (tail);
10179               else if (NUMBERP (tmp))
10180                 {
10181                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10182                   if (dim < dim2)
10183                     tmp = list2 (XCAR (tail), tmp);
10184                   else
10185                     tmp = list2 (tmp, XCAR (tail));
10186                 }
10187               else
10188                 {
10189                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10190                     {
10191                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10192                       if (dim < dim2)
10193                         break;
10194                     }
10195                   if (NILP (tmp2))
10196                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10197                   else
10198                     {
10199                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10200                       XSETCAR (tmp2, XCAR (tail));
10201                     }
10202                 }
10203               ASET (val, i, tmp);
10204             }
10205         }
10206       ASET (attrs, coding_attr_charset_valids, val);
10207       category = coding_category_charset;
10208     }
10209   else if (EQ (coding_type, Qccl))
10210     {
10211       Lisp_Object valids;
10212
10213       if (nargs < coding_arg_ccl_max)
10214         goto short_args;
10215
10216       val = args[coding_arg_ccl_decoder];
10217       CHECK_CCL_PROGRAM (val);
10218       if (VECTORP (val))
10219         val = Fcopy_sequence (val);
10220       ASET (attrs, coding_attr_ccl_decoder, val);
10221
10222       val = args[coding_arg_ccl_encoder];
10223       CHECK_CCL_PROGRAM (val);
10224       if (VECTORP (val))
10225         val = Fcopy_sequence (val);
10226       ASET (attrs, coding_attr_ccl_encoder, val);
10227
10228       val = args[coding_arg_ccl_valids];
10229       valids = Fmake_string (make_number (256), make_number (0));
10230       for (tail = val; CONSP (tail); tail = XCDR (tail))
10231         {
10232           int from, to;
10233
10234           val = XCAR (tail);
10235           if (INTEGERP (val))
10236             {
10237               if (! (0 <= XINT (val) && XINT (val) <= 255))
10238                 args_out_of_range_3 (val, make_number (0), make_number (255));
10239               from = to = XINT (val);
10240             }
10241           else
10242             {
10243               CHECK_CONS (val);
10244               CHECK_NATNUM_CAR (val);
10245               CHECK_NUMBER_CDR (val);
10246               if (XINT (XCAR (val)) > 255)
10247                 args_out_of_range_3 (XCAR (val),
10248                                      make_number (0), make_number (255));
10249               from = XINT (XCAR (val));
10250               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10251                 args_out_of_range_3 (XCDR (val),
10252                                      XCAR (val), make_number (255));
10253               to = XINT (XCDR (val));
10254             }
10255           for (i = from; i <= to; i++)
10256             SSET (valids, i, 1);
10257         }
10258       ASET (attrs, coding_attr_ccl_valids, valids);
10259
10260       category = coding_category_ccl;
10261     }
10262   else if (EQ (coding_type, Qutf_16))
10263     {
10264       Lisp_Object bom, endian;
10265
10266       ASET (attrs, coding_attr_ascii_compat, Qnil);
10267
10268       if (nargs < coding_arg_utf16_max)
10269         goto short_args;
10270
10271       bom = args[coding_arg_utf16_bom];
10272       if (! NILP (bom) && ! EQ (bom, Qt))
10273         {
10274           CHECK_CONS (bom);
10275           val = XCAR (bom);
10276           CHECK_CODING_SYSTEM (val);
10277           val = XCDR (bom);
10278           CHECK_CODING_SYSTEM (val);
10279         }
10280       ASET (attrs, coding_attr_utf_bom, bom);
10281
10282       endian = args[coding_arg_utf16_endian];
10283       CHECK_SYMBOL (endian);
10284       if (NILP (endian))
10285         endian = Qbig;
10286       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10287         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10288       ASET (attrs, coding_attr_utf_16_endian, endian);
10289
10290       category = (CONSP (bom)
10291                   ? coding_category_utf_16_auto
10292                   : NILP (bom)
10293                   ? (EQ (endian, Qbig)
10294                      ? coding_category_utf_16_be_nosig
10295                      : coding_category_utf_16_le_nosig)
10296                   : (EQ (endian, Qbig)
10297                      ? coding_category_utf_16_be
10298                      : coding_category_utf_16_le));
10299     }
10300   else if (EQ (coding_type, Qiso_2022))
10301     {
10302       Lisp_Object initial, reg_usage, request, flags;
10303
10304       if (nargs < coding_arg_iso2022_max)
10305         goto short_args;
10306
10307       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10308       CHECK_VECTOR (initial);
10309       for (i = 0; i < 4; i++)
10310         {
10311           val = AREF (initial, i);
10312           if (! NILP (val))
10313             {
10314               struct charset *charset;
10315
10316               CHECK_CHARSET_GET_CHARSET (val, charset);
10317               ASET (initial, i, make_number (CHARSET_ID (charset)));
10318               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10319                 ASET (attrs, coding_attr_ascii_compat, Qt);
10320             }
10321           else
10322             ASET (initial, i, make_number (-1));
10323         }
10324
10325       reg_usage = args[coding_arg_iso2022_reg_usage];
10326       CHECK_CONS (reg_usage);
10327       CHECK_NUMBER_CAR (reg_usage);
10328       CHECK_NUMBER_CDR (reg_usage);
10329
10330       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10331       for (tail = request; CONSP (tail); tail = XCDR (tail))
10332         {
10333           int id;
10334           Lisp_Object tmp1;
10335
10336           val = XCAR (tail);
10337           CHECK_CONS (val);
10338           tmp1 = XCAR (val);
10339           CHECK_CHARSET_GET_ID (tmp1, id);
10340           CHECK_NATNUM_CDR (val);
10341           if (XINT (XCDR (val)) >= 4)
10342             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10343           XSETCAR (val, make_number (id));
10344         }
10345
10346       flags = args[coding_arg_iso2022_flags];
10347       CHECK_NATNUM (flags);
10348       i = XINT (flags) & INT_MAX;
10349       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10350         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10351       flags = make_number (i);
10352
10353       ASET (attrs, coding_attr_iso_initial, initial);
10354       ASET (attrs, coding_attr_iso_usage, reg_usage);
10355       ASET (attrs, coding_attr_iso_request, request);
10356       ASET (attrs, coding_attr_iso_flags, flags);
10357       setup_iso_safe_charsets (attrs);
10358
10359       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10360         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10361                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10362                     ? coding_category_iso_7_else
10363                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10364                     ? coding_category_iso_7
10365                     : coding_category_iso_7_tight);
10366       else
10367         {
10368           int id = XINT (AREF (initial, 1));
10369
10370           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10371                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10372                        || id < 0)
10373                       ? coding_category_iso_8_else
10374                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10375                       ? coding_category_iso_8_1
10376                       : coding_category_iso_8_2);
10377         }
10378       if (category != coding_category_iso_8_1
10379           && category != coding_category_iso_8_2)
10380         ASET (attrs, coding_attr_ascii_compat, Qnil);
10381     }
10382   else if (EQ (coding_type, Qemacs_mule))
10383     {
10384       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10385         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10386       ASET (attrs, coding_attr_ascii_compat, Qt);
10387       category = coding_category_emacs_mule;
10388     }
10389   else if (EQ (coding_type, Qshift_jis))
10390     {
10391
10392       struct charset *charset;
10393
10394       if (XINT (Flength (charset_list)) != 3
10395           && XINT (Flength (charset_list)) != 4)
10396         error ("There should be three or four charsets");
10397
10398       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10399       if (CHARSET_DIMENSION (charset) != 1)
10400         error ("Dimension of charset %s is not one",
10401                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10402       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10403         ASET (attrs, coding_attr_ascii_compat, Qt);
10404
10405       charset_list = XCDR (charset_list);
10406       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10407       if (CHARSET_DIMENSION (charset) != 1)
10408         error ("Dimension of charset %s is not one",
10409                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10410
10411       charset_list = XCDR (charset_list);
10412       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10413       if (CHARSET_DIMENSION (charset) != 2)
10414         error ("Dimension of charset %s is not two",
10415                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10416
10417       charset_list = XCDR (charset_list);
10418       if (! NILP (charset_list))
10419         {
10420           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10421           if (CHARSET_DIMENSION (charset) != 2)
10422             error ("Dimension of charset %s is not two",
10423                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10424         }
10425
10426       category = coding_category_sjis;
10427       Vsjis_coding_system = name;
10428     }
10429   else if (EQ (coding_type, Qbig5))
10430     {
10431       struct charset *charset;
10432
10433       if (XINT (Flength (charset_list)) != 2)
10434         error ("There should be just two charsets");
10435
10436       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10437       if (CHARSET_DIMENSION (charset) != 1)
10438         error ("Dimension of charset %s is not one",
10439                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10440       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10441         ASET (attrs, coding_attr_ascii_compat, Qt);
10442
10443       charset_list = XCDR (charset_list);
10444       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10445       if (CHARSET_DIMENSION (charset) != 2)
10446         error ("Dimension of charset %s is not two",
10447                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10448
10449       category = coding_category_big5;
10450       Vbig5_coding_system = name;
10451     }
10452   else if (EQ (coding_type, Qraw_text))
10453     {
10454       category = coding_category_raw_text;
10455       ASET (attrs, coding_attr_ascii_compat, Qt);
10456     }
10457   else if (EQ (coding_type, Qutf_8))
10458     {
10459       Lisp_Object bom;
10460
10461       if (nargs < coding_arg_utf8_max)
10462         goto short_args;
10463
10464       bom = args[coding_arg_utf8_bom];
10465       if (! NILP (bom) && ! EQ (bom, Qt))
10466         {
10467           CHECK_CONS (bom);
10468           val = XCAR (bom);
10469           CHECK_CODING_SYSTEM (val);
10470           val = XCDR (bom);
10471           CHECK_CODING_SYSTEM (val);
10472         }
10473       ASET (attrs, coding_attr_utf_bom, bom);
10474       if (NILP (bom))
10475         ASET (attrs, coding_attr_ascii_compat, Qt);
10476
10477       category = (CONSP (bom) ? coding_category_utf_8_auto
10478                   : NILP (bom) ? coding_category_utf_8_nosig
10479                   : coding_category_utf_8_sig);
10480     }
10481   else if (EQ (coding_type, Qundecided))
10482     {
10483       if (nargs < coding_arg_undecided_max)
10484         goto short_args;
10485       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10486             args[coding_arg_undecided_inhibit_null_byte_detection]);
10487       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10488             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10489       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10490             args[coding_arg_undecided_prefer_utf_8]);
10491       category = coding_category_undecided;
10492     }
10493   else
10494     error ("Invalid coding system type: %s",
10495            SDATA (SYMBOL_NAME (coding_type)));
10496
10497   ASET (attrs, coding_attr_category, make_number (category));
10498   ASET (attrs, coding_attr_plist,
10499         Fcons (QCcategory,
10500                Fcons (AREF (Vcoding_category_table, category),
10501                       CODING_ATTR_PLIST (attrs))));
10502   ASET (attrs, coding_attr_plist,
10503         Fcons (QCascii_compatible_p,
10504                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10505                       CODING_ATTR_PLIST (attrs))));
10506
10507   eol_type = args[coding_arg_eol_type];
10508   if (! NILP (eol_type)
10509       && ! EQ (eol_type, Qunix)
10510       && ! EQ (eol_type, Qdos)
10511       && ! EQ (eol_type, Qmac))
10512     error ("Invalid eol-type");
10513
10514   aliases = list1 (name);
10515
10516   if (NILP (eol_type))
10517     {
10518       eol_type = make_subsidiaries (name);
10519       for (i = 0; i < 3; i++)
10520         {
10521           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10522
10523           this_name = AREF (eol_type, i);
10524           this_aliases = list1 (this_name);
10525           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10526           this_spec = make_uninit_vector (3);
10527           ASET (this_spec, 0, attrs);
10528           ASET (this_spec, 1, this_aliases);
10529           ASET (this_spec, 2, this_eol_type);
10530           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10531           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10532           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10533           if (NILP (val))
10534             Vcoding_system_alist
10535               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10536                        Vcoding_system_alist);
10537         }
10538     }
10539
10540   spec_vec = make_uninit_vector (3);
10541   ASET (spec_vec, 0, attrs);
10542   ASET (spec_vec, 1, aliases);
10543   ASET (spec_vec, 2, eol_type);
10544
10545   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10546   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10547   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10548   if (NILP (val))
10549     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10550                                   Vcoding_system_alist);
10551
10552   {
10553     int id = coding_categories[category].id;
10554
10555     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10556       setup_coding_system (name, &coding_categories[category]);
10557   }
10558
10559   return Qnil;
10560
10561  short_args:
10562   return Fsignal (Qwrong_number_of_arguments,
10563                   Fcons (intern ("define-coding-system-internal"),
10564                          make_number (nargs)));
10565 }
10566
10567
10568 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10569        3, 3, 0,
10570        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10571   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10572 {
10573   Lisp_Object spec, attrs;
10574
10575   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10576   attrs = AREF (spec, 0);
10577   if (EQ (prop, QCmnemonic))
10578     {
10579       if (! STRINGP (val))
10580         CHECK_CHARACTER (val);
10581       ASET (attrs, coding_attr_mnemonic, val);
10582     }
10583   else if (EQ (prop, QCdefault_char))
10584     {
10585       if (NILP (val))
10586         val = make_number (' ');
10587       else
10588         CHECK_CHARACTER (val);
10589       ASET (attrs, coding_attr_default_char, val);
10590     }
10591   else if (EQ (prop, QCdecode_translation_table))
10592     {
10593       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10594         CHECK_SYMBOL (val);
10595       ASET (attrs, coding_attr_decode_tbl, val);
10596     }
10597   else if (EQ (prop, QCencode_translation_table))
10598     {
10599       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10600         CHECK_SYMBOL (val);
10601       ASET (attrs, coding_attr_encode_tbl, val);
10602     }
10603   else if (EQ (prop, QCpost_read_conversion))
10604     {
10605       CHECK_SYMBOL (val);
10606       ASET (attrs, coding_attr_post_read, val);
10607     }
10608   else if (EQ (prop, QCpre_write_conversion))
10609     {
10610       CHECK_SYMBOL (val);
10611       ASET (attrs, coding_attr_pre_write, val);
10612     }
10613   else if (EQ (prop, QCascii_compatible_p))
10614     {
10615       ASET (attrs, coding_attr_ascii_compat, val);
10616     }
10617
10618   ASET (attrs, coding_attr_plist,
10619         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10620   return val;
10621 }
10622
10623
10624 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10625        Sdefine_coding_system_alias, 2, 2, 0,
10626        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10627   (Lisp_Object alias, Lisp_Object coding_system)
10628 {
10629   Lisp_Object spec, aliases, eol_type, val;
10630
10631   CHECK_SYMBOL (alias);
10632   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10633   aliases = AREF (spec, 1);
10634   /* ALIASES should be a list of length more than zero, and the first
10635      element is a base coding system.  Append ALIAS at the tail of the
10636      list.  */
10637   while (!NILP (XCDR (aliases)))
10638     aliases = XCDR (aliases);
10639   XSETCDR (aliases, list1 (alias));
10640
10641   eol_type = AREF (spec, 2);
10642   if (VECTORP (eol_type))
10643     {
10644       Lisp_Object subsidiaries;
10645       int i;
10646
10647       subsidiaries = make_subsidiaries (alias);
10648       for (i = 0; i < 3; i++)
10649         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10650                                      AREF (eol_type, i));
10651     }
10652
10653   Fputhash (alias, spec, Vcoding_system_hash_table);
10654   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10655   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10656   if (NILP (val))
10657     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10658                                   Vcoding_system_alist);
10659
10660   return Qnil;
10661 }
10662
10663 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10664        1, 1, 0,
10665        doc: /* Return the base of CODING-SYSTEM.
10666 Any alias or subsidiary coding system is not a base coding system.  */)
10667   (Lisp_Object coding_system)
10668 {
10669   Lisp_Object spec, attrs;
10670
10671   if (NILP (coding_system))
10672     return (Qno_conversion);
10673   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10674   attrs = AREF (spec, 0);
10675   return CODING_ATTR_BASE_NAME (attrs);
10676 }
10677
10678 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10679        1, 1, 0,
10680        doc: /* Return the property list of CODING-SYSTEM.  */)
10681   (Lisp_Object coding_system)
10682 {
10683   Lisp_Object spec, attrs;
10684
10685   if (NILP (coding_system))
10686     coding_system = Qno_conversion;
10687   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10688   attrs = AREF (spec, 0);
10689   return CODING_ATTR_PLIST (attrs);
10690 }
10691
10692
10693 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10694        1, 1, 0,
10695        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10696   (Lisp_Object coding_system)
10697 {
10698   Lisp_Object spec;
10699
10700   if (NILP (coding_system))
10701     coding_system = Qno_conversion;
10702   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10703   return AREF (spec, 1);
10704 }
10705
10706 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10707        Scoding_system_eol_type, 1, 1, 0,
10708        doc: /* Return eol-type of CODING-SYSTEM.
10709 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10710
10711 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10712 and CR respectively.
10713
10714 A vector value indicates that a format of end-of-line should be
10715 detected automatically.  Nth element of the vector is the subsidiary
10716 coding system whose eol-type is N.  */)
10717   (Lisp_Object coding_system)
10718 {
10719   Lisp_Object spec, eol_type;
10720   int n;
10721
10722   if (NILP (coding_system))
10723     coding_system = Qno_conversion;
10724   if (! CODING_SYSTEM_P (coding_system))
10725     return Qnil;
10726   spec = CODING_SYSTEM_SPEC (coding_system);
10727   eol_type = AREF (spec, 2);
10728   if (VECTORP (eol_type))
10729     return Fcopy_sequence (eol_type);
10730   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10731   return make_number (n);
10732 }
10733
10734 #endif /* emacs */
10735
10736 \f
10737 /*** 12. Postamble ***/
10738
10739 void
10740 init_coding_once (void)
10741 {
10742   int i;
10743
10744   for (i = 0; i < coding_category_max; i++)
10745     {
10746       coding_categories[i].id = -1;
10747       coding_priorities[i] = i;
10748     }
10749
10750   /* ISO2022 specific initialize routine.  */
10751   for (i = 0; i < 0x20; i++)
10752     iso_code_class[i] = ISO_control_0;
10753   for (i = 0x21; i < 0x7F; i++)
10754     iso_code_class[i] = ISO_graphic_plane_0;
10755   for (i = 0x80; i < 0xA0; i++)
10756     iso_code_class[i] = ISO_control_1;
10757   for (i = 0xA1; i < 0xFF; i++)
10758     iso_code_class[i] = ISO_graphic_plane_1;
10759   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10760   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10761   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10762   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10763   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10764   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10765   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10766   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10767   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10768
10769   for (i = 0; i < 256; i++)
10770     {
10771       emacs_mule_bytes[i] = 1;
10772     }
10773   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10774   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10775   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10776   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10777 }
10778
10779 #ifdef emacs
10780
10781 void
10782 syms_of_coding (void)
10783 {
10784   staticpro (&Vcoding_system_hash_table);
10785   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10786
10787   staticpro (&Vsjis_coding_system);
10788   Vsjis_coding_system = Qnil;
10789
10790   staticpro (&Vbig5_coding_system);
10791   Vbig5_coding_system = Qnil;
10792
10793   staticpro (&Vcode_conversion_reused_workbuf);
10794   Vcode_conversion_reused_workbuf = Qnil;
10795
10796   staticpro (&Vcode_conversion_workbuf_name);
10797   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10798
10799   reused_workbuf_in_use = 0;
10800
10801   DEFSYM (Qcharset, "charset");
10802   DEFSYM (Qtarget_idx, "target-idx");
10803   DEFSYM (Qcoding_system_history, "coding-system-history");
10804   Fset (Qcoding_system_history, Qnil);
10805
10806   /* Target FILENAME is the first argument.  */
10807   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10808   /* Target FILENAME is the third argument.  */
10809   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10810
10811   DEFSYM (Qcall_process, "call-process");
10812   /* Target PROGRAM is the first argument.  */
10813   Fput (Qcall_process, Qtarget_idx, make_number (0));
10814
10815   DEFSYM (Qcall_process_region, "call-process-region");
10816   /* Target PROGRAM is the third argument.  */
10817   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10818
10819   DEFSYM (Qstart_process, "start-process");
10820   /* Target PROGRAM is the third argument.  */
10821   Fput (Qstart_process, Qtarget_idx, make_number (2));
10822
10823   DEFSYM (Qopen_network_stream, "open-network-stream");
10824   /* Target SERVICE is the fourth argument.  */
10825   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10826
10827   DEFSYM (Qunix, "unix");
10828   DEFSYM (Qdos, "dos");
10829   DEFSYM (Qmac, "mac");
10830
10831   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10832   DEFSYM (Qundecided, "undecided");
10833   DEFSYM (Qno_conversion, "no-conversion");
10834   DEFSYM (Qraw_text, "raw-text");
10835
10836   DEFSYM (Qiso_2022, "iso-2022");
10837
10838   DEFSYM (Qutf_8, "utf-8");
10839   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10840
10841 #if defined (WINDOWSNT) || defined (CYGWIN)
10842   /* No, not utf-16-le: that one has a BOM.  */
10843   DEFSYM (Qutf_16le, "utf-16le");
10844 #endif
10845
10846   DEFSYM (Qutf_16, "utf-16");
10847   DEFSYM (Qbig, "big");
10848   DEFSYM (Qlittle, "little");
10849
10850   DEFSYM (Qshift_jis, "shift-jis");
10851   DEFSYM (Qbig5, "big5");
10852
10853   DEFSYM (Qcoding_system_p, "coding-system-p");
10854
10855   /* Error signaled when there's a problem with detecting a coding system.  */
10856   DEFSYM (Qcoding_system_error, "coding-system-error");
10857   Fput (Qcoding_system_error, Qerror_conditions,
10858         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10859   Fput (Qcoding_system_error, Qerror_message,
10860         build_pure_c_string ("Invalid coding system"));
10861
10862   DEFSYM (Qtranslation_table, "translation-table");
10863   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10864   DEFSYM (Qtranslation_table_id, "translation-table-id");
10865
10866   /* Coding system emacs-mule and raw-text are for converting only
10867      end-of-line format.  */
10868   DEFSYM (Qemacs_mule, "emacs-mule");
10869
10870   DEFSYM (QCcategory, ":category");
10871   DEFSYM (QCmnemonic, ":mnemonic");
10872   DEFSYM (QCdefault_char, ":default-char");
10873   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10874   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10875   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10876   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10877   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10878
10879   Vcoding_category_table
10880     = Fmake_vector (make_number (coding_category_max), Qnil);
10881   staticpro (&Vcoding_category_table);
10882   /* Followings are target of code detection.  */
10883   ASET (Vcoding_category_table, coding_category_iso_7,
10884         intern_c_string ("coding-category-iso-7"));
10885   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10886         intern_c_string ("coding-category-iso-7-tight"));
10887   ASET (Vcoding_category_table, coding_category_iso_8_1,
10888         intern_c_string ("coding-category-iso-8-1"));
10889   ASET (Vcoding_category_table, coding_category_iso_8_2,
10890         intern_c_string ("coding-category-iso-8-2"));
10891   ASET (Vcoding_category_table, coding_category_iso_7_else,
10892         intern_c_string ("coding-category-iso-7-else"));
10893   ASET (Vcoding_category_table, coding_category_iso_8_else,
10894         intern_c_string ("coding-category-iso-8-else"));
10895   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10896         intern_c_string ("coding-category-utf-8-auto"));
10897   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10898         intern_c_string ("coding-category-utf-8"));
10899   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10900         intern_c_string ("coding-category-utf-8-sig"));
10901   ASET (Vcoding_category_table, coding_category_utf_16_be,
10902         intern_c_string ("coding-category-utf-16-be"));
10903   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10904         intern_c_string ("coding-category-utf-16-auto"));
10905   ASET (Vcoding_category_table, coding_category_utf_16_le,
10906         intern_c_string ("coding-category-utf-16-le"));
10907   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10908         intern_c_string ("coding-category-utf-16-be-nosig"));
10909   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10910         intern_c_string ("coding-category-utf-16-le-nosig"));
10911   ASET (Vcoding_category_table, coding_category_charset,
10912         intern_c_string ("coding-category-charset"));
10913   ASET (Vcoding_category_table, coding_category_sjis,
10914         intern_c_string ("coding-category-sjis"));
10915   ASET (Vcoding_category_table, coding_category_big5,
10916         intern_c_string ("coding-category-big5"));
10917   ASET (Vcoding_category_table, coding_category_ccl,
10918         intern_c_string ("coding-category-ccl"));
10919   ASET (Vcoding_category_table, coding_category_emacs_mule,
10920         intern_c_string ("coding-category-emacs-mule"));
10921   /* Followings are NOT target of code detection.  */
10922   ASET (Vcoding_category_table, coding_category_raw_text,
10923         intern_c_string ("coding-category-raw-text"));
10924   ASET (Vcoding_category_table, coding_category_undecided,
10925         intern_c_string ("coding-category-undecided"));
10926
10927   DEFSYM (Qinsufficient_source, "insufficient-source");
10928   DEFSYM (Qinvalid_source, "invalid-source");
10929   DEFSYM (Qinterrupted, "interrupted");
10930
10931   /* If a symbol has this property, evaluate the value to define the
10932      symbol as a coding system.  */
10933   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10934
10935   defsubr (&Scoding_system_p);
10936   defsubr (&Sread_coding_system);
10937   defsubr (&Sread_non_nil_coding_system);
10938   defsubr (&Scheck_coding_system);
10939   defsubr (&Sdetect_coding_region);
10940   defsubr (&Sdetect_coding_string);
10941   defsubr (&Sfind_coding_systems_region_internal);
10942   defsubr (&Sunencodable_char_position);
10943   defsubr (&Scheck_coding_systems_region);
10944   defsubr (&Sdecode_coding_region);
10945   defsubr (&Sencode_coding_region);
10946   defsubr (&Sdecode_coding_string);
10947   defsubr (&Sencode_coding_string);
10948   defsubr (&Sdecode_sjis_char);
10949   defsubr (&Sencode_sjis_char);
10950   defsubr (&Sdecode_big5_char);
10951   defsubr (&Sencode_big5_char);
10952   defsubr (&Sset_terminal_coding_system_internal);
10953   defsubr (&Sset_safe_terminal_coding_system_internal);
10954   defsubr (&Sterminal_coding_system);
10955   defsubr (&Sset_keyboard_coding_system_internal);
10956   defsubr (&Skeyboard_coding_system);
10957   defsubr (&Sfind_operation_coding_system);
10958   defsubr (&Sset_coding_system_priority);
10959   defsubr (&Sdefine_coding_system_internal);
10960   defsubr (&Sdefine_coding_system_alias);
10961   defsubr (&Scoding_system_put);
10962   defsubr (&Scoding_system_base);
10963   defsubr (&Scoding_system_plist);
10964   defsubr (&Scoding_system_aliases);
10965   defsubr (&Scoding_system_eol_type);
10966   defsubr (&Scoding_system_priority_list);
10967
10968   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10969                doc: /* List of coding systems.
10970
10971 Do not alter the value of this variable manually.  This variable should be
10972 updated by the functions `define-coding-system' and
10973 `define-coding-system-alias'.  */);
10974   Vcoding_system_list = Qnil;
10975
10976   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10977                doc: /* Alist of coding system names.
10978 Each element is one element list of coding system name.
10979 This variable is given to `completing-read' as COLLECTION argument.
10980
10981 Do not alter the value of this variable manually.  This variable should be
10982 updated by the functions `make-coding-system' and
10983 `define-coding-system-alias'.  */);
10984   Vcoding_system_alist = Qnil;
10985
10986   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10987                doc: /* List of coding-categories (symbols) ordered by priority.
10988
10989 On detecting a coding system, Emacs tries code detection algorithms
10990 associated with each coding-category one by one in this order.  When
10991 one algorithm agrees with a byte sequence of source text, the coding
10992 system bound to the corresponding coding-category is selected.
10993
10994 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10995   {
10996     int i;
10997
10998     Vcoding_category_list = Qnil;
10999     for (i = coding_category_max - 1; i >= 0; i--)
11000       Vcoding_category_list
11001         = Fcons (AREF (Vcoding_category_table, i),
11002                  Vcoding_category_list);
11003   }
11004
11005   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11006                doc: /* Specify the coding system for read operations.
11007 It is useful to bind this variable with `let', but do not set it globally.
11008 If the value is a coding system, it is used for decoding on read operation.
11009 If not, an appropriate element is used from one of the coding system alists.
11010 There are three such tables: `file-coding-system-alist',
11011 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11012   Vcoding_system_for_read = Qnil;
11013
11014   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11015                doc: /* Specify the coding system for write operations.
11016 Programs bind this variable with `let', but you should not set it globally.
11017 If the value is a coding system, it is used for encoding of output,
11018 when writing it to a file and when sending it to a file or subprocess.
11019
11020 If this does not specify a coding system, an appropriate element
11021 is used from one of the coding system alists.
11022 There are three such tables: `file-coding-system-alist',
11023 `process-coding-system-alist', and `network-coding-system-alist'.
11024 For output to files, if the above procedure does not specify a coding system,
11025 the value of `buffer-file-coding-system' is used.  */);
11026   Vcoding_system_for_write = Qnil;
11027
11028   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11029                doc: /*
11030 Coding system used in the latest file or process I/O.  */);
11031   Vlast_coding_system_used = Qnil;
11032
11033   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11034                doc: /*
11035 Error status of the last code conversion.
11036
11037 When an error was detected in the last code conversion, this variable
11038 is set to one of the following symbols.
11039   `insufficient-source'
11040   `inconsistent-eol'
11041   `invalid-source'
11042   `interrupted'
11043   `insufficient-memory'
11044 When no error was detected, the value doesn't change.  So, to check
11045 the error status of a code conversion by this variable, you must
11046 explicitly set this variable to nil before performing code
11047 conversion.  */);
11048   Vlast_code_conversion_error = Qnil;
11049
11050   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11051                doc: /*
11052 Non-nil means always inhibit code conversion of end-of-line format.
11053 See info node `Coding Systems' and info node `Text and Binary' concerning
11054 such conversion.  */);
11055   inhibit_eol_conversion = 0;
11056
11057   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11058                doc: /*
11059 Non-nil means process buffer inherits coding system of process output.
11060 Bind it to t if the process output is to be treated as if it were a file
11061 read from some filesystem.  */);
11062   inherit_process_coding_system = 0;
11063
11064   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11065                doc: /*
11066 Alist to decide a coding system to use for a file I/O operation.
11067 The format is ((PATTERN . VAL) ...),
11068 where PATTERN is a regular expression matching a file name,
11069 VAL is a coding system, a cons of coding systems, or a function symbol.
11070 If VAL is a coding system, it is used for both decoding and encoding
11071 the file contents.
11072 If VAL is a cons of coding systems, the car part is used for decoding,
11073 and the cdr part is used for encoding.
11074 If VAL is a function symbol, the function must return a coding system
11075 or a cons of coding systems which are used as above.  The function is
11076 called with an argument that is a list of the arguments with which
11077 `find-operation-coding-system' was called.  If the function can't decide
11078 a coding system, it can return `undecided' so that the normal
11079 code-detection is performed.
11080
11081 See also the function `find-operation-coding-system'
11082 and the variable `auto-coding-alist'.  */);
11083   Vfile_coding_system_alist = Qnil;
11084
11085   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11086                doc: /*
11087 Alist to decide a coding system to use for a process I/O operation.
11088 The format is ((PATTERN . VAL) ...),
11089 where PATTERN is a regular expression matching a program name,
11090 VAL is a coding system, a cons of coding systems, or a function symbol.
11091 If VAL is a coding system, it is used for both decoding what received
11092 from the program and encoding what sent to the program.
11093 If VAL is a cons of coding systems, the car part is used for decoding,
11094 and the cdr part is used for encoding.
11095 If VAL is a function symbol, the function must return a coding system
11096 or a cons of coding systems which are used as above.
11097
11098 See also the function `find-operation-coding-system'.  */);
11099   Vprocess_coding_system_alist = Qnil;
11100
11101   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11102                doc: /*
11103 Alist to decide a coding system to use for a network I/O operation.
11104 The format is ((PATTERN . VAL) ...),
11105 where PATTERN is a regular expression matching a network service name
11106 or is a port number to connect to,
11107 VAL is a coding system, a cons of coding systems, or a function symbol.
11108 If VAL is a coding system, it is used for both decoding what received
11109 from the network stream and encoding what sent to the network stream.
11110 If VAL is a cons of coding systems, the car part is used for decoding,
11111 and the cdr part is used for encoding.
11112 If VAL is a function symbol, the function must return a coding system
11113 or a cons of coding systems which are used as above.
11114
11115 See also the function `find-operation-coding-system'.  */);
11116   Vnetwork_coding_system_alist = Qnil;
11117
11118   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11119                doc: /* Coding system to use with system messages.
11120 Also used for decoding keyboard input on X Window system, and for
11121 encoding standard output and error streams.  */);
11122   Vlocale_coding_system = Qnil;
11123
11124   /* The eol mnemonics are reset in startup.el system-dependently.  */
11125   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11126                doc: /*
11127 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11128   eol_mnemonic_unix = build_pure_c_string (":");
11129
11130   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11131                doc: /*
11132 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11133   eol_mnemonic_dos = build_pure_c_string ("\\");
11134
11135   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11136                doc: /*
11137 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11138   eol_mnemonic_mac = build_pure_c_string ("/");
11139
11140   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11141                doc: /*
11142 String displayed in mode line when end-of-line format is not yet determined.  */);
11143   eol_mnemonic_undecided = build_pure_c_string (":");
11144
11145   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11146                doc: /*
11147 Non-nil enables character translation while encoding and decoding.  */);
11148   Venable_character_translation = Qt;
11149
11150   DEFVAR_LISP ("standard-translation-table-for-decode",
11151                Vstandard_translation_table_for_decode,
11152                doc: /* Table for translating characters while decoding.  */);
11153   Vstandard_translation_table_for_decode = Qnil;
11154
11155   DEFVAR_LISP ("standard-translation-table-for-encode",
11156                Vstandard_translation_table_for_encode,
11157                doc: /* Table for translating characters while encoding.  */);
11158   Vstandard_translation_table_for_encode = Qnil;
11159
11160   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11161                doc: /* Alist of charsets vs revision numbers.
11162 While encoding, if a charset (car part of an element) is found,
11163 designate it with the escape sequence identifying revision (cdr part
11164 of the element).  */);
11165   Vcharset_revision_table = Qnil;
11166
11167   DEFVAR_LISP ("default-process-coding-system",
11168                Vdefault_process_coding_system,
11169                doc: /* Cons of coding systems used for process I/O by default.
11170 The car part is used for decoding a process output,
11171 the cdr part is used for encoding a text to be sent to a process.  */);
11172   Vdefault_process_coding_system = Qnil;
11173
11174   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11175                doc: /*
11176 Table of extra Latin codes in the range 128..159 (inclusive).
11177 This is a vector of length 256.
11178 If Nth element is non-nil, the existence of code N in a file
11179 (or output of subprocess) doesn't prevent it to be detected as
11180 a coding system of ISO 2022 variant which has a flag
11181 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11182 or reading output of a subprocess.
11183 Only 128th through 159th elements have a meaning.  */);
11184   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11185
11186   DEFVAR_LISP ("select-safe-coding-system-function",
11187                Vselect_safe_coding_system_function,
11188                doc: /*
11189 Function to call to select safe coding system for encoding a text.
11190
11191 If set, this function is called to force a user to select a proper
11192 coding system which can encode the text in the case that a default
11193 coding system used in each operation can't encode the text.  The
11194 function should take care that the buffer is not modified while
11195 the coding system is being selected.
11196
11197 The default value is `select-safe-coding-system' (which see).  */);
11198   Vselect_safe_coding_system_function = Qnil;
11199
11200   DEFVAR_BOOL ("coding-system-require-warning",
11201                coding_system_require_warning,
11202                doc: /* Internal use only.
11203 If non-nil, on writing a file, `select-safe-coding-system-function' is
11204 called even if `coding-system-for-write' is non-nil.  The command
11205 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11206   coding_system_require_warning = 0;
11207
11208
11209   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11210                inhibit_iso_escape_detection,
11211                doc: /*
11212 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11213
11214 When Emacs reads text, it tries to detect how the text is encoded.
11215 This code detection is sensitive to escape sequences.  If Emacs sees
11216 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11217 of the ISO2022 encodings, and decodes text by the corresponding coding
11218 system (e.g. `iso-2022-7bit').
11219
11220 However, there may be a case that you want to read escape sequences in
11221 a file as is.  In such a case, you can set this variable to non-nil.
11222 Then the code detection will ignore any escape sequences, and no text is
11223 detected as encoded in some ISO-2022 encoding.  The result is that all
11224 escape sequences become visible in a buffer.
11225
11226 The default value is nil, and it is strongly recommended not to change
11227 it.  That is because many Emacs Lisp source files that contain
11228 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11229 in Emacs's distribution, and they won't be decoded correctly on
11230 reading if you suppress escape sequence detection.
11231
11232 The other way to read escape sequences in a file without decoding is
11233 to explicitly specify some coding system that doesn't use ISO-2022
11234 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11235   inhibit_iso_escape_detection = 0;
11236
11237   DEFVAR_BOOL ("inhibit-null-byte-detection",
11238                inhibit_null_byte_detection,
11239                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11240 By default, Emacs treats it as binary data, and does not attempt to
11241 decode it.  The effect is as if you specified `no-conversion' for
11242 reading that text.
11243
11244 Set this to non-nil when a regular text happens to include null bytes.
11245 Examples are Index nodes of Info files and null-byte delimited output
11246 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11247 decode text as usual.  */);
11248   inhibit_null_byte_detection = 0;
11249
11250   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11251                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11252 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11253   disable_ascii_optimization = 0;
11254
11255   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11256                doc: /* Char table for translating self-inserting characters.
11257 This is applied to the result of input methods, not their input.
11258 See also `keyboard-translate-table'.
11259
11260 Use of this variable for character code unification was rendered
11261 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11262 internal character representation.  */);
11263   Vtranslation_table_for_input = Qnil;
11264
11265   Lisp_Object args[coding_arg_undecided_max];
11266   memclear (args, sizeof args);
11267
11268   Lisp_Object plist[] =
11269     {
11270       QCname,
11271       args[coding_arg_name] = Qno_conversion,
11272       QCmnemonic,
11273       args[coding_arg_mnemonic] = make_number ('='),
11274       intern_c_string (":coding-type"),
11275       args[coding_arg_coding_type] = Qraw_text,
11276       QCascii_compatible_p,
11277       args[coding_arg_ascii_compatible_p] = Qt,
11278       QCdefault_char,
11279       args[coding_arg_default_char] = make_number (0),
11280       intern_c_string (":for-unibyte"),
11281       args[coding_arg_for_unibyte] = Qt,
11282       intern_c_string (":docstring"),
11283       (build_pure_c_string
11284        ("Do no conversion.\n"
11285         "\n"
11286         "When you visit a file with this coding, the file is read into a\n"
11287         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11288         "character.")),
11289       intern_c_string (":eol-type"),
11290       args[coding_arg_eol_type] = Qunix,
11291     };
11292   args[coding_arg_plist] = CALLMANY (Flist, plist);
11293   Fdefine_coding_system_internal (coding_arg_max, args);
11294
11295   plist[1] = args[coding_arg_name] = Qundecided;
11296   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11297   plist[5] = args[coding_arg_coding_type] = Qundecided;
11298   /* This is already set.
11299      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11300   plist[8] = intern_c_string (":charset-list");
11301   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11302   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11303   plist[13] = build_pure_c_string ("No conversion on encoding, "
11304                                    "automatic conversion on decoding.");
11305   plist[15] = args[coding_arg_eol_type] = Qnil;
11306   args[coding_arg_plist] = CALLMANY (Flist, plist);
11307   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11308   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11309   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11310
11311   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11312
11313   for (int i = 0; i < coding_category_max; i++)
11314     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11315
11316 #if defined (DOS_NT)
11317   system_eol_type = Qdos;
11318 #else
11319   system_eol_type = Qunix;
11320 #endif
11321   staticpro (&system_eol_type);
11322 }
11323
11324 char *
11325 emacs_strerror (int error_number)
11326 {
11327   char *str;
11328
11329   synchronize_system_messages_locale ();
11330   str = strerror (error_number);
11331
11332   if (! NILP (Vlocale_coding_system))
11333     {
11334       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11335                                                       Vlocale_coding_system,
11336                                                       0);
11337       str = SSDATA (dec);
11338     }
11339
11340   return str;
11341 }
11342
11343 #endif /* emacs */