code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2016 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or (at
  16 your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on the operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is a two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "termhooks.h"
 301
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 /* Format of end-of-line decided by system.  This is Qunix on
 305    Unix and Mac, Qdos on DOS/Windows.
 306    This has an effect only for external encoding (i.e. for output to
 307    file and process), not for in-buffer or Lisp string encoding.  */
 308 static Lisp_Object system_eol_type;
 309
 310 #ifdef emacs
 311
 312 /* Coding-systems are handed between Emacs Lisp programs and C internal
 313    routines by the following three variables.  */
 314 /* Coding system to be used to encode text for terminal display when
 315    terminal coding system is nil.  */
 316 struct coding_system safe_terminal_coding;
 317
 318 #endif /* emacs */
 319
 320 /* Two special coding systems.  */
 321 static Lisp_Object Vsjis_coding_system;
 322 static Lisp_Object Vbig5_coding_system;
 323
 324 /* ISO2022 section */
 325
 326 #define CODING_ISO_INITIAL(coding, reg)                 \
 327   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 328                      coding_attr_iso_initial),          \
 329                reg)))
 330
 331
 332 #define CODING_ISO_REQUEST(coding, charset_id)          \
 333   (((charset_id) <= (coding)->max_charset_id            \
 334     ? ((coding)->safe_charsets[charset_id] != 255       \
 335        ? (coding)->safe_charsets[charset_id]            \
 336        : -1)                                            \
 337     : -1))
 338
 339
 340 #define CODING_ISO_FLAGS(coding)        \
 341   ((coding)->spec.iso_2022.flags)
 342 #define CODING_ISO_DESIGNATION(coding, reg)     \
 343   ((coding)->spec.iso_2022.current_designation[reg])
 344 #define CODING_ISO_INVOCATION(coding, plane)    \
 345   ((coding)->spec.iso_2022.current_invocation[plane])
 346 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 347   ((coding)->spec.iso_2022.single_shifting)
 348 #define CODING_ISO_BOL(coding)  \
 349   ((coding)->spec.iso_2022.bol)
 350 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 351   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 352    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 353 #define CODING_ISO_CMP_STATUS(coding)   \
 354   (&(coding)->spec.iso_2022.cmp_status)
 355 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 356   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 357 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 358   ((coding)->spec.iso_2022.embedded_utf_8)
 359
 360 /* Control characters of ISO2022.  */
 361                         /* code */      /* function */
 362 #define ISO_CODE_SO     0x0E            /* shift-out */
 363 #define ISO_CODE_SI     0x0F            /* shift-in */
 364 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 365 #define ISO_CODE_ESC    0x1B            /* escape */
 366 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 367 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 368 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 369
 370 /* All code (1-byte) of ISO2022 is classified into one of the
 371    followings.  */
 372 enum iso_code_class_type
 373   {
 374     ISO_control_0,              /* Control codes in the range
 375                                    0x00..0x1F and 0x7F, except for the
 376                                    following 5 codes.  */
 377     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 378     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 379     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 380     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 381     ISO_control_1,              /* Control codes in the range
 382                                    0x80..0x9F, except for the
 383                                    following 3 codes.  */
 384     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 385     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 386     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 387     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 388     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 389     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 390     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 391   };
 392
 393 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 394     `iso-flags' attribute of an iso2022 coding system.  */
 395
 396 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 397    instead of the correct short-form sequence (e.g. ESC $ A).  */
 398 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 399
 400 /* If set, reset graphic planes and registers at end-of-line to the
 401    initial state.  */
 402 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 403
 404 /* If set, reset graphic planes and registers before any control
 405    characters to the initial state.  */
 406 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 407
 408 /* If set, encode by 7-bit environment.  */
 409 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 410
 411 /* If set, use locking-shift function.  */
 412 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 413
 414 /* If set, use single-shift function.  Overwrite
 415    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 416 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 417
 418 /* If set, use designation escape sequence.  */
 419 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 420
 421 /* If set, produce revision number sequence.  */
 422 #define CODING_ISO_FLAG_REVISION        0x0080
 423
 424 /* If set, produce ISO6429's direction specifying sequence.  */
 425 #define CODING_ISO_FLAG_DIRECTION       0x0100
 426
 427 /* If set, assume designation states are reset at beginning of line on
 428    output.  */
 429 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 430
 431 /* If set, designation sequence should be placed at beginning of line
 432    on output.  */
 433 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 434
 435 /* If set, do not encode unsafe characters on output.  */
 436 #define CODING_ISO_FLAG_SAFE            0x0800
 437
 438 /* If set, extra latin codes (128..159) are accepted as a valid code
 439    on input.  */
 440 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 441
 442 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 443
 444 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 445
 446 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 447
 448 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 449
 450 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 451
 452 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 453
 454 /* A character to be produced on output if encoding of the original
 455    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 456 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 457
 458 /* UTF-8 section */
 459 #define CODING_UTF_8_BOM(coding)        \
 460   ((coding)->spec.utf_8_bom)
 461
 462 /* UTF-16 section */
 463 #define CODING_UTF_16_BOM(coding)       \
 464   ((coding)->spec.utf_16.bom)
 465
 466 #define CODING_UTF_16_ENDIAN(coding)    \
 467   ((coding)->spec.utf_16.endian)
 468
 469 #define CODING_UTF_16_SURROGATE(coding) \
 470   ((coding)->spec.utf_16.surrogate)
 471
 472
 473 /* CCL section */
 474 #define CODING_CCL_DECODER(coding)      \
 475   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 476 #define CODING_CCL_ENCODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 478 #define CODING_CCL_VALIDS(coding)                                          \
 479   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 480
 481 /* Index for each coding category in `coding_categories' */
 482
 483 enum coding_category
 484   {
 485     coding_category_iso_7,
 486     coding_category_iso_7_tight,
 487     coding_category_iso_8_1,
 488     coding_category_iso_8_2,
 489     coding_category_iso_7_else,
 490     coding_category_iso_8_else,
 491     coding_category_utf_8_auto,
 492     coding_category_utf_8_nosig,
 493     coding_category_utf_8_sig,
 494     coding_category_utf_16_auto,
 495     coding_category_utf_16_be,
 496     coding_category_utf_16_le,
 497     coding_category_utf_16_be_nosig,
 498     coding_category_utf_16_le_nosig,
 499     coding_category_charset,
 500     coding_category_sjis,
 501     coding_category_big5,
 502     coding_category_ccl,
 503     coding_category_emacs_mule,
 504     /* All above are targets of code detection.  */
 505     coding_category_raw_text,
 506     coding_category_undecided,
 507     coding_category_max
 508   };
 509
 510 /* Definitions of flag bits used in detect_coding_XXXX.  */
 511 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 512 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 513 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 514 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 515 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 516 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 517 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 518 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 519 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 520 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 521 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 522 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 523 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 524 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 525 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 526 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 527 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 528 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 529 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 530 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 531
 532 /* This value is returned if detect_coding_mask () find nothing other
 533    than ASCII characters.  */
 534 #define CATEGORY_MASK_ANY               \
 535   (CATEGORY_MASK_ISO_7                  \
 536    | CATEGORY_MASK_ISO_7_TIGHT          \
 537    | CATEGORY_MASK_ISO_8_1              \
 538    | CATEGORY_MASK_ISO_8_2              \
 539    | CATEGORY_MASK_ISO_7_ELSE           \
 540    | CATEGORY_MASK_ISO_8_ELSE           \
 541    | CATEGORY_MASK_UTF_8_AUTO           \
 542    | CATEGORY_MASK_UTF_8_NOSIG          \
 543    | CATEGORY_MASK_UTF_8_SIG            \
 544    | CATEGORY_MASK_UTF_16_AUTO          \
 545    | CATEGORY_MASK_UTF_16_BE            \
 546    | CATEGORY_MASK_UTF_16_LE            \
 547    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 548    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 549    | CATEGORY_MASK_CHARSET              \
 550    | CATEGORY_MASK_SJIS                 \
 551    | CATEGORY_MASK_BIG5                 \
 552    | CATEGORY_MASK_CCL                  \
 553    | CATEGORY_MASK_EMACS_MULE)
 554
 555
 556 #define CATEGORY_MASK_ISO_7BIT \
 557   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 558
 559 #define CATEGORY_MASK_ISO_8BIT \
 560   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 561
 562 #define CATEGORY_MASK_ISO_ELSE \
 563   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 564
 565 #define CATEGORY_MASK_ISO_ESCAPE        \
 566   (CATEGORY_MASK_ISO_7                  \
 567    | CATEGORY_MASK_ISO_7_TIGHT          \
 568    | CATEGORY_MASK_ISO_7_ELSE           \
 569    | CATEGORY_MASK_ISO_8_ELSE)
 570
 571 #define CATEGORY_MASK_ISO       \
 572   (  CATEGORY_MASK_ISO_7BIT     \
 573      | CATEGORY_MASK_ISO_8BIT   \
 574      | CATEGORY_MASK_ISO_ELSE)
 575
 576 #define CATEGORY_MASK_UTF_16            \
 577   (CATEGORY_MASK_UTF_16_AUTO            \
 578    | CATEGORY_MASK_UTF_16_BE            \
 579    | CATEGORY_MASK_UTF_16_LE            \
 580    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 581    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 582
 583 #define CATEGORY_MASK_UTF_8     \
 584   (CATEGORY_MASK_UTF_8_AUTO     \
 585    | CATEGORY_MASK_UTF_8_NOSIG  \
 586    | CATEGORY_MASK_UTF_8_SIG)
 587
 588 /* Table of coding categories (Lisp symbols).  This variable is for
 589    internal use only.  */
 590 static Lisp_Object Vcoding_category_table;
 591
 592 /* Table of coding-categories ordered by priority.  */
 593 static enum coding_category coding_priorities[coding_category_max];
 594
 595 /* Nth element is a coding context for the coding system bound to the
 596    Nth coding category.  */
 597 static struct coding_system coding_categories[coding_category_max];
 598
 599 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 600
 601 static int
 602 encode_inhibit_flag (Lisp_Object flag)
 603 {
 604   return NILP (flag) ? -1 : EQ (flag, Qt);
 605 }
 606
 607 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 608    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 609
 610 static bool
 611 inhibit_flag (int encoded_flag, bool var)
 612 {
 613   return 0 < encoded_flag + var;
 614 }
 615
 616 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 617   do {                                                  \
 618     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 619     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 620   } while (0)
 621
 622 static void
 623 CHECK_NATNUM_CAR (Lisp_Object x)
 624 {
 625   Lisp_Object tmp = XCAR (x);
 626   CHECK_NATNUM (tmp);
 627   XSETCAR (x, tmp);
 628 }
 629
 630 static void
 631 CHECK_NATNUM_CDR (Lisp_Object x)
 632 {
 633   Lisp_Object tmp = XCDR (x);
 634   CHECK_NATNUM (tmp);
 635   XSETCDR (x, tmp);
 636 }
 637
 638 /* True if CODING's destination can be grown.  */
 639
 640 static bool
 641 growable_destination (struct coding_system *coding)
 642 {
 643   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 644 }
 645
 646
 647 /* Safely get one byte from the source text pointed by SRC which ends
 648    at SRC_END, and set C to that byte.  If there are not enough bytes
 649    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 650    and a multibyte character is found at SRC, set C to the
 651    negative value of the character code.  The caller should declare
 652    and set these variables appropriately in advance:
 653         src, src_end, multibytep */
 654
 655 #define ONE_MORE_BYTE(c)                                \
 656   do {                                                  \
 657     if (src == src_end)                                 \
 658       {                                                 \
 659         if (src_base < src)                             \
 660           record_conversion_result                      \
 661             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 662         goto no_more_source;                            \
 663       }                                                 \
 664     c = *src++;                                         \
 665     if (multibytep && (c & 0x80))                       \
 666       {                                                 \
 667         if ((c & 0xFE) == 0xC0)                         \
 668           c = ((c & 1) << 6) | *src++;                  \
 669         else                                            \
 670           {                                             \
 671             src--;                                      \
 672             c = - string_char (src, &src, NULL);        \
 673             record_conversion_result                    \
 674               (coding, CODING_RESULT_INVALID_SRC);      \
 675           }                                             \
 676       }                                                 \
 677     consumed_chars++;                                   \
 678   } while (0)
 679
 680 /* Safely get two bytes from the source text pointed by SRC which ends
 681    at SRC_END, and set C1 and C2 to those bytes while skipping the
 682    heading multibyte characters.  If there are not enough bytes in the
 683    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 684    a multibyte character is found for C2, set C2 to the negative value
 685    of the character code.  The caller should declare and set these
 686    variables appropriately in advance:
 687         src, src_end, multibytep
 688    It is intended that this macro is used in detect_coding_utf_16.  */
 689
 690 #define TWO_MORE_BYTES(c1, c2)                          \
 691   do {                                                  \
 692     do {                                                \
 693       if (src == src_end)                               \
 694         goto no_more_source;                            \
 695       c1 = *src++;                                      \
 696       if (multibytep && (c1 & 0x80))                    \
 697         {                                               \
 698           if ((c1 & 0xFE) == 0xC0)                      \
 699             c1 = ((c1 & 1) << 6) | *src++;              \
 700           else                                          \
 701             {                                           \
 702               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 703               c1 = -1;                                  \
 704             }                                           \
 705         }                                               \
 706     } while (c1 < 0);                                   \
 707     if (src == src_end)                                 \
 708       goto no_more_source;                              \
 709     c2 = *src++;                                        \
 710     if (multibytep && (c2 & 0x80))                      \
 711       {                                                 \
 712         if ((c2 & 0xFE) == 0xC0)                        \
 713           c2 = ((c2 & 1) << 6) | *src++;                \
 714         else                                            \
 715           c2 = -1;                                      \
 716       }                                                 \
 717   } while (0)
 718
 719
 720 /* Store a byte C in the place pointed by DST and increment DST to the
 721    next free point, and increment PRODUCED_CHARS.  The caller should
 722    assure that C is 0..127, and declare and set the variable `dst'
 723    appropriately in advance.
 724 */
 725
 726
 727 #define EMIT_ONE_ASCII_BYTE(c)  \
 728   do {                          \
 729     produced_chars++;           \
 730     *dst++ = (c);               \
 731   } while (0)
 732
 733
 734 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 735
 736 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 737   do {                                  \
 738     produced_chars += 2;                \
 739     *dst++ = (c1), *dst++ = (c2);       \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 745    store in an appropriate multibyte form.  The caller should
 746    declare and set the variables `dst' and `multibytep' appropriately
 747    in advance.  */
 748
 749 #define EMIT_ONE_BYTE(c)                \
 750   do {                                  \
 751     produced_chars++;                   \
 752     if (multibytep)                     \
 753       {                                 \
 754         unsigned ch = (c);              \
 755         if (ch >= 0x80)                 \
 756           ch = BYTE8_TO_CHAR (ch);      \
 757         CHAR_STRING_ADVANCE (ch, dst);  \
 758       }                                 \
 759     else                                \
 760       *dst++ = (c);                     \
 761   } while (0)
 762
 763
 764 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 765
 766 #define EMIT_TWO_BYTES(c1, c2)          \
 767   do {                                  \
 768     produced_chars += 2;                \
 769     if (multibytep)                     \
 770       {                                 \
 771         unsigned ch;                    \
 772                                         \
 773         ch = (c1);                      \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777         ch = (c2);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       {                                 \
 784         *dst++ = (c1);                  \
 785         *dst++ = (c2);                  \
 786       }                                 \
 787   } while (0)
 788
 789
 790 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 791   do {                                  \
 792     EMIT_ONE_BYTE (c1);                 \
 793     EMIT_TWO_BYTES (c2, c3);            \
 794   } while (0)
 795
 796
 797 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 798   do {                                          \
 799     EMIT_TWO_BYTES (c1, c2);                    \
 800     EMIT_TWO_BYTES (c3, c4);                    \
 801   } while (0)
 802
 803
 804 static void
 805 record_conversion_result (struct coding_system *coding,
 806                           enum coding_result_code result)
 807 {
 808   coding->result = result;
 809   switch (result)
 810     {
 811     case CODING_RESULT_INSUFFICIENT_SRC:
 812       Vlast_code_conversion_error = Qinsufficient_source;
 813       break;
 814     case CODING_RESULT_INVALID_SRC:
 815       Vlast_code_conversion_error = Qinvalid_source;
 816       break;
 817     case CODING_RESULT_INTERRUPT:
 818       Vlast_code_conversion_error = Qinterrupted;
 819       break;
 820     case CODING_RESULT_INSUFFICIENT_DST:
 821       /* Don't record this error in Vlast_code_conversion_error
 822          because it happens just temporarily and is resolved when the
 823          whole conversion is finished.  */
 824       break;
 825     case CODING_RESULT_SUCCESS:
 826       break;
 827     default:
 828       Vlast_code_conversion_error = intern ("Unknown error");
 829     }
 830 }
 831
 832 /* These wrapper macros are used to preserve validity of pointers into
 833    buffer text across calls to decode_char, encode_char, etc, which
 834    could cause relocation of buffers if it loads a charset map,
 835    because loading a charset map allocates large structures.  */
 836
 837 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 838   do {                                                                       \
 839     ptrdiff_t offset;                                                        \
 840                                                                              \
 841     charset_map_loaded = 0;                                                  \
 842     c = DECODE_CHAR (charset, code);                                         \
 843     if (charset_map_loaded                                                   \
 844         && (offset = coding_change_source (coding)))                         \
 845       {                                                                      \
 846         src += offset;                                                       \
 847         src_base += offset;                                                  \
 848         src_end += offset;                                                   \
 849       }                                                                      \
 850   } while (0)
 851
 852 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 853   do {                                                                  \
 854     ptrdiff_t offset;                                                   \
 855                                                                         \
 856     charset_map_loaded = 0;                                             \
 857     code = ENCODE_CHAR (charset, c);                                    \
 858     if (charset_map_loaded                                              \
 859         && (offset = coding_change_destination (coding)))               \
 860       {                                                                 \
 861         dst += offset;                                                  \
 862         dst_end += offset;                                              \
 863       }                                                                 \
 864   } while (0)
 865
 866 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 867   do {                                                                  \
 868     ptrdiff_t offset;                                                   \
 869                                                                         \
 870     charset_map_loaded = 0;                                             \
 871     charset = char_charset (c, charset_list, code_return);              \
 872     if (charset_map_loaded                                              \
 873         && (offset = coding_change_destination (coding)))               \
 874       {                                                                 \
 875         dst += offset;                                                  \
 876         dst_end += offset;                                              \
 877       }                                                                 \
 878   } while (0)
 879
 880 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 881   do {                                                                  \
 882     ptrdiff_t offset;                                                   \
 883                                                                         \
 884     charset_map_loaded = 0;                                             \
 885     result = CHAR_CHARSET_P (c, charset);                               \
 886     if (charset_map_loaded                                              \
 887         && (offset = coding_change_destination (coding)))               \
 888       {                                                                 \
 889         dst += offset;                                                  \
 890         dst_end += offset;                                              \
 891       }                                                                 \
 892   } while (0)
 893
 894
 895 /* If there are at least BYTES length of room at dst, allocate memory
 896    for coding->destination and update dst and dst_end.  We don't have
 897    to take care of coding->source which will be relocated.  It is
 898    handled by calling coding_set_source in encode_coding.  */
 899
 900 #define ASSURE_DESTINATION(bytes)                               \
 901   do {                                                          \
 902     if (dst + (bytes) >= dst_end)                               \
 903       {                                                         \
 904         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 905                                                                 \
 906         dst = alloc_destination (coding, more_bytes, dst);      \
 907         dst_end = coding->destination + coding->dst_bytes;      \
 908       }                                                         \
 909   } while (0)
 910
 911
 912 /* Store multibyte form of the character C in P, and advance P to the
 913    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 914    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 915    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 916
 917 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 918
 919 /* Return the character code of character whose multibyte form is at
 920    P, and advance P to the end of the multibyte form.  This used to be
 921    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 922    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 923
 924 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 925
 926 /* Set coding->source from coding->src_object.  */
 927
 928 static void
 929 coding_set_source (struct coding_system *coding)
 930 {
 931   if (BUFFERP (coding->src_object))
 932     {
 933       struct buffer *buf = XBUFFER (coding->src_object);
 934
 935       if (coding->src_pos < 0)
 936         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 937       else
 938         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 939     }
 940   else if (STRINGP (coding->src_object))
 941     {
 942       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 943     }
 944   else
 945     {
 946       /* Otherwise, the source is C string and is never relocated
 947          automatically.  Thus we don't have to update anything.  */
 948     }
 949 }
 950
 951
 952 /* Set coding->source from coding->src_object, and return how many
 953    bytes coding->source was changed.  */
 954
 955 static ptrdiff_t
 956 coding_change_source (struct coding_system *coding)
 957 {
 958   const unsigned char *orig = coding->source;
 959   coding_set_source (coding);
 960   return coding->source - orig;
 961 }
 962
 963
 964 /* Set coding->destination from coding->dst_object.  */
 965
 966 static void
 967 coding_set_destination (struct coding_system *coding)
 968 {
 969   if (BUFFERP (coding->dst_object))
 970     {
 971       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 972         {
 973           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 974           coding->dst_bytes = (GAP_END_ADDR
 975                                - (coding->src_bytes - coding->consumed)
 976                                - coding->destination);
 977         }
 978       else
 979         {
 980           /* We are sure that coding->dst_pos_byte is before the gap
 981              of the buffer. */
 982           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 983                                  + coding->dst_pos_byte - BEG_BYTE);
 984           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 985                                - coding->destination);
 986         }
 987     }
 988   else
 989     {
 990       /* Otherwise, the destination is C string and is never relocated
 991          automatically.  Thus we don't have to update anything.  */
 992     }
 993 }
 994
 995
 996 /* Set coding->destination from coding->dst_object, and return how
 997    many bytes coding->destination was changed.  */
 998
 999 static ptrdiff_t
1000 coding_change_destination (struct coding_system *coding)
1001 {
1002   const unsigned char *orig = coding->destination;
1003   coding_set_destination (coding);
1004   return coding->destination - orig;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1010 {
1011   ptrdiff_t newbytes;
1012   if (INT_ADD_WRAPV (coding->dst_bytes, bytes, &newbytes)
1013       || SIZE_MAX < newbytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination, newbytes);
1016   coding->dst_bytes = newbytes;
1017 }
1018
1019 static void
1020 coding_alloc_by_making_gap (struct coding_system *coding,
1021                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1022 {
1023   if (EQ (coding->src_object, coding->dst_object))
1024     {
1025       /* The gap may contain the produced data at the head and not-yet
1026          consumed data at the tail.  To preserve those data, we at
1027          first make the gap size to zero, then increase the gap
1028          size.  */
1029       ptrdiff_t add = GAP_SIZE;
1030
1031       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1032       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1033       make_gap (bytes);
1034       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1035       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1036     }
1037   else
1038     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1039 }
1040
1041
1042 static unsigned char *
1043 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1044                    unsigned char *dst)
1045 {
1046   ptrdiff_t offset = dst - coding->destination;
1047
1048   if (BUFFERP (coding->dst_object))
1049     {
1050       struct buffer *buf = XBUFFER (coding->dst_object);
1051
1052       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1053     }
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   coding_set_destination (coding);
1057   dst = coding->destination + offset;
1058   return dst;
1059 }
1060
1061 /** Macros for annotations.  */
1062
1063 /* An annotation data is stored in the array coding->charbuf in this
1064    format:
1065      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1066    LENGTH is the number of elements in the annotation.
1067    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1068    NCHARS is the number of characters in the text annotated.
1069
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1074      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1075
1076    NBYTES is the number of bytes specified in the header part of
1077    old-style emacs-mule encoding, or 0 for the other kind of
1078    composition.
1079
1080    METHOD is one of enum composition_method.
1081
1082    Optional COMPOSITION-COMPONENTS are characters and composition
1083    rules.
1084
1085    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1086    follows.
1087
1088    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1089    recover from an invalid annotation, and should be skipped by
1090    produce_annotation.  */
1091
/* Maximum length of the header of annotation data.  The value equals
   the LENGTH that ADD_COMPOSITION_DATA records: the three common
   elements (-LENGTH, ANNOTATION_MASK, NCHARS) plus NBYTES and
   METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1094
/* Append the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark CODING as carrying annotations.
   The caller must have a variable `coding' in scope.  The expansion
   intentionally does not end in a semicolon, so that the macro acts
   as one ordinary statement and can be used, for instance, as the
   body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1102
/* Append a composition annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS)
   followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    (buf)[0] = (nbytes);                                                    \
    (buf)[1] = (method);                                                    \
    (buf) += 2;                                                             \
  } while (0)
1109
1110
/* Append a charset annotation at BUF and advance BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    (buf)[0] = (id);                                                    \
    (buf) += 1;                                                         \
  } while (0)
1116
1117
/* Bit flags for coding->eol_seen.  One bit per end-of-line form, so
   that several forms can be recorded together by OR-ing.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Return true if a text is encoded in UTF-8.  */
1135
/* Classify the octet C by its high-order bits: a value below 0x80
   stands alone, 10xxxxxx is a trailing octet, and 110xxxxx, 1110xxxx,
   11110xxx and 111110xx lead sequences of 2, 3, 4 and 5 octets.  */
#define UTF_8_1_OCTET_P(c)         (0x80 > (c))
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))

/* The three octets of the UTF-8 signature (BOM), in the order in
   which they occur.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1146
1147 /* Unlike the other detect_coding_XXX, this function counts the number
1148    of characters and checks the EOL format.  */
1149
1150 static bool
1151 detect_coding_utf_8 (struct coding_system *coding,
1152                      struct coding_detection_info *detect_info)
1153 {
1154   const unsigned char *src = coding->source, *src_base;
1155   const unsigned char *src_end = coding->source + coding->src_bytes;
1156   bool multibytep = coding->src_multibyte;
1157   ptrdiff_t consumed_chars = 0;
1158   bool bom_found = 0;
1159   ptrdiff_t nchars = coding->head_ascii;
1160   int eol_seen = coding->eol_seen;
1161
1162   detect_info->checked |= CATEGORY_MASK_UTF_8;
1163   /* A coding system of this category is always ASCII compatible.  */
1164   src += nchars;
1165
1166   if (src == coding->source     /* BOM should be at the head.  */
1167       && src + 3 < src_end      /* BOM is 3-byte long.  */
1168       && src[0] == UTF_8_BOM_1
1169       && src[1] == UTF_8_BOM_2
1170       && src[2] == UTF_8_BOM_3)
1171     {
1172       bom_found = 1;
1173       src += 3;
1174       nchars++;
1175     }
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4;
1180
1181       src_base = src;
1182       ONE_MORE_BYTE (c);
1183       if (c < 0 || UTF_8_1_OCTET_P (c))
1184         {
1185           nchars++;
1186           if (c == '\r')
1187             {
1188               if (src < src_end && *src == '\n')
1189                 {
1190                   eol_seen |= EOL_SEEN_CRLF;
1191                   src++;
1192                   nchars++;
1193                 }
1194               else
1195                 eol_seen |= EOL_SEEN_CR;
1196             }
1197           else if (c == '\n')
1198             eol_seen |= EOL_SEEN_LF;
1199           continue;
1200         }
1201       ONE_MORE_BYTE (c1);
1202       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1203         break;
1204       if (UTF_8_2_OCTET_LEADING_P (c))
1205         {
1206           nchars++;
1207           continue;
1208         }
1209       ONE_MORE_BYTE (c2);
1210       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1211         break;
1212       if (UTF_8_3_OCTET_LEADING_P (c))
1213         {
1214           nchars++;
1215           continue;
1216         }
1217       ONE_MORE_BYTE (c3);
1218       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1219         break;
1220       if (UTF_8_4_OCTET_LEADING_P (c))
1221         {
1222           nchars++;
1223           continue;
1224         }
1225       ONE_MORE_BYTE (c4);
1226       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227         break;
1228       if (UTF_8_5_OCTET_LEADING_P (c))
1229         {
1230           nchars++;
1231           continue;
1232         }
1233       break;
1234     }
1235   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1236   return 0;
1237
1238  no_more_source:
1239   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1240     {
1241       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1242       return 0;
1243     }
1244   if (bom_found)
1245     {
1246       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1247       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1248     }
1249   else
1250     {
1251       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1252       if (nchars < src_end - coding->source)
1253         /* The found characters are less than source bytes, which
1254            means that we found a valid non-ASCII characters.  */
1255         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1256     }
1257   coding->detected_utf8_bytes = src_base - coding->source;
1258   coding->detected_utf8_chars = nchars;
1259   return 1;
1260 }
1261
1262
1263 static void
1264 decode_coding_utf_8 (struct coding_system *coding)
1265 {
1266   const unsigned char *src = coding->source + coding->consumed;
1267   const unsigned char *src_end = coding->source + coding->src_bytes;
1268   const unsigned char *src_base;
1269   int *charbuf = coding->charbuf + coding->charbuf_used;
1270   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1271   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1272   bool multibytep = coding->src_multibyte;
1273   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1274   bool eol_dos
1275     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1276   int byte_after_cr = -1;
1277
1278   if (bom != utf_without_bom)
1279     {
1280       int c1, c2, c3;
1281
1282       src_base = src;
1283       ONE_MORE_BYTE (c1);
1284       if (! UTF_8_3_OCTET_LEADING_P (c1))
1285         src = src_base;
1286       else
1287         {
1288           ONE_MORE_BYTE (c2);
1289           if (! UTF_8_EXTRA_OCTET_P (c2))
1290             src = src_base;
1291           else
1292             {
1293               ONE_MORE_BYTE (c3);
1294               if (! UTF_8_EXTRA_OCTET_P (c3))
1295                 src = src_base;
1296               else
1297                 {
1298                   if ((c1 != UTF_8_BOM_1)
1299                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1300                     src = src_base;
1301                   else
1302                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1303                 }
1304             }
1305         }
1306     }
1307   CODING_UTF_8_BOM (coding) = utf_without_bom;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4, c5;
1312
1313       src_base = src;
1314       consumed_chars_base = consumed_chars;
1315
1316       if (charbuf >= charbuf_end)
1317         {
1318           if (byte_after_cr >= 0)
1319             src_base--;
1320           break;
1321         }
1322
1323       /* In the simple case, rapidly handle ordinary characters */
1324       if (multibytep && ! eol_dos
1325           && charbuf < charbuf_end - 6 && src < src_end - 6)
1326         {
1327           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1328             {
1329               c1 = *src;
1330               if (c1 & 0x80)
1331                 break;
1332               src++;
1333               consumed_chars++;
1334               *charbuf++ = c1;
1335
1336               c1 = *src;
1337               if (c1 & 0x80)
1338                 break;
1339               src++;
1340               consumed_chars++;
1341               *charbuf++ = c1;
1342
1343               c1 = *src;
1344               if (c1 & 0x80)
1345                 break;
1346               src++;
1347               consumed_chars++;
1348               *charbuf++ = c1;
1349
1350               c1 = *src;
1351               if (c1 & 0x80)
1352                 break;
1353               src++;
1354               consumed_chars++;
1355               *charbuf++ = c1;
1356             }
1357           /* If we handled at least one character, restart the main loop.  */
1358           if (src != src_base)
1359             continue;
1360         }
1361
1362       if (byte_after_cr >= 0)
1363         c1 = byte_after_cr, byte_after_cr = -1;
1364       else
1365         ONE_MORE_BYTE (c1);
1366       if (c1 < 0)
1367         {
1368           c = - c1;
1369         }
1370       else if (UTF_8_1_OCTET_P (c1))
1371         {
1372           if (eol_dos && c1 == '\r')
1373             ONE_MORE_BYTE (byte_after_cr);
1374           c = c1;
1375         }
1376       else
1377         {
1378           ONE_MORE_BYTE (c2);
1379           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1380             goto invalid_code;
1381           if (UTF_8_2_OCTET_LEADING_P (c1))
1382             {
1383               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1384               /* Reject overlong sequences here and below.  Encoders
1385                  producing them are incorrect, they can be misleading,
1386                  and they mess up read/write invariance.  */
1387               if (c < 128)
1388                 goto invalid_code;
1389             }
1390           else
1391             {
1392               ONE_MORE_BYTE (c3);
1393               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1394                 goto invalid_code;
1395               if (UTF_8_3_OCTET_LEADING_P (c1))
1396                 {
1397                   c = (((c1 & 0xF) << 12)
1398                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1399                   if (c < 0x800
1400                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1401                     goto invalid_code;
1402                 }
1403               else
1404                 {
1405                   ONE_MORE_BYTE (c4);
1406                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1407                     goto invalid_code;
1408                   if (UTF_8_4_OCTET_LEADING_P (c1))
1409                     {
1410                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1411                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1412                     if (c < 0x10000)
1413                       goto invalid_code;
1414                     }
1415                   else
1416                     {
1417                       ONE_MORE_BYTE (c5);
1418                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1419                         goto invalid_code;
1420                       if (UTF_8_5_OCTET_LEADING_P (c1))
1421                         {
1422                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1423                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1424                                | (c5 & 0x3F));
1425                           if ((c > MAX_CHAR) || (c < 0x200000))
1426                             goto invalid_code;
1427                         }
1428                       else
1429                         goto invalid_code;
1430                     }
1431                 }
1432             }
1433         }
1434
1435       *charbuf++ = c;
1436       continue;
1437
1438     invalid_code:
1439       src = src_base;
1440       consumed_chars = consumed_chars_base;
1441       ONE_MORE_BYTE (c);
1442       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1443     }
1444
1445  no_more_source:
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451
1452 static bool
1453 encode_coding_utf_8 (struct coding_system *coding)
1454 {
1455   bool multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   ptrdiff_t produced_chars = 0;
1461   int c;
1462
1463   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1464     {
1465       ASSURE_DESTINATION (3);
1466       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1467       CODING_UTF_8_BOM (coding) = utf_without_bom;
1468     }
1469
1470   if (multibytep)
1471     {
1472       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1473
1474       while (charbuf < charbuf_end)
1475         {
1476           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1477
1478           ASSURE_DESTINATION (safe_room);
1479           c = *charbuf++;
1480           if (CHAR_BYTE8_P (c))
1481             {
1482               c = CHAR_TO_BYTE8 (c);
1483               EMIT_ONE_BYTE (c);
1484             }
1485           else
1486             {
1487               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1488               for (p = str; p < pend; p++)
1489                 EMIT_ONE_BYTE (*p);
1490             }
1491         }
1492     }
1493   else
1494     {
1495       int safe_room = MAX_MULTIBYTE_LENGTH;
1496
1497       while (charbuf < charbuf_end)
1498         {
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             *dst++ = CHAR_TO_BYTE8 (c);
1503           else
1504             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1505         }
1506       produced_chars = dst - (coding->destination + coding->produced);
1507     }
1508   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1509   coding->produced_char += produced_chars;
1510   coding->produced = dst - coding->destination;
1511   return 0;
1512 }
1513
1514
1515 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1516    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1517
/* True if the 16-bit unit VAL lies in 0xD800..0xDBFF (high
   surrogate), respectively in 0xDC00..0xDFFF (low surrogate).  */

#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))
1523
1524
1525 static bool
1526 detect_coding_utf_16 (struct coding_system *coding,
1527                       struct coding_detection_info *detect_info)
1528 {
1529   const unsigned char *src = coding->source;
1530   const unsigned char *src_end = coding->source + coding->src_bytes;
1531   bool multibytep = coding->src_multibyte;
1532   int c1, c2;
1533
1534   detect_info->checked |= CATEGORY_MASK_UTF_16;
1535   if (coding->mode & CODING_MODE_LAST_BLOCK
1536       && (coding->src_chars & 1))
1537     {
1538       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1539       return 0;
1540     }
1541
1542   TWO_MORE_BYTES (c1, c2);
1543   if ((c1 == 0xFF) && (c2 == 0xFE))
1544     {
1545       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1546                              | CATEGORY_MASK_UTF_16_AUTO);
1547       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1548                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1549                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1550     }
1551   else if ((c1 == 0xFE) && (c2 == 0xFF))
1552     {
1553       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1554                              | CATEGORY_MASK_UTF_16_AUTO);
1555       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1556                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1557                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1558     }
1559   else if (c2 < 0)
1560     {
1561       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1562       return 0;
1563     }
1564   else
1565     {
1566       /* We check the dispersion of Eth and Oth bytes where E is even and
1567          O is odd.  If both are high, we assume binary data.*/
1568       unsigned char e[256], o[256];
1569       unsigned e_num = 1, o_num = 1;
1570
1571       memset (e, 0, 256);
1572       memset (o, 0, 256);
1573       e[c1] = 1;
1574       o[c2] = 1;
1575
1576       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1577                                 |CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_LE);
1579
1580       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1581              != CATEGORY_MASK_UTF_16)
1582         {
1583           TWO_MORE_BYTES (c1, c2);
1584           if (c2 < 0)
1585             break;
1586           if (! e[c1])
1587             {
1588               e[c1] = 1;
1589               e_num++;
1590               if (e_num >= 128)
1591                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1592             }
1593           if (! o[c2])
1594             {
1595               o[c2] = 1;
1596               o_num++;
1597               if (o_num >= 128)
1598                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1599             }
1600         }
1601       return 0;
1602     }
1603
1604  no_more_source:
1605   return 1;
1606 }
1607
1608 static void
1609 decode_coding_utf_16 (struct coding_system *coding)
1610 {
1611   const unsigned char *src = coding->source + coding->consumed;
1612   const unsigned char *src_end = coding->source + coding->src_bytes;
1613   const unsigned char *src_base;
1614   int *charbuf = coding->charbuf + coding->charbuf_used;
1615   /* We may produces at most 3 chars in one loop.  */
1616   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1617   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1618   bool multibytep = coding->src_multibyte;
1619   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1620   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1621   int surrogate = CODING_UTF_16_SURROGATE (coding);
1622   bool eol_dos
1623     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1624   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1625
1626   if (bom == utf_with_bom)
1627     {
1628       int c, c1, c2;
1629
1630       src_base = src;
1631       ONE_MORE_BYTE (c1);
1632       ONE_MORE_BYTE (c2);
1633       c = (c1 << 8) | c2;
1634
1635       if (endian == utf_16_big_endian
1636           ? c != 0xFEFF : c != 0xFFFE)
1637         {
1638           /* The first two bytes are not BOM.  Treat them as bytes
1639              for a normal character.  */
1640           src = src_base;
1641         }
1642       CODING_UTF_16_BOM (coding) = utf_without_bom;
1643     }
1644   else if (bom == utf_detect_bom)
1645     {
1646       /* We have already tried to detect BOM and failed in
1647          detect_coding.  */
1648       CODING_UTF_16_BOM (coding) = utf_without_bom;
1649     }
1650
1651   while (1)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       consumed_chars_base = consumed_chars;
1657
1658       if (charbuf >= charbuf_end)
1659         {
1660           if (byte_after_cr1 >= 0)
1661             src_base -= 2;
1662           break;
1663         }
1664
1665       if (byte_after_cr1 >= 0)
1666         c1 = byte_after_cr1, byte_after_cr1 = -1;
1667       else
1668         ONE_MORE_BYTE (c1);
1669       if (c1 < 0)
1670         {
1671           *charbuf++ = -c1;
1672           continue;
1673         }
1674       if (byte_after_cr2 >= 0)
1675         c2 = byte_after_cr2, byte_after_cr2 = -1;
1676       else
1677         ONE_MORE_BYTE (c2);
1678       if (c2 < 0)
1679         {
1680           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1681           *charbuf++ = -c2;
1682           continue;
1683         }
1684       c = (endian == utf_16_big_endian
1685            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1686
1687       if (surrogate)
1688         {
1689           if (! UTF_16_LOW_SURROGATE_P (c))
1690             {
1691               if (endian == utf_16_big_endian)
1692                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1693               else
1694                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1695               *charbuf++ = c1;
1696               *charbuf++ = c2;
1697               if (UTF_16_HIGH_SURROGATE_P (c))
1698                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1699               else
1700                 *charbuf++ = c;
1701             }
1702           else
1703             {
1704               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1705               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1706               *charbuf++ = 0x10000 + c;
1707             }
1708         }
1709       else
1710         {
1711           if (UTF_16_HIGH_SURROGATE_P (c))
1712             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1713           else
1714             {
1715               if (eol_dos && c == '\r')
1716                 {
1717                   ONE_MORE_BYTE (byte_after_cr1);
1718                   ONE_MORE_BYTE (byte_after_cr2);
1719                 }
1720               *charbuf++ = c;
1721             }
1722         }
1723     }
1724
1725  no_more_source:
1726   coding->consumed_char += consumed_chars_base;
1727   coding->consumed = src_base - coding->source;
1728   coding->charbuf_used = charbuf - coding->charbuf;
1729 }
1730
1731 static bool
1732 encode_coding_utf_16 (struct coding_system *coding)
1733 {
1734   bool multibytep = coding->dst_multibyte;
1735   int *charbuf = coding->charbuf;
1736   int *charbuf_end = charbuf + coding->charbuf_used;
1737   unsigned char *dst = coding->destination + coding->produced;
1738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1739   int safe_room = 8;
1740   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1741   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1742   ptrdiff_t produced_chars = 0;
1743   int c;
1744
1745   if (bom != utf_without_bom)
1746     {
1747       ASSURE_DESTINATION (safe_room);
1748       if (big_endian)
1749         EMIT_TWO_BYTES (0xFE, 0xFF);
1750       else
1751         EMIT_TWO_BYTES (0xFF, 0xFE);
1752       CODING_UTF_16_BOM (coding) = utf_without_bom;
1753     }
1754
1755   while (charbuf < charbuf_end)
1756     {
1757       ASSURE_DESTINATION (safe_room);
1758       c = *charbuf++;
1759       if (c > MAX_UNICODE_CHAR)
1760         c = coding->default_char;
1761
1762       if (c < 0x10000)
1763         {
1764           if (big_endian)
1765             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1766           else
1767             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1768         }
1769       else
1770         {
1771           int c1, c2;
1772
1773           c -= 0x10000;
1774           c1 = (c >> 10) + 0xD800;
1775           c2 = (c & 0x3FF) + 0xDC00;
1776           if (big_endian)
1777             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1778           else
1779             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1780         }
1781     }
1782   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1783   coding->produced = dst - coding->destination;
1784   coding->produced_char += produced_chars;
1785   return 0;
1786 }
1787
1788 \f
1789 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1790
1791 /* Emacs' internal format for representation of multiple character
1792    sets is a kind of multi-byte encoding, i.e. characters are
1793    represented by variable-length sequences of one-byte codes.
1794
1795    ASCII characters and control characters (e.g. `tab', `newline') are
1796    represented by one-byte sequences which are their ASCII codes, in
1797    the range 0x00 through 0x7F.
1798
1799    8-bit characters of the range 0x80..0x9F are represented by
1800    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1801    code + 0x20).
1802
1803    8-bit characters of the range 0xA0..0xFF are represented by
1804    one-byte sequences which are their 8-bit code.
1805
1806    The other characters are represented by a sequence of `base
1807    leading-code', optional `extended leading-code', and one or two
1808    `position-code's.  The length of the sequence is determined by the
1809    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1810    whereas extended leading-code and position-code take the range 0xA0
1811    through 0xFF.  See `charset.h' for more details about leading-code
1812    and position-code.
1813
1814    --- CODE RANGE of Emacs' internal format ---
1815    character set        range
1816    -------------        -----
1817    ascii                0x00..0x7F
1818    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1819    eight-bit-graphic    0xA0..0xBF
1820    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1821    ---------------------------------------------
1822
1823    As this is the internal character representation, the format is
1824    usually not used externally (i.e. in a file or in a data sent to a
1825    process).  But, it is possible to have a text externally in this
1826    format (i.e. by encoding by the coding system `emacs-mule').
1827
1828    In that case, a sequence of one-byte codes has a slightly different
1829    form.
1830
1831    At first, all characters in eight-bit-control are represented by
1832    one-byte sequences which are their 8-bit code.
1833
1834    Next, character composition data are represented by the byte
1835    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1836    where,
1837         METHOD is 0xF2 plus a composition method value (enum
1838         composition_method),
1839
1840         BYTES is 0xA0 plus a byte length of this composition data,
1841
1842         CHARS is 0xA0 plus a number of characters composed by this
1843         data,
1844
1845         COMPONENTs are characters of multibyte form or composition
1846         rules encoded by two-byte of ASCII codes.
1847
1848    In addition, for backward compatibility, the following formats are
1849    also recognized as composition data on decoding.
1850
1851    0x80 MSEQ ...
1852    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1853
1854    Here,
1855         MSEQ is a multibyte form, but in one of these special formats:
1856           ASCII: 0xA0 ASCII_CODE+0x80,
1857           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1858         RULE is a one byte code of the range 0xA0..0xF0 that
1859         represents a composition rule.
1860   */
1861
/* Table indexed by a byte value.  The code below reads an entry as the
   total byte length of the emacs-mule sequence that starts with that
   byte: detect_coding_emacs_mule expects (entry - 1) following bytes,
   and emacs_mule_char handles the values 1 through 4 and aborts on
   anything else.
   NOTE(review): nothing in this section stores into the table;
   presumably it is filled in elsewhere -- confirm.  */
1862 char emacs_mule_bytes[256];
1863
1864
1865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1866    Return true if a text is encoded in 'emacs-mule'.  */
1867
1868 static bool
1869 detect_coding_emacs_mule (struct coding_system *coding,
1870                           struct coding_detection_info *detect_info)
1871 {
1872   const unsigned char *src = coding->source, *src_base;
1873   const unsigned char *src_end = coding->source + coding->src_bytes;
1874   bool multibytep = coding->src_multibyte;
1875   ptrdiff_t consumed_chars = 0;
1876   int c;
1877   int found = 0;
1878
1879   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1880   /* A coding system of this category is always ASCII compatible.  */
1881   src += coding->head_ascii;
1882
1883   while (1)
1884     {
1885       src_base = src;
1886       ONE_MORE_BYTE (c);
1887       if (c < 0)
1888         continue;
1889       if (c == 0x80)
1890         {
1891           /* Perhaps the start of composite character.  We simply skip
1892              it because analyzing it is too heavy for detecting.  But,
1893              at least, we check that the composite character
1894              constitutes of more than 4 bytes.  */
1895           const unsigned char *src_start;
1896
1897         repeat:
1898           src_start = src;
1899           do
1900             {
1901               ONE_MORE_BYTE (c);
1902             }
1903           while (c >= 0xA0);
1904
1905           if (src - src_start <= 4)
1906             break;
1907           found = CATEGORY_MASK_EMACS_MULE;
1908           if (c == 0x80)
1909             goto repeat;
1910         }
1911
1912       if (c < 0x80)
1913         {
1914           if (c < 0x20
1915               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1916             break;
1917         }
1918       else
1919         {
1920           int more_bytes = emacs_mule_bytes[c] - 1;
1921
1922           while (more_bytes > 0)
1923             {
1924               ONE_MORE_BYTE (c);
1925               if (c < 0xA0)
1926                 {
1927                   src--;        /* Unread the last byte.  */
1928                   break;
1929                 }
1930               more_bytes--;
1931             }
1932           if (more_bytes != 0)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935         }
1936     }
1937   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938   return 0;
1939
1940  no_more_source:
1941   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1942     {
1943       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1944       return 0;
1945     }
1946   detect_info->found |= found;
1947   return 1;
1948 }
1949
1950
1951 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1952    character.  If CMP_STATUS indicates that we must expect MSEQ or
1953    RULE described above, decode it and return the negative value of
1954    the decoded character or rule.  If an invalid byte is found, return
1955    -1.  If SRC is too short, return -2.  */
1956
1957 static int
1958 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1959                  int *nbytes, int *nchars, int *id,
1960                  struct composition_status *cmp_status)
1961 {
1962   const unsigned char *src_end = coding->source + coding->src_bytes;
1963   const unsigned char *src_base = src;
1964   bool multibytep = coding->src_multibyte;
1965   int charset_ID;
1966   unsigned code;
1967   int c;
1968   ptrdiff_t consumed_chars = 0;
1969   bool mseq_found = 0;
1970
1971   ONE_MORE_BYTE (c);
1972   if (c < 0)
1973     {
1974       c = -c;
1975       charset_ID = emacs_mule_charset[0];
1976     }
1977   else
1978     {
1979       if (c >= 0xA0)
1980         {
1981           if (cmp_status->state != COMPOSING_NO
1982               && cmp_status->old_form)
1983             {
1984               if (cmp_status->state == COMPOSING_CHAR)
1985                 {
1986                   if (c == 0xA0)
1987                     {
1988                       ONE_MORE_BYTE (c);
1989                       c -= 0x80;
1990                       if (c < 0)
1991                         goto invalid_code;
1992                     }
1993                   else
1994                     c -= 0x20;
1995                   mseq_found = 1;
1996                 }
1997               else
1998                 {
1999                   *nbytes = src - src_base;
2000                   *nchars = consumed_chars;
2001                   return -c;
2002                 }
2003             }
2004           else
2005             goto invalid_code;
2006         }
2007
2008       switch (emacs_mule_bytes[c])
2009         {
2010         case 2:
2011           if ((charset_ID = emacs_mule_charset[c]) < 0)
2012             goto invalid_code;
2013           ONE_MORE_BYTE (c);
2014           if (c < 0xA0)
2015             goto invalid_code;
2016           code = c & 0x7F;
2017           break;
2018
2019         case 3:
2020           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2021               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2022             {
2023               ONE_MORE_BYTE (c);
2024               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2025                 goto invalid_code;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code = c & 0x7F;
2030             }
2031           else
2032             {
2033               if ((charset_ID = emacs_mule_charset[c]) < 0)
2034                 goto invalid_code;
2035               ONE_MORE_BYTE (c);
2036               if (c < 0xA0)
2037                 goto invalid_code;
2038               code = (c & 0x7F) << 8;
2039               ONE_MORE_BYTE (c);
2040               if (c < 0xA0)
2041                 goto invalid_code;
2042               code |= c & 0x7F;
2043             }
2044           break;
2045
2046         case 4:
2047           ONE_MORE_BYTE (c);
2048           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2049             goto invalid_code;
2050           ONE_MORE_BYTE (c);
2051           if (c < 0xA0)
2052             goto invalid_code;
2053           code = (c & 0x7F) << 8;
2054           ONE_MORE_BYTE (c);
2055           if (c < 0xA0)
2056             goto invalid_code;
2057           code |= c & 0x7F;
2058           break;
2059
2060         case 1:
2061           code = c;
2062           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2063           break;
2064
2065         default:
2066           emacs_abort ();
2067         }
2068       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2069                           CHARSET_FROM_ID (charset_ID), code, c);
2070       if (c < 0)
2071         goto invalid_code;
2072     }
2073   *nbytes = src - src_base;
2074   *nchars = consumed_chars;
2075   if (id)
2076     *id = charset_ID;
2077   return (mseq_found ? -c : c);
2078
2079  no_more_source:
2080   return -2;
2081
2082  invalid_code:
2083   return -1;
2084 }
2085
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2088
2089 /* Handle these composition sequences ('|': the end of header elements,
2090    BYTES and CHARS >= 0xA0):
2091
2092    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2093    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2094    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2095
2096    and these old forms:
2097
2098    (4) relative composition: 0x80 | MSEQ ... MSEQ
2099    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2100
2101    When the starter 0x80 and the following header elements are found,
2102    this annotation header is produced.
2103
2104         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2105
2106    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2107    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108
2109    Then, upon reading the following elements, these codes are produced
2110    until the composition end is found:
2111
2112    (1) CHAR ... CHAR
2113    (2) ALT ... ALT CHAR ... CHAR
2114    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2115    (4) CHAR ... CHAR
2116    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2117
2118    When the composition end is found, LENGTH and NCHARS in the
2119    annotation header are updated as below:
2120
2121    (1) LENGTH: unchanged, NCHARS: unchanged
2122    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2123    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2125    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2126
2127    If an error is found while composing, the annotation header is
2128    changed to the original composition header (plus filler -1s) as
2129    below:
2130
2131    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2132    (5)          [ 0x80 0xFF -1 -1 -1 ]
2133
2134    and the sequence [ -2 DECODED-RULE ] is changed to the original
2135    byte sequence as below:
2136         o the original byte sequence is B: [ B -1 ]
2137         o the original byte sequence is B1 B2: [ B1 B2 ]
2138
2139    Most of the routines are implemented by macros because many
2140    variables and labels in the caller decode_coding_emacs_mule must be
2141    accessible, and they are usually called just once (thus doesn't
2142    increase the size of compiled object).  */
2143
/* Decode C, a rule byte of an Emacs 20 style composition sequence,
   and store the resulting rule in RULE.  C is reduced by 0xA0 in
   place; a result outside 0..80 jumps to the caller's `invalid_code'
   label.  The two reference points are C / 9 and C % 9, with the
   value 4 replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = (c / 9 == 4) ? 10 : c / 9;                   \
    nref = (c % 9 == 4) ? 10 : c % 9;                   \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2160
2161
/* Decode a rule of an Emacs 21 style composition sequence and store
   it in RULE.  The rule is given by C and by the following byte,
   which is read from SRC into C; each byte minus 0x20 must be in
   0..80, otherwise control jumps to the caller's `invalid_code'
   label.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = c - 0x20;                                    \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2179
2180
/* Start of Emacs 21 style format.  C holds the method byte, which is
   0xF2 plus the composition method.  The next two bytes are read from
   SRC: 0xA0 plus BYTES and 0xA0 plus CHARS, where BYTES is the byte
   length of this composition information and CHARS is the number of
   characters composed by this composition.  On success CMP_STATUS is
   set up and the annotation header is added to CHARBUF; a bad header
   jumps to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2212
2213
/* Start of Emacs 20 style format for relative composition.  Reset
   CMP_STATUS to an old-form relative composition with no characters
   yet, and add an annotation header with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_RELATIVE); \
  } while (0)
2225
2226
/* Start of Emacs 20 style format for rule-base composition.  Reset
   CMP_STATUS to an old-form composition with rules and no characters
   yet, and add an annotation header with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()              \
  do {                                                           \
    cmp_status->old_form = 1;                                    \
    cmp_status->method = COMPOSITION_WITH_RULE;                  \
    cmp_status->state = COMPOSING_CHAR;                          \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                  \
    cmp_status->ncomps = 0;                                      \
    cmp_status->nchars = 0;                                      \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, COMPOSITION_WITH_RULE); \
  } while (0)
2238
2239
/* Read the byte that follows a composition starter into C and begin
   the matching kind of composition:
     0xF2 plus a composition method  -- Emacs 21 style header,
     0xA0..0xBF                      -- Emacs 20 relative composition,
     0xFF                            -- Emacs 20 rule-base composition.
   Anything else jumps to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* C is itself the first component; rewind so   \
           that it is read again.  */                   \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2263
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition its third element receives the number of characters
   seen; for a new-form composition other than a relative one its
   first element becomes the third element minus the length.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *hdr = charbuf - cmp_status->length;                    \
                                                                \
    if (cmp_status->old_form)                                   \
      hdr[2] = cmp_status->nchars;                              \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      hdr[0] = hdr[2] - cmp_status->length;                     \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2274
2275
2276 static int
2277 emacs_mule_finish_composition (int *charbuf,
2278                                struct composition_status *cmp_status)
2279 {
2280   int idx = - cmp_status->length;
2281   int new_chars;
2282
2283   if (cmp_status->old_form && cmp_status->nchars > 0)
2284     {
2285       charbuf[idx + 2] = cmp_status->nchars;
2286       new_chars = 0;
2287       if (cmp_status->method == COMPOSITION_WITH_RULE
2288           && cmp_status->state == COMPOSING_CHAR)
2289         {
2290           /* The last rule was invalid.  */
2291           int rule = charbuf[-1] + 0xA0;
2292
2293           charbuf[-2] = BYTE8_TO_CHAR (rule);
2294           charbuf[-1] = -1;
2295           new_chars = 1;
2296         }
2297     }
2298   else
2299     {
2300       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2301
2302       if (cmp_status->method == COMPOSITION_WITH_RULE)
2303         {
2304           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2305           charbuf[idx++] = -3;
2306           charbuf[idx++] = 0;
2307           new_chars = 1;
2308         }
2309       else
2310         {
2311           int nchars = charbuf[idx + 1] + 0xA0;
2312           int nbytes = charbuf[idx + 2] + 0xA0;
2313
2314           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2315           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2317           charbuf[idx++] = -1;
2318           new_chars = 4;
2319         }
2320     }
2321   cmp_status->state = COMPOSING_NO;
2322   return new_chars;
2323 }
2324
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2330
2331
2332 static void
2333 decode_coding_emacs_mule (struct coding_system *coding)
2334 {
2335   const unsigned char *src = coding->source + coding->consumed;
2336   const unsigned char *src_end = coding->source + coding->src_bytes;
2337   const unsigned char *src_base;
2338   int *charbuf = coding->charbuf + coding->charbuf_used;
2339   /* We may produce two annotations (charset and composition) in one
2340      loop and one more charset annotation at the end.  */
2341   int *charbuf_end
2342     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2343       /* We can produce up to 2 characters in a loop.  */
2344       - 1;
2345   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2346   bool multibytep = coding->src_multibyte;
2347   ptrdiff_t char_offset = coding->produced_char;
2348   ptrdiff_t last_offset = char_offset;
2349   int last_id = charset_ascii;
2350   bool eol_dos
2351     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2352   int byte_after_cr = -1;
2353   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2354
2355   if (cmp_status->state != COMPOSING_NO)
2356     {
2357       int i;
2358
2359       if (charbuf_end - charbuf < cmp_status->length)
2360         emacs_abort ();
2361       for (i = 0; i < cmp_status->length; i++)
2362         *charbuf++ = cmp_status->carryover[i];
2363       coding->annotated = 1;
2364     }
2365
2366   while (1)
2367     {
2368       int c, id IF_LINT (= 0);
2369
2370       src_base = src;
2371       consumed_chars_base = consumed_chars;
2372
2373       if (charbuf >= charbuf_end)
2374         {
2375           if (byte_after_cr >= 0)
2376             src_base--;
2377           break;
2378         }
2379
2380       if (byte_after_cr >= 0)
2381         c = byte_after_cr, byte_after_cr = -1;
2382       else
2383         ONE_MORE_BYTE (c);
2384
2385       if (c < 0 || c == 0x80)
2386         {
2387           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2388           if (c < 0)
2389             {
2390               *charbuf++ = -c;
2391               char_offset++;
2392             }
2393           else
2394             DECODE_EMACS_MULE_COMPOSITION_START ();
2395           continue;
2396         }
2397
2398       if (c < 0x80)
2399         {
2400           if (eol_dos && c == '\r')
2401             ONE_MORE_BYTE (byte_after_cr);
2402           id = charset_ascii;
2403           if (cmp_status->state != COMPOSING_NO)
2404             {
2405               if (cmp_status->old_form)
2406                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2407               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2408                 cmp_status->ncomps--;
2409             }
2410         }
2411       else
2412         {
2413           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2414           /* emacs_mule_char can load a charset map from a file, which
2415              allocates a large structure and might cause buffer text
2416              to be relocated as result.  Thus, we need to remember the
2417              original pointer to buffer text, and fix up all related
2418              pointers after the call.  */
2419           const unsigned char *orig = coding->source;
2420           ptrdiff_t offset;
2421
2422           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2423                                cmp_status);
2424           offset = coding->source - orig;
2425           if (offset)
2426             {
2427               src += offset;
2428               src_base += offset;
2429               src_end += offset;
2430             }
2431           if (c < 0)
2432             {
2433               if (c == -1)
2434                 goto invalid_code;
2435               if (c == -2)
2436                 break;
2437             }
2438           src = src_base + nbytes;
2439           consumed_chars = consumed_chars_base + nchars;
2440           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2441             cmp_status->ncomps -= nchars;
2442         }
2443
2444       /* Now if C >= 0, we found a normally encoded character, if C <
2445          0, we found an old-style composition component character or
2446          rule.  */
2447
2448       if (cmp_status->state == COMPOSING_NO)
2449         {
2450           if (last_id != id)
2451             {
2452               if (last_id != charset_ascii)
2453                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2454                                   last_id);
2455               last_id = id;
2456               last_offset = char_offset;
2457             }
2458           *charbuf++ = c;
2459           char_offset++;
2460         }
2461       else if (cmp_status->state == COMPOSING_CHAR)
2462         {
2463           if (cmp_status->old_form)
2464             {
2465               if (c >= 0)
2466                 {
2467                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2468                   *charbuf++ = c;
2469                   char_offset++;
2470                 }
2471               else
2472                 {
2473                   *charbuf++ = -c;
2474                   cmp_status->nchars++;
2475                   cmp_status->length++;
2476                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2477                     EMACS_MULE_COMPOSITION_END ();
2478                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2479                     cmp_status->state = COMPOSING_RULE;
2480                 }
2481             }
2482           else
2483             {
2484               *charbuf++ = c;
2485               cmp_status->length++;
2486               cmp_status->nchars--;
2487               if (cmp_status->nchars == 0)
2488                 EMACS_MULE_COMPOSITION_END ();
2489             }
2490         }
2491       else if (cmp_status->state == COMPOSING_RULE)
2492         {
2493           int rule;
2494
2495           if (c >= 0)
2496             {
2497               EMACS_MULE_COMPOSITION_END ();
2498               *charbuf++ = c;
2499               char_offset++;
2500             }
2501           else
2502             {
2503               c = -c;
2504               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2505               if (rule < 0)
2506                 goto invalid_code;
2507               *charbuf++ = -2;
2508               *charbuf++ = rule;
2509               cmp_status->length += 2;
2510               cmp_status->state = COMPOSING_CHAR;
2511             }
2512         }
2513       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2514         {
2515           *charbuf++ = c;
2516           cmp_status->length++;
2517           if (cmp_status->ncomps == 0)
2518             cmp_status->state = COMPOSING_CHAR;
2519           else if (cmp_status->ncomps > 0)
2520             {
2521               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2522                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2523             }
2524           else
2525             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2526         }
2527       else                      /* COMPOSING_COMPONENT_RULE */
2528         {
2529           int rule;
2530
2531           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2532           if (rule < 0)
2533             goto invalid_code;
2534           *charbuf++ = -2;
2535           *charbuf++ = rule;
2536           cmp_status->length += 2;
2537           cmp_status->ncomps--;
2538           if (cmp_status->ncomps > 0)
2539             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2540           else
2541             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2542         }
2543       continue;
2544
2545     invalid_code:
2546       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       src = src_base;
2548       consumed_chars = consumed_chars_base;
2549       ONE_MORE_BYTE (c);
2550       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2551       char_offset++;
2552     }
2553
2554  no_more_source:
2555   if (cmp_status->state != COMPOSING_NO)
2556     {
2557       if (coding->mode & CODING_MODE_LAST_BLOCK)
2558         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2559       else
2560         {
2561           int i;
2562
2563           charbuf -= cmp_status->length;
2564           for (i = 0; i < cmp_status->length; i++)
2565             cmp_status->carryover[i] = charbuf[i];
2566         }
2567     }
2568   if (last_id != charset_ascii)
2569     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2570   coding->consumed_char += consumed_chars_base;
2571   coding->consumed = src_base - coding->source;
2572   coding->charbuf_used = charbuf - coding->charbuf;
2573 }
2574
2575
/* Store in CODES (an array of at least two bytes) the leading code(s)
   for the charset whose emacs-mule id is ID.

   If ID is less than 0xA0, ID itself is the only leading code and
   CODES[1] is set to 0.  Otherwise CODES[0] is a prefix selected by
   the range ID falls in (0x9A for ID < 0xE0, 0x9B for ID < 0xF0, 0x9C
   for ID < 0xF5, 0x9D for the rest) and CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which keeps the macro usable as
   the body of an unbraced `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2589
2590
/* Encode the elements of CODING->charbuf (characters and annotations)
   in the emacs-mule format, writing the bytes starting at
   CODING->destination + CODING->produced.

   An ASCII character is emitted as one byte, and a raw 8-bit
   character as the byte CHAR_TO_BYTE8 gives for it.  Any other
   character is emitted as the one or two leading codes that
   EMACS_MULE_LEADING_CODES derives from the emacs-mule id of its
   charset, followed by one code byte (charset dimension 1) or two
   code bytes (otherwise), each with the high bit set.  A charset
   annotation found in the buffer sets the charset that is tried first
   for the characters that follow it.

   Before returning, record CODING_RESULT_SUCCESS in CODING, add the
   number of produced characters to CODING->produced_char and set
   CODING->produced.  Always return 0.  */
2591 static bool
2592 encode_coding_emacs_mule (struct coding_system *coding)
2593 {
       /* NOTE(review): multibytep is not referenced by name below;
          presumably the EMIT_ONE_BYTE family of macros reads it (and
          dst / produced_chars) -- confirm against their definitions.  */
2594   bool multibytep = coding->dst_multibyte;
2595   int *charbuf = coding->charbuf;
2596   int *charbuf_end = charbuf + coding->charbuf_used;
2597   unsigned char *dst = coding->destination + coding->produced;
2598   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2599   int safe_room = 8;
2600   ptrdiff_t produced_chars = 0;
2601   Lisp_Object attrs, charset_list;
2602   int c;
       /* Id of the charset named by the latest charset annotation, or -1
          if there is none (or it is not a member of CHARSET_LIST).  */
2603   int preferred_charset_id = -1;
2604
2605   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works with Vemacs_mule_charset_list; if
          ATTRS records a different list, replace it.  */
2606   if (! EQ (charset_list, Vemacs_mule_charset_list))
2607     {
2608       charset_list = Vemacs_mule_charset_list;
2609       ASET (attrs, coding_attr_charset_list, charset_list);
2610     }
2611
2612   while (charbuf < charbuf_end)
2613     {
           /* NOTE(review): presumably guarantees room in the destination
              for the bytes one iteration may emit -- confirm against
              ASSURE_DESTINATION.  */
2614       ASSURE_DESTINATION (safe_room);
2615       c = *charbuf++;
2616
           /* A negative element starts an annotation; -C is used below
              as the annotation's length in charbuf elements, counting
              the element just consumed.  */
2617       if (c < 0)
2618         {
2619           /* Handle an annotation.  */
2620           switch (*charbuf)
2621             {
2622             case CODING_ANNOTATE_COMPOSITION_MASK:
2623               /* Not yet implemented.  */
2624               break;
2625             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): the charset id is read three elements
                      past the mask element; verify this offset against
                      the layout written by ADD_CHARSET_DATA.  */
2626               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in
                      CHARSET_LIST.  */
2627               if (preferred_charset_id >= 0
2628                   && NILP (Fmemq (make_number (preferred_charset_id),
2629                                   charset_list)))
2630                 preferred_charset_id = -1;
2631               break;
2632             default:
2633               emacs_abort ();
2634             }
               /* Skip the remaining elements of the annotation.  */
2635           charbuf += -c - 1;
2636           continue;
2637         }
2638
2639       if (ASCII_CHAR_P (c))
2640         EMIT_ONE_ASCII_BYTE (c);
2641       else if (CHAR_BYTE8_P (c))
2642         {
2643           c = CHAR_TO_BYTE8 (c);
2644           EMIT_ONE_BYTE (c);
2645         }
2646       else
2647         {
2648           struct charset *charset;
2649           unsigned code;
2650           int dimension;
2651           int emacs_mule_id;
2652           unsigned char leading_codes[2];
2653
               /* Try the preferred charset first; fall back to a search
                  through CHARSET_LIST when it does not contain C.  */
2654           if (preferred_charset_id >= 0)
2655             {
2656               bool result;
2657
2658               charset = CHARSET_FROM_ID (preferred_charset_id);
2659               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2660               if (result)
2661                 code = ENCODE_CHAR (charset, c);
2662               else
2663                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2664                                      &code, charset);
2665             }
2666           else
2667             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2668                                  &code, charset);
               /* No charset was found for C: encode the coding system's
                  default character in its place.  */
2669           if (! charset)
2670             {
2671               c = coding->default_char;
2672               if (ASCII_CHAR_P (c))
2673                 {
2674                   EMIT_ONE_ASCII_BYTE (c);
2675                   continue;
2676                 }
2677               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2678                                    &code, charset);
2679             }
2680           dimension = CHARSET_DIMENSION (charset);
2681           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2682           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2683           EMIT_ONE_BYTE (leading_codes[0]);
               /* The second leading code is 0 when only one is needed.  */
2684           if (leading_codes[1])
2685             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code point with the high bit of each byte set.  */
2686           if (dimension == 1)
2687             EMIT_ONE_BYTE (code | 0x80);
2688           else
2689             {
2690               code |= 0x8080;
2691               EMIT_ONE_BYTE (code >> 8);
2692               EMIT_ONE_BYTE (code & 0xFF);
2693             }
2694         }
2695     }
2696   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2697   coding->produced_char += produced_chars;
2698   coding->produced = dst - coding->destination;
2699   return 0;
2700 }
2701
2702 \f
2703 /*** 7. ISO2022 handlers ***/
2704
2705 /* The following note describes the coding system ISO2022 briefly.
2706    Since the intention of this note is to help understand the
2707    functions in this file, some parts are NOT ACCURATE or are OVERLY
2708    SIMPLIFIED.  For thorough understanding, please refer to the
2709    original document of ISO2022.  This is equivalent to the standard
2710    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2711
2712    ISO2022 provides many mechanisms to encode several character sets
2713    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2714    is encoded using bytes less than 128.  This may make the encoded
2715    text a little bit longer, but the text passes more easily through
2716    several types of gateway, some of which strip off the MSB (Most
2717    Significant Bit).
2718
2719    There are two kinds of character sets: control character sets and
2720    graphic character sets.  The former contain control characters such
2721    as `newline' and `escape' to provide control functions (control
2722    functions are also provided by escape sequences).  The latter
2723    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2724    two control character sets and many graphic character sets.
2725
2726    Graphic character sets are classified into one of the following
2727    four classes, according to the number of bytes (DIMENSION) and
2728    number of characters in one dimension (CHARS) of the set:
2729    - DIMENSION1_CHARS94
2730    - DIMENSION1_CHARS96
2731    - DIMENSION2_CHARS94
2732    - DIMENSION2_CHARS96
2733
2734    In addition, each character set is assigned an identification tag,
2735    unique for each set, called the "final character" (denoted as <F>
2736    hereafter).  The <F> of each character set is decided by ECMA(*)
2737    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2738    (0x30..0x3F are for private use only).
2739
2740    Note (*): ECMA = European Computer Manufacturers Association
2741
2742    Here are examples of graphic character sets [NAME(<F>)]:
2743         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2744         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2745         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2746         o DIMENSION2_CHARS96 -- none for the moment
2747
2748    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2749         C0 [0x00..0x1F] -- control character plane 0
2750         GL [0x20..0x7F] -- graphic character plane 0
2751         C1 [0x80..0x9F] -- control character plane 1
2752         GR [0xA0..0xFF] -- graphic character plane 1
2753
2754    A control character set is directly designated and invoked to C0 or
2755    C1 by an escape sequence.  The most common case is that:
2756    - ISO646's  control character set is designated/invoked to C0, and
2757    - ISO6429's control character set is designated/invoked to C1,
2758    and usually these designations/invocations are omitted in encoded
2759    text.  In a 7-bit environment, only C0 can be used, and a control
2760    character for C1 is encoded by an appropriate escape sequence to
2761    fit into the environment.  All control characters for C1 are
2762    defined to have corresponding escape sequences.
2763
2764    A graphic character set is at first designated to one of four
2765    graphic registers (G0 through G3), then these graphic registers are
2766    invoked to GL or GR.  These designations and invocations can be
2767    done independently.  The most common case is that G0 is invoked to
2768    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2769    these invocations and designations are omitted in encoded text.
2770    In a 7-bit environment, only GL can be used.
2771
2772    When a graphic character set of CHARS94 is invoked to GL, codes
2773    0x20 and 0x7F of the GL area work as control characters SPACE and
2774    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2775    be used.
2776
2777    There are two ways of invocation: locking-shift and single-shift.
2778    With locking-shift, the invocation lasts until the next different
2779    invocation, whereas with single-shift, the invocation affects the
2780    following character only and doesn't affect the locking-shift
2781    state.  Invocations are done by the following control characters or
2782    escape sequences:
2783
2784    ----------------------------------------------------------------------
2785    abbrev  function                  cntrl escape seq   description
2786    ----------------------------------------------------------------------
2787    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2788    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2789    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2790    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2791    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2792    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2793    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2794    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2795    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2796    ----------------------------------------------------------------------
2797    (*) These are not used by any known coding system.
2798
2799    Control characters for these functions are defined by macros
2800    ISO_CODE_XXX in `coding.h'.
2801
2802    Designations are done by the following escape sequences:
2803    ----------------------------------------------------------------------
2804    escape sequence      description
2805    ----------------------------------------------------------------------
2806    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2807    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2808    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2809    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2810    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2811    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2812    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2813    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2814    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2815    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2816    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2817    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2818    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2819    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2820    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2821    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2822    ----------------------------------------------------------------------
2823
2824    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2825    of dimension 1, chars 94, and final character <F>, etc...
2826
2827    Note (*): Although these designations are not allowed in ISO2022,
2828    Emacs accepts them on decoding, and produces them on encoding
2829    CHARS96 character sets in a coding system which is characterized as
2830    7-bit environment, non-locking-shift, and non-single-shift.
2831
2832    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2833    '(' must be omitted.  We refer to this as "short-form" hereafter.
2834
2835    Now you may notice that there are a lot of ways of encoding the
2836    same multilingual text in ISO2022.  Actually, there exist many
2837    coding systems such as Compound Text (used in X11's inter-client
2838    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2839    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2840    localized platforms), and all of these are variants of ISO2022.
2841
2842    In addition to the above, Emacs handles two more kinds of escape
2843    sequences: ISO6429's direction specification and Emacs' private
2844    sequence for specifying character composition.
2845
2846    ISO6429's direction specification takes the following form:
2847         o CSI ']'      -- end of the current direction
2848         o CSI '0' ']'  -- end of the current direction
2849         o CSI '1' ']'  -- start of left-to-right text
2850         o CSI '2' ']'  -- start of right-to-left text
2851    The control character CSI (0x9B: control sequence introducer) is
2852    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2853
2854    Character composition specification takes the following form:
2855         o ESC '0' -- start relative composition
2856         o ESC '1' -- end composition
2857         o ESC '2' -- start rule-base composition (*)
2858         o ESC '3' -- start relative composition with alternate chars  (**)
2859         o ESC '4' -- start rule-base composition with alternate chars  (**)
2860   Since these are not standard escape sequences of any ISO standard,
2861   the use of them with these meanings is restricted to Emacs only.
2862
2863   (*) This form is used only in Emacs 20.7 and older versions,
2864   but newer versions can safely decode it.
2865   (**) This form is used only in Emacs 21.1 and newer versions,
2866   and older versions can't decode it.
2867
2868   Here's a list of example usages of these composition escape
2869   sequences (categorized by `enum composition_method').
2870
2871   COMPOSITION_RELATIVE:
2872         ESC 0 CHAR [ CHAR ] ESC 1
2873   COMPOSITION_WITH_RULE:
2874         ESC 2 CHAR [ RULE CHAR ] ESC 1
2875   COMPOSITION_WITH_ALTCHARS:
2876         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2877   COMPOSITION_WITH_RULE_ALTCHARS:
2878         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2879
/* 256-entry table of `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by a byte value to classify that
   byte for the ISO-2022 handlers; the table is not referenced in the
   nearby code, so confirm against the code that fills and reads it.  */
2880 static enum iso_code_class_type iso_code_class[256];
2881
/* Return true if the charset whose id is ID is safe for CODING, i.e.
   ID is within 0..CODING->max_charset_id and the byte for ID in
   CODING->safe_charsets is not 255 (the value that marks a charset
   as unsupported).

   A negative ID is never safe.  It must be rejected before the table
   lookup, because callers may pass the result of a charset lookup
   that yielded a negative value, and indexing safe_charsets with it
   would read outside the table.

   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2885
2886 static void
2887 setup_iso_safe_charsets (Lisp_Object attrs)
2888 {
2889   Lisp_Object charset_list, safe_charsets;
2890   Lisp_Object request;
2891   Lisp_Object reg_usage;
2892   Lisp_Object tail;
2893   EMACS_INT reg94, reg96;
2894   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2895   int max_charset_id;
2896
2897   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2898   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2899       && ! EQ (charset_list, Viso_2022_charset_list))
2900     {
2901       charset_list = Viso_2022_charset_list;
2902       ASET (attrs, coding_attr_charset_list, charset_list);
2903       ASET (attrs, coding_attr_safe_charsets, Qnil);
2904     }
2905
2906   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2907     return;
2908
2909   max_charset_id = 0;
2910   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2911     {
2912       int id = XINT (XCAR (tail));
2913       if (max_charset_id < id)
2914         max_charset_id = id;
2915     }
2916
2917   safe_charsets = make_uninit_string (max_charset_id + 1);
2918   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2919   request = AREF (attrs, coding_attr_iso_request);
2920   reg_usage = AREF (attrs, coding_attr_iso_usage);
2921   reg94 = XINT (XCAR (reg_usage));
2922   reg96 = XINT (XCDR (reg_usage));
2923
2924   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2925     {
2926       Lisp_Object id;
2927       Lisp_Object reg;
2928       struct charset *charset;
2929
2930       id = XCAR (tail);
2931       charset = CHARSET_FROM_ID (XINT (id));
2932       reg = Fcdr (Fassq (id, request));
2933       if (! NILP (reg))
2934         SSET (safe_charsets, XINT (id), XINT (reg));
2935       else if (charset->iso_chars_96)
2936         {
2937           if (reg96 < 4)
2938             SSET (safe_charsets, XINT (id), reg96);
2939         }
2940       else
2941         {
2942           if (reg94 < 4)
2943             SSET (safe_charsets, XINT (id), reg94);
2944         }
2945     }
2946   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2947 }
2948
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951    Return true if a text is encoded in one of ISO-2022 based coding
2952    systems.  */
/* Scan the bytes of CODING->source, starting after the first
   CODING->head_ascii bytes, and track in two bit masks which ISO-2022
   coding categories the escape sequences, shift codes and 8-bit bytes
   seen so far support (`found') or rule out (`rejected').

   If every ISO category gets rejected, add CATEGORY_MASK_ISO to
   DETECT_INFO->rejected and return 0.  Otherwise, when control
   reaches the `no_more_source' label (NOTE(review): presumably jumped
   to by ONE_MORE_BYTE at the end of the source -- confirm), merge
   `rejected' into DETECT_INFO->rejected and the non-rejected part of
   `found' into DETECT_INFO->found, and return 1.  */
2953
2954 static bool
2955 detect_coding_iso_2022 (struct coding_system *coding,
2956                         struct coding_detection_info *detect_info)
2957 {
2958   const unsigned char *src = coding->source, *src_base = src;
2959   const unsigned char *src_end = coding->source + coding->src_bytes;
2960   bool multibytep = coding->src_multibyte;
       /* True while the previous code was a single shift.  */
2961   bool single_shifting = 0;
2962   int id;
2963   int c, c1;
2964   ptrdiff_t consumed_chars = 0;
2965   int i;
       /* Masks of the categories ruled out / supported so far.  */
2966   int rejected = 0;
2967   int found = 0;
       /* Number of characters counted since a composition-start
          sequence, or -1 when not inside a composition.  */
2968   int composition_count = -1;
2969
2970   detect_info->checked |= CATEGORY_MASK_ISO;
2971
       /* For each ISO-2022 category whose coding system id is not
          negative, refresh the cached pointer to and size of its
          safe-charsets table.  */
2972   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2973     {
2974       struct coding_system *this = &(coding_categories[i]);
2975       Lisp_Object attrs, val;
2976
2977       if (this->id < 0)
2978         continue;
2979       attrs = CODING_ID_ATTRS (this->id);
2980       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2981           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2982         setup_iso_safe_charsets (attrs);
2983       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2984       this->max_charset_id = SCHARS (val) - 1;
2985       this->safe_charsets = SDATA (val);
2986     }
2987
2988   /* A coding system of this category is always ASCII compatible.  */
2989   src += coding->head_ascii;
2990
       /* Keep scanning until every ISO category has been rejected.  */
2991   while (rejected != CATEGORY_MASK_ISO)
2992     {
2993       src_base = src;
2994       ONE_MORE_BYTE (c);
2995       switch (c)
2996         {
2997         case ISO_CODE_ESC:
2998           if (inhibit_iso_escape_detection)
2999             break;
3000           single_shifting = 0;
3001           ONE_MORE_BYTE (c);
3002           if (c == 'N' || c == 'O')
3003             {
3004               /* ESC <Fe> for SS2 or SS3.  */
3005               single_shifting = 1;
3006               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3007             }
3008           else if (c == '1')
3009             {
3010               /* End of composition.  */
3011               if (composition_count < 0
3012                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3013                 /* Invalid */
3014                 break;
3015               composition_count = -1;
3016               found |= CATEGORY_MASK_ISO;
3017             }
3018           else if (c >= '0' && c <= '4')
3019             {
3020               /* ESC <Fp> for start/end composition.  */
3021               composition_count = 0;
3022             }
3023           else
3024             {
3025               if (c >= '(' && c <= '/')
3026                 {
3027                   /* Designation sequence for a charset of dimension 1.  */
3028                   ONE_MORE_BYTE (c1);
3029                   if (c1 < ' ' || c1 >= 0x80
3030                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3031                     {
3032                       /* Invalid designation sequence.  Just ignore.  */
3033                       if (c1 >= 0x80)
3034                         rejected |= (CATEGORY_MASK_ISO_7BIT
3035                                      | CATEGORY_MASK_ISO_7_ELSE);
3036                       break;
3037                     }
3038                 }
3039               else if (c == '$')
3040                 {
3041                   /* Designation sequence for a charset of dimension 2.  */
3042                   ONE_MORE_BYTE (c);
3043                   if (c >= '@' && c <= 'B')
3044                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other designation
                            branches, the looked-up id is not checked
                            for a negative value before it is passed to
                            SAFE_CHARSET_P below.  */
3045                     id = iso_charset_table[1][0][c];
3046                   else if (c >= '(' && c <= '/')
3047                     {
3048                       ONE_MORE_BYTE (c1);
3049                       if (c1 < ' ' || c1 >= 0x80
3050                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3051                         {
3052                           /* Invalid designation sequence.  Just ignore.  */
3053                           if (c1 >= 0x80)
3054                             rejected |= (CATEGORY_MASK_ISO_7BIT
3055                                          | CATEGORY_MASK_ISO_7_ELSE);
3056                           break;
3057                         }
3058                     }
3059                   else
3060                     {
3061                       /* Invalid designation sequence.  Just ignore it.  */
3062                       if (c >= 0x80)
3063                         rejected |= (CATEGORY_MASK_ISO_7BIT
3064                                      | CATEGORY_MASK_ISO_7_ELSE);
3065                       break;
3066                     }
3067                 }
3068               else
3069                 {
3070                   /* Invalid escape sequence.  Just ignore it.  */
3071                   if (c >= 0x80)
3072                     rejected |= (CATEGORY_MASK_ISO_7BIT
3073                                  | CATEGORY_MASK_ISO_7_ELSE);
3074                   break;
3075                 }
3076
3077               /* We found a valid designation sequence for CHARSET.  */
                   /* Each 7-bit category and ISO_8_ELSE is marked found
                      or rejected according to whether the designated
                      charset is safe for it.  */
3078               rejected |= CATEGORY_MASK_ISO_8BIT;
3079               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3080                                   id))
3081                 found |= CATEGORY_MASK_ISO_7;
3082               else
3083                 rejected |= CATEGORY_MASK_ISO_7;
3084               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3085                                   id))
3086                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3087               else
3088                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3089               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3090                                   id))
3091                 found |= CATEGORY_MASK_ISO_7_ELSE;
3092               else
3093                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3094               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3095                                   id))
3096                 found |= CATEGORY_MASK_ISO_8_ELSE;
3097               else
3098                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3099             }
3100           break;
3101
3102         case ISO_CODE_SO:
3103         case ISO_CODE_SI:
3104           /* Locking shift out/in.  */
3105           if (inhibit_iso_escape_detection)
3106             break;
3107           single_shifting = 0;
3108           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3109           break;
3110
3111         case ISO_CODE_CSI:
3112           /* Control sequence introducer.  */
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3115           found |= CATEGORY_MASK_ISO_8_ELSE;
3116           goto check_extra_latin;
3117
3118         case ISO_CODE_SS2:
3119         case ISO_CODE_SS3:
3120           /* Single shift.   */
3121           if (inhibit_iso_escape_detection)
3122             break;
3123           single_shifting = 0;
3124           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
               /* The byte counts as a single shift only for the 8-bit
                  categories whose coding system enables single shift.  */
3125           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3126               & CODING_ISO_FLAG_SINGLE_SHIFT)
3127             {
3128               found |= CATEGORY_MASK_ISO_8_1;
3129               single_shifting = 1;
3130             }
3131           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3132               & CODING_ISO_FLAG_SINGLE_SHIFT)
3133             {
3134               found |= CATEGORY_MASK_ISO_8_2;
3135               single_shifting = 1;
3136             }
3137           if (single_shifting)
3138             break;
               /* Otherwise treat the byte as a candidate Latin extra
                  code.  */
3139           goto check_extra_latin;
3140
3141         default:
               /* NOTE(review): a negative C is presumably a value that
                  ONE_MORE_BYTE returns for something other than a plain
                  byte; it is skipped here -- confirm.  */
3142           if (c < 0)
3143             continue;
3144           if (c < 0x80)
3145             {
3146               if (composition_count >= 0)
3147                 composition_count++;
3148               single_shifting = 0;
3149               break;
3150             }
3151           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3152           if (c >= 0xA0)
3153             {
3154               found |= CATEGORY_MASK_ISO_8_1;
3155               /* Check the length of succeeding codes of the range
3156                  0xA0..0xFF.  If the byte length is even, we include
3157                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3158                  only when we are not single shifting.  */
3159               if (! single_shifting
3160                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3161                 {
3162                   ptrdiff_t len = 1;
3163                   while (src < src_end)
3164                     {
3165                       src_base = src;
3166                       ONE_MORE_BYTE (c);
3167                       if (c < 0xA0)
3168                         {
                               /* Not part of the run: put it back.  */
3169                           src = src_base;
3170                           break;
3171                         }
3172                       len++;
3173                     }
3174
                       /* An odd run length rules out ISO_8_2 only when
                          the run ended before the end of the source.  */
3175                   if (len & 1 && src < src_end)
3176                     {
3177                       rejected |= CATEGORY_MASK_ISO_8_2;
3178                       if (composition_count >= 0)
3179                         composition_count += len;
3180                     }
3181                   else
3182                     {
3183                       found |= CATEGORY_MASK_ISO_8_2;
3184                       if (composition_count >= 0)
3185                         composition_count += len / 2;
3186                     }
3187                 }
3188               break;
3189             }
               /* Reached for a byte in 0x80..0x9F, and by the gotos in
                  the CSI and SS2/SS3 cases above: the byte is acceptable
                  only if Vlatin_extra_code_table has a non-nil entry for
                  it; otherwise all ISO categories are rejected.  */
3190         check_extra_latin:
3191           if (! VECTORP (Vlatin_extra_code_table)
3192               || NILP (AREF (Vlatin_extra_code_table, c)))
3193             {
3194               rejected = CATEGORY_MASK_ISO;
3195               break;
3196             }
3197           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198               & CODING_ISO_FLAG_LATIN_EXTRA)
3199             found |= CATEGORY_MASK_ISO_8_1;
3200           else
3201             rejected |= CATEGORY_MASK_ISO_8_1;
3202           rejected |= CATEGORY_MASK_ISO_8_2;
3203           break;
3204         }
3205     }
       /* The loop ended because every ISO category was rejected.  */
3206   detect_info->rejected |= CATEGORY_MASK_ISO;
3207   return 0;
3208
3209  no_more_source:
       /* Report what was learned; a category counts as found only if it
          was not also rejected.  */
3210   detect_info->rejected |= rejected;
3211   detect_info->found |= (found & ~rejected);
3212   return 1;
3213 }
3214
3215
3216 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3217    escape sequence should be kept.

        REG selects which CODING_ISO_DESIGNATION slot of `coding' (a
        variable of the enclosing function) is updated; DIM, CHARS_96 and
        FINAL are the keys given to ISO_CHARSET_TABLE to find the charset
        ID.  CHARS_96 is assigned to, so it must be an lvalue, and REG
        and FINAL are evaluated more than once.

        If FINAL is outside the range '0'..127, or ISO_CHARSET_TABLE
        yields a negative ID, or the ID fails SAFE_CHARSET_P, the slot is
        set to -2, CHARS_96 is set to -1, and the rest of the macro is
        skipped.  Otherwise the ID is stored in the slot, except that
        charset_jisx0201_roman is replaced by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3218 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3219   do {                                                                  \
3220     int id, prev;                                                       \
3221                                                                         \
3222     if (final < '0' || final >= 128                                     \
3223         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3224         || !SAFE_CHARSET_P (coding, id))                                \
3225       {                                                                 \
3226         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3227         chars_96 = -1;                                                  \
3228         break;                                                          \
3229       }                                                                 \
3230     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3231     if (id == charset_jisx0201_roman)                                   \
3232       {                                                                 \
3233         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3234           id = charset_ascii;                                           \
3235       }                                                                 \
3236     else if (id == charset_jisx0208_1978)                               \
3237       {                                                                 \
3238         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3239           id = charset_jisx0208;                                        \
3240       }                                                                 \
3241     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3242     /* If there was an invalid designation to REG previously, and this  \
3243        designation is ASCII to REG, we should keep this designation     \
3244        sequence.  */                                                    \
3245     if (prev == -2 && id == charset_ascii)                              \
3246       chars_96 = -1;                                                    \
3247   } while (0)
3248
3249
3250 /* Handle these composition sequences (ALT: alternate char):
3251
3252    (1) relative composition: ESC 0 CHAR ... ESC 1
3253    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257    When the start sequence (ESC 0/2/3/4) is found, this annotation
3258    header is produced.
3259
3260         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263    produced until the end sequence (ESC 1) is found:
3264
3265    (1) CHAR ... CHAR
3266    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
3270    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3271    annotation header are updated as below:
3272
3273    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3276    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3277
3278    If an error is found while composing, the annotation header is
3279    changed to:
3280
3281         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283    and the sequence [ -2 DECODED-RULE ] is changed to the original
3284    byte sequence as below:
3285         o the original byte sequence is B: [ B -1 ]
3286         o the original byte sequence is B1 B2: [ B1 B2 ]
3287    and the sequence [ -1 -1 ] is changed to the original byte
3288    sequence:
3289         [ ESC '0' ]
3290 */
3291
3292 /* Decode a composition rule C1 and maybe one more byte from the
3293    source, and set RULE to the encoded composition rule.  If the rule
3294    is invalid, goto invalid_code.

        C1 is not a parameter: it is the variable `c1' of the enclosing
        function, which must also provide the label invalid_code.  RULE
        is assigned to, so it must be an int lvalue.

        When C1 - 32 is below 81 the rule is in the one-byte old format
        and holds GREF * 9 + NREF; a reference value of 4 is replaced by
        10 before the pair is passed to COMPOSITION_ENCODE_RULE.
        Otherwise the rule is in the two-byte new format: GREF is
        C1 - 32 - 81 and NREF is the next source byte minus 32, the pair
        is checked with COMPOSITION_ENCODE_RULE_VALID, and 0x100 is added
        to the encoded value to mark it as new format.  */
3295
3296 #define DECODE_COMPOSITION_RULE(rule)                                   \
3297   do {                                                                  \
3298     rule = c1 - 32;                                                     \
3299     if (rule < 0)                                                       \
3300       goto invalid_code;                                                \
3301     if (rule < 81)              /* old format (before ver.21) */        \
3302       {                                                                 \
3303         int gref = (rule) / 9;                                          \
3304         int nref = (rule) % 9;                                          \
3305         if (gref == 4) gref = 10;                                       \
3306         if (nref == 4) nref = 10;                                       \
3307         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3308       }                                                                 \
3309     else                        /* new format (after ver.21) */         \
3310       {                                                                 \
3311         int b;                                                          \
3312                                                                         \
3313         ONE_MORE_BYTE (b);                                              \
3314         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3315           goto invalid_code;                                            \
3316         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3317         rule += 0x100;   /* Distinguish it from the old format.  */     \
3318       }                                                                 \
3319   } while (0)
3320
/* Write back into charbuf[idx] and charbuf[idx + 1] the source bytes
   of the composition rule RULE (a value produced by
   DECODE_COMPOSITION_RULE), and add the number of bytes written back
   to new_chars.  An old-format rule (RULE < 0x100) becomes the single
   byte 32 + GREF * 9 + NREF, with a reference value of 10 turned back
   into 4, followed by a -1 filler; a new-format rule becomes the two
   bytes 32 + 81 + GREF and 32 + NREF.

   `charbuf', `idx' and `new_chars' are variables of the enclosing
   function, and RULE is evaluated several times.
   NOTE(review): the divisor 12 is presumably the packing used by
   COMPOSITION_ENCODE_RULE; confirm against its definition.  */
3321 #define ENCODE_COMPOSITION_RULE(rule)                           \
3322   do {                                                          \
3323     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3324                                                                 \
3325     if (rule < 0x100)           /* old format */                \
3326       {                                                         \
3327         if (gref == 10) gref = 4;                               \
3328         if (nref == 10) nref = 4;                               \
3329         charbuf[idx] = 32 + gref * 9 + nref;                    \
3330         charbuf[idx + 1] = -1;                                  \
3331         new_chars++;                                            \
3332       }                                                         \
3333     else                                /* new format */        \
3334       {                                                         \
3335         charbuf[idx] = 32 + 81 + gref;                          \
3336         charbuf[idx + 1] = 32 + nref;                           \
3337         new_chars += 2;                                         \
3338       }                                                         \
3339   } while (0)
3340
3341 /* Finish the current composition as invalid.

        The data stored so far for the composition occupies
        CHARBUF[-CMP_STATUS->length] through CHARBUF[-1], beginning with
        the annotation header.  The five header elements are overwritten
        with the start sequence ESC '0'/'2'/'3'/'4' followed by -2, 0
        and -1.  Then, for the methods ordered at or after
        COMPOSITION_WITH_RULE, each [ -2 RULE ] pair is turned back into
        its source bytes by ENCODE_COMPOSITION_RULE and each [ -1 -1 ]
        pair into [ ESC '0' ].

        Set CMP_STATUS->state to COMPOSING_NO, and return
        CMP_STATUS->nchars plus the number of bytes written back by those
        replacements.

        The local names `charbuf', `idx' and `new_chars' are referenced
        by ENCODE_COMPOSITION_RULE and must not be renamed.  */
3342
3343 static int
3344 finish_composition (int *charbuf, struct composition_status *cmp_status)
3345 {
3346   int idx = - cmp_status->length;
3347   int new_chars;
3348
3349   /* Recover the original ESC sequence */
3350   charbuf[idx++] = ISO_CODE_ESC;
3351   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3352                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3353                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3354                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3355                     : '4');
       /* Overwrite the three remaining header elements.  */
3356   charbuf[idx++] = -2;
3357   charbuf[idx++] = 0;
3358   charbuf[idx++] = -1;
3359   new_chars = cmp_status->nchars;
3360   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3361     for (; idx < 0; idx++)
3362       {
3363         int elt = charbuf[idx];
3364
3365         if (elt == -2)
3366           {
                 /* [ -2 RULE ]: put back the source bytes of the rule.  The
                    macro rewrites both elements, so skip the second one.  */
3367             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3368             idx++;
3369           }
3370         else if (elt == -1)
3371           {
                 /* [ -1 -1 ]: put back the ESC 0 that was replaced by this
                    marker.  */
3372             charbuf[idx++] = ISO_CODE_ESC;
3373             charbuf[idx] = '0';
3374             new_chars += 2;
3375           }
3376       }
3377   cmp_status->state = COMPOSING_NO;
3378   return new_chars;
3379 }
3380
3381 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition and add its return value to char_offset.
        Uses `cmp_status', `charbuf' and `char_offset' of the enclosing
        function.  */
3382 #define MAYBE_FINISH_COMPOSITION()                              \
3383   do {                                                          \
3384     if (cmp_status->state != COMPOSING_NO)                      \
3385       char_offset += finish_composition (charbuf, cmp_status);  \
3386   } while (0)
3387
3388 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3389
3390    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3391    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3392    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3393    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3394
3395    Produce this annotation sequence now:
3396
3397    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        NOTE(review): the five-element layout above is inferred from the
        call ADD_COMPOSITION_DATA (charbuf, 0, 0, method) and from
        finish_composition, which rewrites five header elements; confirm
        it against the definition of ADD_COMPOSITION_DATA.

        C1 is the byte following ESC.  An ESC 0 seen while the first part
        of an altchar composition (state COMPOSING_COMPONENT_CHAR) or of
        an alt&rule composition (state COMPOSING_COMPONENT_RULE) is being
        read does not start a new composition: it stores the marker
        [ -1 -1 ], adds 2 to cmp_status->length and switches the state to
        COMPOSING_CHAR.  In any other case a composition still in
        progress is first finished as invalid, then the method and the
        state are set from C1, the header is stored, cmp_status->length
        is reset to MAX_ANNOTATION_LENGTH, the char and component
        counters are cleared, and coding->annotated is set.

        Uses `cmp_status', `charbuf', `char_offset' and `coding' of the
        enclosing function; C1 is evaluated several times.
3398 */
3399
3400 #define DECODE_COMPOSITION_START(c1)                                       \
3401   do {                                                                     \
3402     if (c1 == '0'                                                          \
3403         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3404              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3405             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3406                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3407       {                                                                    \
3408         *charbuf++ = -1;                                                   \
3409         *charbuf++= -1;                                                    \
3410         cmp_status->state = COMPOSING_CHAR;                                \
3411         cmp_status->length += 2;                                           \
3412       }                                                                    \
3413     else                                                                   \
3414       {                                                                    \
3415         MAYBE_FINISH_COMPOSITION ();                                       \
3416         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3417                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3418                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3419                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3420         cmp_status->state                                                  \
3421           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3422         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3423         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3424         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3425         coding->annotated = 1;                                             \
3426       }                                                                    \
3427   } while (0)
3428
3429
3430 /* Handle composition end sequence ESC 1.

        The sequence is invalid when no CHAR has been stored
        (cmp_status->nchars == 0), or when the state does not fit the
        method: for COMPOSITION_WITH_RULE the state must not be
        COMPOSING_CHAR, and for every other method it must be
        COMPOSING_CHAR.  In that case the composition is finished as
        invalid and control goes to invalid_code.

        Otherwise the annotation header, which starts at
        charbuf[-cmp_status->length], is completed: its first element
        (the negated LENGTH) is decreased by the number of components + 2
        for COMPOSITION_WITH_ALTCHARS, or by three times the number of
        components for COMPOSITION_WITH_RULE_ALTCHARS, and its third
        element is set to the number of composed chars.  That number is
        also added to char_offset, and the state becomes COMPOSING_NO.

        Uses `cmp_status', `charbuf' and `char_offset' of the enclosing
        function, which must also provide the label invalid_code.  */
3431
3432 #define DECODE_COMPOSITION_END()                                        \
3433   do {                                                                  \
3434     if (cmp_status->nchars == 0                                         \
3435         || ((cmp_status->state == COMPOSING_CHAR)                       \
3436             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3437       {                                                                 \
3438         MAYBE_FINISH_COMPOSITION ();                                    \
3439         goto invalid_code;                                              \
3440       }                                                                 \
3441     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3442       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3443     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3444       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3445     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3446     char_offset += cmp_status->nchars;                                  \
3447     cmp_status->state = COMPOSING_NO;                                   \
3448   } while (0)
3449
3450 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], cmp_status->length
        grows by 2, and cmp_status->state is decremented.  Uses `charbuf'
        and `cmp_status' of the enclosing function.
        NOTE(review): the decrement presumably steps from a RULE state
        back to the matching CHAR state (COMPOSING_RULE ->
        COMPOSING_CHAR, COMPOSING_COMPONENT_RULE ->
        COMPOSING_COMPONENT_CHAR); that depends on the order of the
        enum, which should be confirmed.  */
3451
3452 #define STORE_COMPOSITION_RULE(rule)    \
3453   do {                                  \
3454     *charbuf++ = -2;                    \
3455     *charbuf++ = rule;                  \
3456     cmp_status->length += 2;            \
3457     cmp_status->state--;                \
3458   } while (0)
3459
3460 /* Store a composed char or a component char C in charbuf, and update
3461    cmp_status.

        cmp_status->length grows by 1.  C is counted as a composed char
        (cmp_status->nchars) when the state is COMPOSING_CHAR, and as a
        component (cmp_status->ncomps) otherwise.  The state is then
        incremented for COMPOSITION_WITH_RULE in any state, and for
        COMPOSITION_WITH_RULE_ALTCHARS only in the state
        COMPOSING_COMPONENT_CHAR.  Uses `charbuf' and `cmp_status' of the
        enclosing function.
        NOTE(review): the increment presumably selects the RULE state
        that matches the current CHAR state, so that a rule is read
        next; confirm against the order of the enum.  */
3462
3463 #define STORE_COMPOSITION_CHAR(c)                                       \
3464   do {                                                                  \
3465     *charbuf++ = (c);                                                   \
3466     cmp_status->length++;                                               \
3467     if (cmp_status->state == COMPOSING_CHAR)                            \
3468       cmp_status->nchars++;                                             \
3469     else                                                                \
3470       cmp_status->ncomps++;                                             \
3471     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3472         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3473             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3474       cmp_status->state++;                                              \
3475   } while (0)
3476
3477
3478 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3479
3480 static void
3481 decode_coding_iso_2022 (struct coding_system *coding)
3482 {
3483   const unsigned char *src = coding->source + coding->consumed;
3484   const unsigned char *src_end = coding->source + coding->src_bytes;
3485   const unsigned char *src_base;
3486   int *charbuf = coding->charbuf + coding->charbuf_used;
3487   /* We may produce two annotations (charset and composition) in one
3488      loop and one more charset annotation at the end.  */
3489   int *charbuf_end
3490     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3491   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3492   bool multibytep = coding->src_multibyte;
3493   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3494   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3495   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3496   int charset_id_2, charset_id_3;
3497   struct charset *charset;
3498   int c;
3499   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3500   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3501   ptrdiff_t char_offset = coding->produced_char;
3502   ptrdiff_t last_offset = char_offset;
3503   int last_id = charset_ascii;
3504   bool eol_dos
3505     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3506   int byte_after_cr = -1;
3507   int i;
3508
3509   setup_iso_safe_charsets (attrs);
3510   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3511
3512   if (cmp_status->state != COMPOSING_NO)
3513     {
3514       if (charbuf_end - charbuf < cmp_status->length)
3515         emacs_abort ();
3516       for (i = 0; i < cmp_status->length; i++)
3517         *charbuf++ = cmp_status->carryover[i];
3518       coding->annotated = 1;
3519     }
3520
3521   while (1)
3522     {
3523       int c1, c2, c3;
3524
3525       src_base = src;
3526       consumed_chars_base = consumed_chars;
3527
3528       if (charbuf >= charbuf_end)
3529         {
3530           if (byte_after_cr >= 0)
3531             src_base--;
3532           break;
3533         }
3534
3535       if (byte_after_cr >= 0)
3536         c1 = byte_after_cr, byte_after_cr = -1;
3537       else
3538         ONE_MORE_BYTE (c1);
3539       if (c1 < 0)
3540         goto invalid_code;
3541
3542       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3543         {
3544           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545           char_offset++;
3546           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3547           continue;
3548         }
3549
3550       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3551         {
3552           if (c1 == ISO_CODE_ESC)
3553             {
3554               if (src + 1 >= src_end)
3555                 goto no_more_source;
3556               *charbuf++ = ISO_CODE_ESC;
3557               char_offset++;
3558               if (src[0] == '%' && src[1] == '@')
3559                 {
3560                   src += 2;
3561                   consumed_chars += 2;
3562                   char_offset += 2;
3563                   /* We are sure charbuf can contain two more chars. */
3564                   *charbuf++ = '%';
3565                   *charbuf++ = '@';
3566                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3567                 }
3568             }
3569           else
3570             {
3571               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3572               char_offset++;
3573             }
3574           continue;
3575         }
3576
3577       if ((cmp_status->state == COMPOSING_RULE
3578            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3579           && c1 != ISO_CODE_ESC)
3580         {
3581           int rule;
3582
3583           DECODE_COMPOSITION_RULE (rule);
3584           STORE_COMPOSITION_RULE (rule);
3585           continue;
3586         }
3587
3588       /* We produce at most one character.  */
3589       switch (iso_code_class [c1])
3590         {
3591         case ISO_0x20_or_0x7F:
3592           if (charset_id_0 < 0
3593               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3594             /* This is SPACE or DEL.  */
3595             charset = CHARSET_FROM_ID (charset_ascii);
3596           else
3597             charset = CHARSET_FROM_ID (charset_id_0);
3598           break;
3599
3600         case ISO_graphic_plane_0:
3601           if (charset_id_0 < 0)
3602             charset = CHARSET_FROM_ID (charset_ascii);
3603           else
3604             charset = CHARSET_FROM_ID (charset_id_0);
3605           break;
3606
3607         case ISO_0xA0_or_0xFF:
3608           if (charset_id_1 < 0
3609               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3610               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3611             goto invalid_code;
3612           /* This is a graphic character, we fall down ... */
3613
3614         case ISO_graphic_plane_1:
3615           if (charset_id_1 < 0)
3616             goto invalid_code;
3617           charset = CHARSET_FROM_ID (charset_id_1);
3618           break;
3619
3620         case ISO_control_0:
3621           if (eol_dos && c1 == '\r')
3622             ONE_MORE_BYTE (byte_after_cr);
3623           MAYBE_FINISH_COMPOSITION ();
3624           charset = CHARSET_FROM_ID (charset_ascii);
3625           break;
3626
3627         case ISO_control_1:
3628           goto invalid_code;
3629
3630         case ISO_shift_out:
3631           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3632               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3633             goto invalid_code;
3634           CODING_ISO_INVOCATION (coding, 0) = 1;
3635           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3636           continue;
3637
3638         case ISO_shift_in:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3640             goto invalid_code;
3641           CODING_ISO_INVOCATION (coding, 0) = 0;
3642           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3643           continue;
3644
3645         case ISO_single_shift_2_7:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3647             goto invalid_code;
3648         case ISO_single_shift_2:
3649           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3650             goto invalid_code;
3651           /* SS2 is handled as an escape sequence of ESC 'N' */
3652           c1 = 'N';
3653           goto label_escape_sequence;
3654
3655         case ISO_single_shift_3:
3656           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657             goto invalid_code;
3658           /* SS2 is handled as an escape sequence of ESC 'O' */
3659           c1 = 'O';
3660           goto label_escape_sequence;
3661
3662         case ISO_control_sequence_introducer:
3663           /* CSI is handled as an escape sequence of ESC '[' ...  */
3664           c1 = '[';
3665           goto label_escape_sequence;
3666
3667         case ISO_escape:
3668           ONE_MORE_BYTE (c1);
3669         label_escape_sequence:
3670           /* Escape sequences handled here are invocation,
3671              designation, direction specification, and character
3672              composition specification.  */
3673           switch (c1)
3674             {
3675             case '&':           /* revision of following character set */
3676               ONE_MORE_BYTE (c1);
3677               if (!(c1 >= '@' && c1 <= '~'))
3678                 goto invalid_code;
3679               ONE_MORE_BYTE (c1);
3680               if (c1 != ISO_CODE_ESC)
3681                 goto invalid_code;
3682               ONE_MORE_BYTE (c1);
3683               goto label_escape_sequence;
3684
3685             case '$':           /* designation of 2-byte character set */
3686               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3687                 goto invalid_code;
3688               {
3689                 int reg, chars96;
3690
3691                 ONE_MORE_BYTE (c1);
3692                 if (c1 >= '@' && c1 <= 'B')
3693                   {     /* designation of JISX0208.1978, GB2312.1980,
3694                            or JISX0208.1980 */
3695                     reg = 0, chars96 = 0;
3696                   }
3697                 else if (c1 >= 0x28 && c1 <= 0x2B)
3698                   { /* designation of DIMENSION2_CHARS94 character set */
3699                     reg = c1 - 0x28, chars96 = 0;
3700                     ONE_MORE_BYTE (c1);
3701                   }
3702                 else if (c1 >= 0x2C && c1 <= 0x2F)
3703                   { /* designation of DIMENSION2_CHARS96 character set */
3704                     reg = c1 - 0x2C, chars96 = 1;
3705                     ONE_MORE_BYTE (c1);
3706                   }
3707                 else
3708                   goto invalid_code;
3709                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3710                 /* We must update these variables now.  */
3711                 if (reg == 0)
3712                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3713                 else if (reg == 1)
3714                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3715                 if (chars96 < 0)
3716                   goto invalid_code;
3717               }
3718               continue;
3719
3720             case 'n':           /* invocation of locking-shift-2 */
3721               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3722                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3723                 goto invalid_code;
3724               CODING_ISO_INVOCATION (coding, 0) = 2;
3725               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3726               continue;
3727
3728             case 'o':           /* invocation of locking-shift-3 */
3729               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3731                 goto invalid_code;
3732               CODING_ISO_INVOCATION (coding, 0) = 3;
3733               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734               continue;
3735
3736             case 'N':           /* invocation of single-shift-2 */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3738                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3739                 goto invalid_code;
3740               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3741               if (charset_id_2 < 0)
3742                 charset = CHARSET_FROM_ID (charset_ascii);
3743               else
3744                 charset = CHARSET_FROM_ID (charset_id_2);
3745               ONE_MORE_BYTE (c1);
3746               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3747                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3748                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3749                           ? c1 >= 0x80 : c1 < 0x80)))
3750                 goto invalid_code;
3751               break;
3752
3753             case 'O':           /* invocation of single-shift-3 */
3754               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3755                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3756                 goto invalid_code;
3757               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3758               if (charset_id_3 < 0)
3759                 charset = CHARSET_FROM_ID (charset_ascii);
3760               else
3761                 charset = CHARSET_FROM_ID (charset_id_3);
3762               ONE_MORE_BYTE (c1);
3763               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3764                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3765                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3766                           ? c1 >= 0x80 : c1 < 0x80)))
3767                 goto invalid_code;
3768               break;
3769
3770             case '0': case '2': case '3': case '4': /* start composition */
3771               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3772                 goto invalid_code;
3773               if (last_id != charset_ascii)
3774                 {
3775                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3776                   last_id = charset_ascii;
3777                   last_offset = char_offset;
3778                 }
3779               DECODE_COMPOSITION_START (c1);
3780               continue;
3781
3782             case '1':           /* end composition */
3783               if (cmp_status->state == COMPOSING_NO)
3784                 goto invalid_code;
3785               DECODE_COMPOSITION_END ();
3786               continue;
3787
3788             case '[':           /* specification of direction */
3789               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3790                 goto invalid_code;
3791               /* For the moment, nested direction is not supported.
3792                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3793                  left-to-right, and nonzero means right-to-left.  */
3794               ONE_MORE_BYTE (c1);
3795               switch (c1)
3796                 {
3797                 case ']':       /* end of the current direction */
3798                   coding->mode &= ~CODING_MODE_DIRECTION;
3799
3800                 case '0':       /* end of the current direction */
3801                 case '1':       /* start of left-to-right direction */
3802                   ONE_MORE_BYTE (c1);
3803                   if (c1 == ']')
3804                     coding->mode &= ~CODING_MODE_DIRECTION;
3805                   else
3806                     goto invalid_code;
3807                   break;
3808
3809                 case '2':       /* start of right-to-left direction */
3810                   ONE_MORE_BYTE (c1);
3811                   if (c1 == ']')
3812                     coding->mode |= CODING_MODE_DIRECTION;
3813                   else
3814                     goto invalid_code;
3815                   break;
3816
3817                 default:
3818                   goto invalid_code;
3819                 }
3820               continue;
3821
3822             case '%':
3823               ONE_MORE_BYTE (c1);
3824               if (c1 == '/')
3825                 {
3826                   /* CTEXT extended segment:
3827                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3828                      We keep these bytes as is for the moment.
3829                      They may be decoded by post-read-conversion.  */
3830                   int dim, M, L;
3831                   int size;
3832
3833                   ONE_MORE_BYTE (dim);
3834                   if (dim < '0' || dim > '4')
3835                     goto invalid_code;
3836                   ONE_MORE_BYTE (M);
3837                   if (M < 128)
3838                     goto invalid_code;
3839                   ONE_MORE_BYTE (L);
3840                   if (L < 128)
3841                     goto invalid_code;
3842                   size = ((M - 128) * 128) + (L - 128);
3843                   if (charbuf + 6 > charbuf_end)
3844                     goto break_loop;
3845                   *charbuf++ = ISO_CODE_ESC;
3846                   *charbuf++ = '%';
3847                   *charbuf++ = '/';
3848                   *charbuf++ = dim;
3849                   *charbuf++ = BYTE8_TO_CHAR (M);
3850                   *charbuf++ = BYTE8_TO_CHAR (L);
3851                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3852                 }
3853               else if (c1 == 'G')
3854                 {
3855                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3856                      ESC % G --UTF-8-BYTES-- ESC % @
3857                      We keep these bytes as is for the moment.
3858                      They may be decoded by post-read-conversion.  */
3859                   if (charbuf + 3 > charbuf_end)
3860                     goto break_loop;
3861                   *charbuf++ = ISO_CODE_ESC;
3862                   *charbuf++ = '%';
3863                   *charbuf++ = 'G';
3864                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3865                 }
3866               else
3867                 goto invalid_code;
3868               continue;
3869               break;
3870
3871             default:
3872               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3873                 goto invalid_code;
3874               {
3875                 int reg, chars96;
3876
3877                 if (c1 >= 0x28 && c1 <= 0x2B)
3878                   { /* designation of DIMENSION1_CHARS94 character set */
3879                     reg = c1 - 0x28, chars96 = 0;
3880                     ONE_MORE_BYTE (c1);
3881                   }
3882                 else if (c1 >= 0x2C && c1 <= 0x2F)
3883                   { /* designation of DIMENSION1_CHARS96 character set */
3884                     reg = c1 - 0x2C, chars96 = 1;
3885                     ONE_MORE_BYTE (c1);
3886                   }
3887                 else
3888                   goto invalid_code;
3889                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3890                 /* We must update these variables now.  */
3891                 if (reg == 0)
3892                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3893                 else if (reg == 1)
3894                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3895                 if (chars96 < 0)
3896                   goto invalid_code;
3897               }
3898               continue;
3899             }
3900           break;
3901
3902         default:
3903           emacs_abort ();
3904         }
3905
3906       if (cmp_status->state == COMPOSING_NO
3907           && charset->id != charset_ascii
3908           && last_id != charset->id)
3909         {
3910           if (last_id != charset_ascii)
3911             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912           last_id = charset->id;
3913           last_offset = char_offset;
3914         }
3915
3916       /* Now we know CHARSET and 1st position code C1 of a character.
3917          Produce a decoded character while getting 2nd and 3rd
3918          position codes C2, C3 if necessary.  */
3919       if (CHARSET_DIMENSION (charset) > 1)
3920         {
3921           ONE_MORE_BYTE (c2);
3922           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3923               || ((c1 & 0x80) != (c2 & 0x80)))
3924             /* C2 is not in a valid range.  */
3925             goto invalid_code;
3926           if (CHARSET_DIMENSION (charset) == 2)
3927             c1 = (c1 << 8) | c2;
3928           else
3929             {
3930               ONE_MORE_BYTE (c3);
3931               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3932                   || ((c1 & 0x80) != (c3 & 0x80)))
3933                 /* C3 is not in a valid range.  */
3934                 goto invalid_code;
3935               c1 = (c1 << 16) | (c2 << 8) | c2;
3936             }
3937         }
3938       c1 &= 0x7F7F7F;
3939       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3940       if (c < 0)
3941         {
3942           MAYBE_FINISH_COMPOSITION ();
3943           for (; src_base < src; src_base++, char_offset++)
3944             {
3945               if (ASCII_CHAR_P (*src_base))
3946                 *charbuf++ = *src_base;
3947               else
3948                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3949             }
3950         }
3951       else if (cmp_status->state == COMPOSING_NO)
3952         {
3953           *charbuf++ = c;
3954           char_offset++;
3955         }
3956       else if ((cmp_status->state == COMPOSING_CHAR
3957                 ? cmp_status->nchars
3958                 : cmp_status->ncomps)
3959                >= MAX_COMPOSITION_COMPONENTS)
3960         {
3961           /* Too long composition.  */
3962           MAYBE_FINISH_COMPOSITION ();
3963           *charbuf++ = c;
3964           char_offset++;
3965         }
3966       else
3967         STORE_COMPOSITION_CHAR (c);
3968       continue;
3969
3970     invalid_code:
3971       MAYBE_FINISH_COMPOSITION ();
3972       src = src_base;
3973       consumed_chars = consumed_chars_base;
3974       ONE_MORE_BYTE (c);
3975       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3976       char_offset++;
3977       /* Reset the invocation and designation status to the safest
3978          one; i.e. designate ASCII to the graphic register 0, and
3979          invoke that register to the graphic plane 0.  This typically
3980          helps the case that an designation sequence for ASCII "ESC (
3981          B" is somehow broken (e.g. broken by a newline).  */
3982       CODING_ISO_INVOCATION (coding, 0) = 0;
3983       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3984       charset_id_0 = charset_ascii;
3985       continue;
3986
3987     break_loop:
3988       break;
3989     }
3990
3991  no_more_source:
3992   if (cmp_status->state != COMPOSING_NO)
3993     {
3994       if (coding->mode & CODING_MODE_LAST_BLOCK)
3995         MAYBE_FINISH_COMPOSITION ();
3996       else
3997         {
3998           charbuf -= cmp_status->length;
3999           for (i = 0; i < cmp_status->length; i++)
4000             cmp_status->carryover[i] = charbuf[i];
4001         }
4002     }
4003   else if (last_id != charset_ascii)
4004     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4005   coding->consumed_char += consumed_chars_base;
4006   coding->consumed = src_base - coding->source;
4007   coding->charbuf_used = charbuf - coding->charbuf;
4008 }
4009
4010
4011 /* ISO2022 encoding stuff.  */
4012
4013 /*
4014    It is not enough to say just "ISO2022" when encoding; we have to
4015    specify more details.  In Emacs, each coding system of the ISO2022
4016    variant has the following specifications:
4017         1. Initial designation to G0 thru G3.
4018         2. Allows short-form designation?
4019         3. ASCII should be designated to G0 before control characters?
4020         4. ASCII should be designated to G0 at end of line?
4021         5. 7-bit environment or 8-bit environment?
4022         6. Use locking-shift?
4023         7. Use Single-shift?
4024    And the following two are only for Japanese:
4025         8. Use ASCII in place of JIS0201-1976-Roman?
4026         9. Use JISX0208-1983 in place of JISX0208-1978?
4027    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4028    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4029    details.
4030 */
4031
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  For a multibyte 94-character set going to register 0 whose
   <final-char> is '@', 'A', or 'B', the short form (no intermediate
   character) is produced unless CODING has CODING_ISO_FLAG_LONG_FORM.

   The expansion emits through the EMIT_* macros, so it relies on
   whatever variables those macros use at the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters selecting the graphic register, for      \
       94-character and 96-character sets respectively.  */             \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    /* ESC & <'@' + revision> precedes the designation when a           \
       revision number is available.  */                                \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_LONG_FORM)                          \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);                  \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);                  \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4079
4080
/* The next two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3): an escape sequence when CODING
   has CODING_ISO_FLAG_SEVEN_BITS set, a single control character
   otherwise.  Both then mark CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4093
4094
/* Emit single-shift-3: ESC O when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the control character ISO_CODE_SS3
   otherwise.  Then mark CODING as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4103
4104
/* The next four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in CODING which
   graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4114
4115
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4121
4122
/* Emit locking-shift-2 (ESC n) and record that graphic register 2 is
   now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4128
4129
/* Emit locking-shift-3 (ESC o) and record that graphic register 3 is
   now invoked to graphic plane 0.  ISO 2022 assigns ESC n to
   locking-shift-2 and ESC o to locking-shift-3, so the final byte
   here must be 'o'; emitting 'n' would make a decoder invoke register
   2 while CODING records register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4135
4136
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the charset whose id is charset_jisx0201_roman.  The expansion
   refers to the variables `coding', `dst' and `produced_chars' of the
   enclosing function.  C1 may be any integer expression; it is
   parenthesized wherever it is used so that the masking applies to
   the whole argument.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* CODING is in single-shifting state: emit the code in the     \
           7-bit or 8-bit form and leave that state.  */                \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4179
4180
/* Emit the two position-codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first producing any designation and invocation codes that
   are needed.  CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is charset_jisx0208, it is
   replaced by the charset whose id is charset_jisx0208_1978.  The
   expansion refers to `coding', `dst' and `produced_chars' of the
   enclosing function.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Invoke it (designating it to a graphic register first if     \
           need be), then go around the loop again to emit the          \
           character itself.  */                                        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    /* CODING is in single-shifting state: emit the codes in the        \
       7-bit or 8-bit form and leave that state.  */                    \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
    break;                                                              \
  } while (1)
4223
4224
/* Encode character C of CHARSET.  Its code is obtained with
   CODING_ENCODE_CHAR; a charset of dimension 1 is emitted through
   ENCODE_ISO_CHARACTER_DIMENSION1 with that code, any other charset
   through ENCODE_ISO_CHARACTER_DIMENSION2 with the code shifted right
   by 8 bits and its low 8 bits as the two position-codes.  CHARSET
   must be a modifiable lvalue because those macros may reassign it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
4235
4236
4237 /* Produce designation and invocation codes at a place pointed by DST
4238    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4239    Return new DST.  */
4240
4241 static unsigned char *
4242 encode_invocation_designation (struct charset *charset,
4243                                struct coding_system *coding,
4244                                unsigned char *dst, ptrdiff_t *p_nchars)
4245 {
4246   bool multibytep = coding->dst_multibyte;
4247   ptrdiff_t produced_chars = *p_nchars;
4248   int reg;                      /* graphic register number */
4249   int id = CHARSET_ID (charset);
4250
4251   /* At first, check designations.  */
4252   for (reg = 0; reg < 4; reg++)
4253     if (id == CODING_ISO_DESIGNATION (coding, reg))
4254       break;
4255
4256   if (reg >= 4)
4257     {
4258       /* CHARSET is not yet designated to any graphic registers.  */
4259       /* At first check the requested designation.  */
4260       reg = CODING_ISO_REQUEST (coding, id);
4261       if (reg < 0)
4262         /* Since CHARSET requests no special designation, designate it
4263            to graphic register 0.  */
4264         reg = 0;
4265
4266       ENCODE_DESIGNATION (charset, reg, coding);
4267     }
4268
4269   if (CODING_ISO_INVOCATION (coding, 0) != reg
4270       && CODING_ISO_INVOCATION (coding, 1) != reg)
4271     {
4272       /* Since the graphic register REG is not invoked to any graphic
4273          planes, invoke it to graphic plane 0.  */
4274       switch (reg)
4275         {
4276         case 0:                 /* graphic register 0 */
4277           ENCODE_SHIFT_IN;
4278           break;
4279
4280         case 1:                 /* graphic register 1 */
4281           ENCODE_SHIFT_OUT;
4282           break;
4283
4284         case 2:                 /* graphic register 2 */
4285           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4286             ENCODE_SINGLE_SHIFT_2;
4287           else
4288             ENCODE_LOCKING_SHIFT_2;
4289           break;
4290
4291         case 3:                 /* graphic register 3 */
4292           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4293             ENCODE_SINGLE_SHIFT_3;
4294           else
4295             ENCODE_LOCKING_SHIFT_3;
4296           break;
4297
4298         default:
4299           break;
4300         }
4301     }
4302
4303   *p_nchars = produced_chars;
4304   return dst;
4305 }
4306
4307
/* Emit the invocation and designation codes that bring the graphic
   planes and registers back to their initial state: shift in unless
   graphic register 0 is the one invoked to graphic plane 0, then
   re-designate every register whose current charset differs from its
   (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial = CODING_ISO_INITIAL (coding, reg);                 \
                                                                        \
        if (initial >= 0                                                \
            && initial != CODING_ISO_DESIGNATION (coding, reg))         \
          {                                                             \
            struct charset *charset = CHARSET_FROM_ID (initial);        \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
4326
4327
4328 /* Produce designation sequences of charsets in the line started from
4329    CHARBUF to a place pointed by DST, and return the number of
4330    produced bytes.  DST should not directly point a buffer text area
4331    which may be relocated by char_charset call.
4332
4333    If the current block ends before any end-of-line, we may fail to
4334    find all the necessary designations.  */
4335
4336 static ptrdiff_t
4337 encode_designation_at_bol (struct coding_system *coding,
4338                            int *charbuf, int *charbuf_end,
4339                            unsigned char *dst)
4340 {
4341   unsigned char *orig = dst;
4342   struct charset *charset;
4343   /* Table of charsets to be designated to each graphic register.  */
4344   int r[4];
4345   int c, found = 0, reg;
4346   ptrdiff_t produced_chars = 0;
4347   bool multibytep = coding->dst_multibyte;
4348   Lisp_Object attrs;
4349   Lisp_Object charset_list;
4350
4351   attrs = CODING_ID_ATTRS (coding->id);
4352   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4353   if (EQ (charset_list, Qiso_2022))
4354     charset_list = Viso_2022_charset_list;
4355
4356   for (reg = 0; reg < 4; reg++)
4357     r[reg] = -1;
4358
4359   while (charbuf < charbuf_end && found < 4)
4360     {
4361       int id;
4362
4363       c = *charbuf++;
4364       if (c == '\n')
4365         break;
4366       charset = char_charset (c, charset_list, NULL);
4367       id = CHARSET_ID (charset);
4368       reg = CODING_ISO_REQUEST (coding, id);
4369       if (reg >= 0 && r[reg] < 0)
4370         {
4371           found++;
4372           r[reg] = id;
4373         }
4374     }
4375
4376   if (found)
4377     {
4378       for (reg = 0; reg < 4; reg++)
4379         if (r[reg] >= 0
4380             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4381           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4382     }
4383
4384   return dst - orig;
4385 }
4386
4387 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO2022 bytes stored from CODING->destination + CODING->produced
        onward.  Negative elements introduce annotations rather than
        characters.  On exit the conversion result is recorded as
        CODING_RESULT_SUCCESS, and CODING->produced,
        CODING->produced_char and CODING_ISO_BOL (CODING) are updated.
        The return value is always 0.  */
4388
4389 static bool
4390 encode_coding_iso_2022 (struct coding_system *coding)
4391 {
4392   bool multibytep = coding->dst_multibyte;
4393   int *charbuf = coding->charbuf;
4394   int *charbuf_end = charbuf + coding->charbuf_used;
4395   unsigned char *dst = coding->destination + coding->produced;
4396   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each step;
          presumably an upper bound on the bytes one step emits --
          TODO confirm against the EMIT_* and ENCODE_* macros.  */
4397   int safe_room = 16;
       /* True when CODING designates at beginning of line and the
          CODING_ISO_BOL state saved in CODING (stored back at the end
          of this function) is set.  */
4398   bool bol_designation
4399     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4400        && CODING_ISO_BOL (coding));
4401   ptrdiff_t produced_chars = 0;
4402   Lisp_Object attrs, eol_type, charset_list;
4403   bool ascii_compatible;
4404   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4405   int preferred_charset_id = -1;
4406
4407   CODING_GET_INFO (coding, attrs, charset_list);
4408   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued eol type is treated like Qunix here.  */
4409   if (VECTORP (eol_type))
4410     eol_type = Qunix;
4411
4412   setup_iso_safe_charsets (attrs);
4413   /* Charset list may have been changed.  */
4414   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4415   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4416
       /* ASCII characters may be emitted as is only when the coding
          system is marked ASCII compatible and uses neither
          designation nor locking shift.  */
4417   ascii_compatible
4418     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4419        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4420                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4421
4422   while (charbuf < charbuf_end)
4423     {
4424       ASSURE_DESTINATION (safe_room);
4425
4426       if (bol_designation)
4427         {
4428           /* We have to produce designation sequences if any now.  */
               /* The sequences are built in DESIG_BUF and then copied,
                  because encode_designation_at_bol must not be handed
                  a pointer into a buffer text area that may be
                  relocated.
                  NOTE(review): ENCODE_DESIGNATION can emit up to 7
                  codes (ESC & <rev>, ESC, '$', intermediate, final) and
                  encode_designation_at_bol may designate up to 4
                  registers, which would exceed 16 bytes -- confirm that
                  this combination cannot occur.  */
4429           unsigned char desig_buf[16];
4430           ptrdiff_t nbytes;
4431           ptrdiff_t offset;
4432
4433           charset_map_loaded = 0;
4434           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4435                                               desig_buf);
               /* If a charset map was loaded meanwhile and
                  coding_change_destination reports a nonzero offset,
                  shift DST and DST_END by that offset.  */
4436           if (charset_map_loaded
4437               && (offset = coding_change_destination (coding)))
4438             {
4439               dst += offset;
4440               dst_end += offset;
4441             }
4442           memcpy (dst, desig_buf, nbytes);
4443           dst += nbytes;
4444           /* We are sure that designation sequences are all ASCII bytes.  */
4445           produced_chars += nbytes;
4446           bol_designation = 0;
4447           ASSURE_DESTINATION (safe_room);
4448         }
4449
4450       c = *charbuf++;
4451
4452       if (c < 0)
4453         {
4454           /* Handle an annotation.  */
               /* The element after the negative one selects the kind of
                  annotation.  -C - 1 further elements are skipped
                  below, so -C looks like the annotation's length
                  counting the element just read -- TODO confirm.  */
4455           switch (*charbuf)
4456             {
4457             case CODING_ANNOTATE_COMPOSITION_MASK:
4458               /* Not yet implemented.  */
4459               break;
4460             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Keep the annotated charset id only if it is a
                      member of CHARSET_LIST.  */
4461               preferred_charset_id = charbuf[2];
4462               if (preferred_charset_id >= 0
4463                   && NILP (Fmemq (make_number (preferred_charset_id),
4464                                   charset_list)))
4465                 preferred_charset_id = -1;
4466               break;
4467             default:
4468               emacs_abort ();
4469             }
4470           charbuf += -c - 1;
4471           continue;
4472         }
4473
4474       /* Now encode the character C.  */
4475       if (c < 0x20 || c == 0x7F)
4476         {
               /* Control character.  A newline, or a carriage return
                  when the eol type is Qmac, ends the line.  */
4477           if (c == '\n'
4478               || (c == '\r' && EQ (eol_type, Qmac)))
4479             {
4480               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4481                 ENCODE_RESET_PLANE_AND_REGISTER ();
4482               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4483                 {
4484                   int i;
4485
                       /* Restore the recorded designations to the
                          initial ones; nothing is emitted here.  */
4486                   for (i = 0; i < 4; i++)
4487                     CODING_ISO_DESIGNATION (coding, i)
4488                       = CODING_ISO_INITIAL (coding, i);
4489                 }
                   /* The next iteration starts a line, so designate
                      there if the coding system asks for it.  */
4490               bol_designation = ((CODING_ISO_FLAGS (coding)
4491                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4492                                  != 0);
4493             }
4494           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4495             ENCODE_RESET_PLANE_AND_REGISTER ();
4496           EMIT_ONE_ASCII_BYTE (c);
4497         }
4498       else if (ASCII_CHAR_P (c))
4499         {
4500           if (ascii_compatible)
4501             EMIT_ONE_ASCII_BYTE (c);
4502           else
4503             {
                   /* Go through the general path so that ASCII is
                      designated and invoked as needed first.  */
4504               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4505               ENCODE_ISO_CHARACTER (charset, c);
4506             }
4507         }
4508       else if (CHAR_BYTE8_P (c))
4509         {
               /* A raw-byte character is emitted as its byte value.  */
4510           c = CHAR_TO_BYTE8 (c);
4511           EMIT_ONE_BYTE (c);
4512         }
4513       else
4514         {
4515           struct charset *charset;
4516
               /* Prefer the annotated charset when it can encode C
                  (as reported by CODING_CHAR_CHARSET_P); otherwise pick
                  a charset from CHARSET_LIST.  */
4517           if (preferred_charset_id >= 0)
4518             {
4519               bool result;
4520
4521               charset = CHARSET_FROM_ID (preferred_charset_id);
4522               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4523               if (! result)
4524                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4525                                      NULL, charset);
4526             }
4527           else
4528             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4529                                  NULL, charset);
4530           if (!charset)
4531             {
                   /* No charset was found for C.  In safe-encoding mode
                      substitute CODING_INHIBIT_CHARACTER_SUBSTITUTION as
                      an ASCII character; otherwise encode the coding
                      system's default character instead.  */
4532               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4533                 {
4534                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4535                   charset = CHARSET_FROM_ID (charset_ascii);
4536                 }
4537               else
4538                 {
4539                   c = coding->default_char;
4540                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4541                                        charset_list, NULL, charset);
4542                 }
4543             }
4544           ENCODE_ISO_CHARACTER (charset, c);
4545         }
4546     }
4547
       /* At the end of the last block, reset planes and registers if
          the coding system resets at end of line.  */
4548   if (coding->mode & CODING_MODE_LAST_BLOCK
4549       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4550     {
4551       ASSURE_DESTINATION (safe_room);
4552       ENCODE_RESET_PLANE_AND_REGISTER ();
4553     }
4554   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Save the beginning-of-line state for the next call.  */
4555   CODING_ISO_BOL (coding) = bol_designation;
4556   coding->produced_char += produced_chars;
4557   coding->produced = dst - coding->destination;
4558   return 0;
4559 }
4560
4561 \f
4562 /*** 8,9. SJIS and BIG5 handlers ***/
4563
4564 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4565    quite widely.  So, for the moment, Emacs supports them in the bare
4566    C code.  But, in the future, they may be supported only by CCL.  */
4567
4568 /* SJIS is a coding system encoding three character sets: ASCII, right
4569    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4570    as is.  A character of charset katakana-jisx0201 is encoded by
4571    "position-code + 0x80".  A character of charset japanese-jisx0208
4572    is encoded in two bytes, but its two position-codes are divided and
4573    shifted so that they fit in the ranges below.
4574
4575    --- CODE RANGE of SJIS ---
4576    (character set)      (range)
4577    ASCII                0x00 .. 0x7F
4578    KATAKANA-JISX0201    0xA0 .. 0xDF
4579    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4580             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4581    -------------------------------
4582
4583 */
4584
4585 /* BIG5 is a coding system encoding two character sets: ASCII and
4586    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4587    character set and is encoded in two bytes.
4588
4589    --- CODE RANGE of BIG5 ---
4590    (character set)      (range)
4591    ASCII                0x00 .. 0x7F
4592    Big5 (1st byte)      0xA1 .. 0xFE
4593         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4594    --------------------------
4595
4596   */
4597
4598 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4599    Return true if a text is encoded in SJIS.

        The scan starts CODING->head_ascii bytes into the source and looks
        at one code per loop iteration:

          - a byte below 0x80 is accepted as is;
          - a byte in 0x81..0x9F, or in 0xE0..MAX, must be followed by a
            byte in 0x40..0xFC other than 0x7F (MAX is 0xFC when the
            charset list of the coding system has more than three
            elements, and 0xEF otherwise);
          - a byte in 0xA0..0xDF is accepted as a one-byte code;
          - anything else ends the scan.

        When the scan ends on an unacceptable byte, CATEGORY_MASK_SJIS is
        added to DETECT_INFO->rejected and the function returns false.
        Otherwise control reaches the `no_more_source' label, where the
        category is either rejected (see below) or reported through
        DETECT_INFO->found if at least one non-ASCII code was seen.  */
4600
4601 static bool
4602 detect_coding_sjis (struct coding_system *coding,
4603                     struct coding_detection_info *detect_info)
4604 {
4605   const unsigned char *src = coding->source, *src_base;
4606   const unsigned char *src_end = coding->source + coding->src_bytes;
4607   bool multibytep = coding->src_multibyte;
4608   ptrdiff_t consumed_chars = 0;
4609   int found = 0;
4610   int c;
4611   Lisp_Object attrs, charset_list;
4612   int max_first_byte_of_2_byte_code;
4613
4614   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list longer than three elements widens the range of
          accepted lead bytes from 0xE0..0xEF to 0xE0..0xFC.  */
4615   max_first_byte_of_2_byte_code
4616     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4617
4618   detect_info->checked |= CATEGORY_MASK_SJIS;
4619   /* A coding system of this category is always ASCII compatible.  */
4620   src += coding->head_ascii;
4621
4622   while (1)
4623     {
           /* Remember where this code starts; `no_more_source' compares it
              with SRC.  */
4624       src_base = src;
4625       ONE_MORE_BYTE (c);
4626       if (c < 0x80)
4627         continue;
4628       if ((c >= 0x81 && c <= 0x9F)
4629           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4630         {
               /* Lead byte of a two-byte code: check the trail byte.  */
4631           ONE_MORE_BYTE (c);
4632           if (c < 0x40 || c == 0x7F || c > 0xFC)
4633             break;
4634           found = CATEGORY_MASK_SJIS;
4635         }
4636       else if (c >= 0xA0 && c < 0xE0)
4637         found = CATEGORY_MASK_SJIS;
4638       else
4639         break;
4640     }
       /* Reached only through `break': an unacceptable byte was seen.  */
4641   detect_info->rejected |= CATEGORY_MASK_SJIS;
4642   return 0;
4643
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm against its definition.  SRC_BASE < SRC then presumably
          means the source ended in the middle of a code, which rejects
          the category only when this is the last block.  */
4644  no_more_source:
4645   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4646     {
4647       detect_info->rejected |= CATEGORY_MASK_SJIS;
4648       return 0;
4649     }
4650   detect_info->found |= found;
4651   return 1;
4652 }
4653
4654 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4655    Return true if a text is encoded in BIG5.

        The scan starts CODING->head_ascii bytes into the source and looks
        at one code per loop iteration:

          - a byte below 0x80 is accepted as is;
          - a byte of 0xA1 or more must be followed by a byte that is at
            least 0x40 and not in 0x7F..0xA0;
          - anything else ends the scan.

        Whenever the scan ends on an unacceptable byte -- lead or trail --
        CATEGORY_MASK_BIG5 is added to DETECT_INFO->rejected and the
        function returns false.  Otherwise control reaches the
        `no_more_source' label, where the category is either rejected (see
        below) or reported through DETECT_INFO->found if at least one
        two-byte code was seen.  */
4656
4657 static bool
4658 detect_coding_big5 (struct coding_system *coding,
4659                     struct coding_detection_info *detect_info)
4660 {
4661   const unsigned char *src = coding->source, *src_base;
4662   const unsigned char *src_end = coding->source + coding->src_bytes;
4663   bool multibytep = coding->src_multibyte;
4664   ptrdiff_t consumed_chars = 0;
4665   int found = 0;
4666   int c;
4667
4668   detect_info->checked |= CATEGORY_MASK_BIG5;
4669   /* A coding system of this category is always ASCII compatible.  */
4670   src += coding->head_ascii;
4671
4672   while (1)
4673     {
           /* Remember where this code starts; `no_more_source' compares it
              with SRC.  */
4674       src_base = src;
4675       ONE_MORE_BYTE (c);
4676       if (c < 0x80)
4677         continue;
4678       if (c >= 0xA1)
4679         {
               /* Lead byte of a two-byte code: check the trail byte.  */
4680           ONE_MORE_BYTE (c);
               /* Leave through `break' rather than a bare `return 0', so
                  that an invalid trail byte marks the category as
                  rejected just as an invalid lead byte does (and as
                  detect_coding_sjis does).  Returning without touching
                  DETECT_INFO->rejected would leave BIG5 as a live
                  candidate for text that is known not to be BIG5.  */
4681           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4682             break;
4683           found = CATEGORY_MASK_BIG5;
4684         }
4685       else
4686         break;
4687     }
       /* Reached only through `break': an unacceptable byte was seen.  */
4688   detect_info->rejected |= CATEGORY_MASK_BIG5;
4689   return 0;
4690
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm against its definition.  SRC_BASE < SRC then presumably
          means the source ended in the middle of a code, which rejects
          the category only when this is the last block.  */
4691  no_more_source:
4692   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4693     {
4694       detect_info->rejected |= CATEGORY_MASK_BIG5;
4695       return 0;
4696     }
4697   detect_info->found |= found;
4698   return 1;
4699 }
4700
4701 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes from CODING->source, starting at
        CODING->consumed, into character codes appended to
        CODING->charbuf.  The first three elements of the coding system's
        charset list are used as the roman, kana and kanji charsets, in
        that order; an optional fourth element is a second kanji charset.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated to the start of the last code
        that was not completely processed.  */
4702
4703 static void
4704 decode_coding_sjis (struct coding_system *coding)
4705 {
4706   const unsigned char *src = coding->source + coding->consumed;
4707   const unsigned char *src_end = coding->source + coding->src_bytes;
4708   const unsigned char *src_base;
4709   int *charbuf = coding->charbuf + coding->charbuf_used;
4710   /* We may produce one charset annotation in one loop and one more at
4711      the end.  */
4712   int *charbuf_end
4713     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4714   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4715   bool multibytep = coding->src_multibyte;
4716   struct charset *charset_roman, *charset_kanji, *charset_kana;
4717   struct charset *charset_kanji2;
4718   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of same-charset characters.  */
4719   ptrdiff_t char_offset = coding->produced_char;
4720   ptrdiff_t last_offset = char_offset;
4721   int last_id = charset_ascii;
4722   bool eol_dos
4723     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS, or -1 if none pending.  */
4724   int byte_after_cr = -1;
4725
4726   CODING_GET_INFO (coding, attrs, charset_list);
4727
       /* Pick the charsets off the list: roman, kana, kanji, and an
          optional second kanji charset.  */
4728   val = charset_list;
4729   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4730   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4732   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4733
4734   while (1)
4735     {
4736       int c, c1;
4737       struct charset *charset;
4738
           /* Record the start of this code so that it can be re-read
              (invalid_code) or left unconsumed (no_more_source).  */
4739       src_base = src;
4740       consumed_chars_base = consumed_chars;
4741
4742       if (charbuf >= charbuf_end)
4743         {
               /* Output is full.  A byte that was read ahead after a CR
                  has not been processed yet, so do not count it as
                  consumed.  */
4744           if (byte_after_cr >= 0)
4745             src_base--;
4746           break;
4747         }
4748
4749       if (byte_after_cr >= 0)
4750         c = byte_after_cr, byte_after_cr = -1;
4751       else
4752         ONE_MORE_BYTE (c);
4753       if (c < 0)
4754         goto invalid_code;
4755       if (c < 0x80)
4756         {
               /* After a CR in DOS EOL mode, fetch the following byte
                  right away; it is processed on the next iteration.  */
4757           if (eol_dos && c == '\r')
4758             ONE_MORE_BYTE (byte_after_cr);
4759           charset = charset_roman;
4760         }
4761       else if (c == 0x80 || c == 0xA0)
4762         goto invalid_code;
4763       else if (c >= 0xA1 && c <= 0xDF)
4764         {
4765           /* SJIS -> JISX0201-Kana */
4766           c &= 0x7F;
4767           charset = charset_kana;
4768         }
4769       else if (c <= 0xEF)
4770         {
4771           /* SJIS -> JISX0208 */
4772           ONE_MORE_BYTE (c1);
4773           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4774             goto invalid_code;
4775           c = (c << 8) | c1;
4776           SJIS_TO_JIS (c);
4777           charset = charset_kanji;
4778         }
4779       else if (c <= 0xFC && charset_kanji2)
4780         {
4781           /* SJIS -> JISX0213-2 */
4782           ONE_MORE_BYTE (c1);
4783           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4784             goto invalid_code;
4785           c = (c << 8) | c1;
4786           SJIS_TO_JIS2 (c);
4787           charset = charset_kanji2;
4788         }
4789       else
4790         goto invalid_code;
           /* A non-ASCII charset different from the previous one starts a
              new run; first close the previous non-ASCII run, if any,
              with a charset annotation.  */
4791       if (charset->id != charset_ascii
4792           && last_id != charset->id)
4793         {
4794           if (last_id != charset_ascii)
4795             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4796           last_id = charset->id;
4797           last_offset = char_offset;
4798         }
4799       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4800       *charbuf++ = c;
4801       char_offset++;
4802       continue;
4803
4804     invalid_code:
           /* Go back to the start of the offending code and emit just its
              first element: -C when ONE_MORE_BYTE gave a negative value,
              otherwise the raw byte as BYTE8_TO_CHAR (C).  */
4805       src = src_base;
4806       consumed_chars = consumed_chars_base;
4807       ONE_MORE_BYTE (c);
4808       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4809       char_offset++;
4810     }
4811
       /* NOTE(review): besides falling out of the loop via `break',
          presumably ONE_MORE_BYTE jumps here when the source runs out --
          confirm against its definition.  */
4812  no_more_source:
4813   if (last_id != charset_ascii)
4814     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4815   coding->consumed_char += consumed_chars_base;
4816   coding->consumed = src_base - coding->source;
4817   coding->charbuf_used = charbuf - coding->charbuf;
4818 }
4819
4820 static void
4821 decode_coding_big5 (struct coding_system *coding)
4822 {
       /* Decode BIG5 bytes from CODING->source, starting at
          CODING->consumed, into character codes appended to
          CODING->charbuf.  The first two elements of the coding system's
          charset list are used as the roman and Big5 charsets.  On return
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          are updated to the start of the last code that was not
          completely processed.  */
4823   const unsigned char *src = coding->source + coding->consumed;
4824   const unsigned char *src_end = coding->source + coding->src_bytes;
4825   const unsigned char *src_base;
4826   int *charbuf = coding->charbuf + coding->charbuf_used;
4827   /* We may produce one charset annotation in one loop and one more at
4828      the end.  */
4829   int *charbuf_end
4830     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4831   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4832   bool multibytep = coding->src_multibyte;
4833   struct charset *charset_roman, *charset_big5;
4834   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the current run of same-charset characters.  */
4835   ptrdiff_t char_offset = coding->produced_char;
4836   ptrdiff_t last_offset = char_offset;
4837   int last_id = charset_ascii;
4838   bool eol_dos
4839     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS, or -1 if none pending.  */
4840   int byte_after_cr = -1;
4841
4842   CODING_GET_INFO (coding, attrs, charset_list);
4843   val = charset_list;
4844   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4845   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4846
4847   while (1)
4848     {
4849       int c, c1;
4850       struct charset *charset;
4851
           /* Record the start of this code so that it can be re-read
              (invalid_code) or left unconsumed (no_more_source).  */
4852       src_base = src;
4853       consumed_chars_base = consumed_chars;
4854
4855       if (charbuf >= charbuf_end)
4856         {
               /* Output is full.  A byte that was read ahead after a CR
                  has not been processed yet, so do not count it as
                  consumed.  */
4857           if (byte_after_cr >= 0)
4858             src_base--;
4859           break;
4860         }
4861
4862       if (byte_after_cr >= 0)
4863         c = byte_after_cr, byte_after_cr = -1;
4864       else
4865         ONE_MORE_BYTE (c);
4866
4867       if (c < 0)
4868         goto invalid_code;
4869       if (c < 0x80)
4870         {
               /* After a CR in DOS EOL mode, fetch the following byte
                  right away; it is processed on the next iteration.  */
4871           if (eol_dos && c == '\r')
4872             ONE_MORE_BYTE (byte_after_cr);
4873           charset = charset_roman;
4874         }
4875       else
4876         {
4877           /* BIG5 -> Big5 */
               /* Lead byte must be in 0xA1..0xFE; trail byte must be in
                  0x40..0x7E or 0xA1..0xFE.  */
4878           if (c < 0xA1 || c > 0xFE)
4879             goto invalid_code;
4880           ONE_MORE_BYTE (c1);
4881           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4882             goto invalid_code;
4883           c = c << 8 | c1;
4884           charset = charset_big5;
4885         }
           /* A non-ASCII charset different from the previous one starts a
              new run; first close the previous non-ASCII run, if any,
              with a charset annotation.  */
4886       if (charset->id != charset_ascii
4887           && last_id != charset->id)
4888         {
4889           if (last_id != charset_ascii)
4890             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4891           last_id = charset->id;
4892           last_offset = char_offset;
4893         }
4894       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4895       *charbuf++ = c;
4896       char_offset++;
4897       continue;
4898
4899     invalid_code:
           /* Go back to the start of the offending code and emit just its
              first element: -C when ONE_MORE_BYTE gave a negative value,
              otherwise the raw byte as BYTE8_TO_CHAR (C).  */
4900       src = src_base;
4901       consumed_chars = consumed_chars_base;
4902       ONE_MORE_BYTE (c);
4903       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4904       char_offset++;
4905     }
4906
       /* NOTE(review): besides falling out of the loop via `break',
          presumably ONE_MORE_BYTE jumps here when the source runs out --
          confirm against its definition.  */
4907  no_more_source:
4908   if (last_id != charset_ascii)
4909     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4910   coding->consumed_char += consumed_chars_base;
4911   coding->consumed = src_base - coding->source;
4912   coding->charbuf_used = charbuf - coding->charbuf;
4913 }
4914
4915 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the characters in CODING->charbuf as SJIS bytes appended to
        CODING->destination.  The second, third and optional fourth
        elements of the coding system's charset list are used as the
        kana, kanji and second kanji charsets.

        ASCII characters are emitted as is when the coding system is
        ASCII compatible, and eight-bit characters are emitted as their
        raw byte.  Any other character is looked up in the charset list:
        a kanji code is converted with JIS_TO_SJIS and emitted as two
        bytes, a kana code is emitted as one byte with the 0x80 bit set,
        and a code of any other charset is emitted as a single byte
        masked with 0x7F.  A character found in no charset is replaced by
        CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode and
        by CODING->default_char otherwise.

        The result recorded is always CODING_RESULT_SUCCESS and the
        return value is always 0.  */
4921
4922 static bool
4923 encode_coding_sjis (struct coding_system *coding)
4924 {
4925   bool multibytep = coding->dst_multibyte;
4926   int *charbuf = coding->charbuf;
4927   int *charbuf_end = charbuf + coding->charbuf_used;
4928   unsigned char *dst = coding->destination + coding->produced;
4929   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4930   int safe_room = 4;
4931   ptrdiff_t produced_chars = 0;
4932   Lisp_Object attrs, charset_list, val;
4933   bool ascii_compatible;
4934   struct charset *charset_kanji, *charset_kana;
4935   struct charset *charset_kanji2;
4936   int c;
4937
4938   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the list; the next ones are kana,
          kanji and an optional second kanji charset.  */
4939   val = XCDR (charset_list);
4940   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4941   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4943
4944   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4945
4946   while (charbuf < charbuf_end)
4947     {
4948       ASSURE_DESTINATION (safe_room);
4949       c = *charbuf++;
4950       /* Now encode the character C.  */
4951       if (ASCII_CHAR_P (c) && ascii_compatible)
4952         EMIT_ONE_ASCII_BYTE (c);
4953       else if (CHAR_BYTE8_P (c))
4954         {
4955           c = CHAR_TO_BYTE8 (c);
4956           EMIT_ONE_BYTE (c);
4957         }
4958       else
4959         {
4960           unsigned code;
4961           struct charset *charset;
4962           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4963                                &code, charset);
4964
4965           if (!charset)
4966             {
                   /* C belongs to none of the charsets: substitute.  */
4967               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4968                 {
4969                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4970                   charset = CHARSET_FROM_ID (charset_ascii);
4971                 }
4972               else
4973                 {
4974                   c = coding->default_char;
4975                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4976                                        charset_list, &code, charset);
4977                 }
4978             }
               /* NOTE(review): CHARSET is used here without a null check;
                  this assumes the lookup of the default character above
                  always finds a charset -- confirm.  */
4979           if (code == CHARSET_INVALID_CODE (charset))
4980             emacs_abort ();
4981           if (charset == charset_kanji)
4982             {
4983               int c1, c2;
4984               JIS_TO_SJIS (code);
4985               c1 = code >> 8, c2 = code & 0xFF;
4986               EMIT_TWO_BYTES (c1, c2);
4987             }
4988           else if (charset == charset_kana)
4989             EMIT_ONE_BYTE (code | 0x80);
4990           else if (charset_kanji2 && charset == charset_kanji2)
4991             {
4992               int c1, c2;
4993
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F, or 0x6E and above are converted
                      with JIS_TO_SJIS2; any other code is emitted as one
                      byte masked with 0x7F.  */
4994               c1 = code >> 8;
4995               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4996                   || c1 == 0x28
4997                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4998                 {
4999                   JIS_TO_SJIS2 (code);
5000                   c1 = code >> 8, c2 = code & 0xFF;
5001                   EMIT_TWO_BYTES (c1, c2);
5002                 }
5003               else
5004                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5005             }
5006           else
5007             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5008         }
5009     }
5010   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5011   coding->produced_char += produced_chars;
5012   coding->produced = dst - coding->destination;
5013   return 0;
5014 }
5015
5016 static bool
5017 encode_coding_big5 (struct coding_system *coding)
5018 {
       /* Encode the characters in CODING->charbuf as BIG5 bytes appended
          to CODING->destination.  The second element of the coding
          system's charset list is used as the Big5 charset.

          ASCII characters are emitted as is when the coding system is
          ASCII compatible, and eight-bit characters are emitted as their
          raw byte.  Any other character is looked up in the charset list:
          a Big5 code is emitted as two bytes (high byte first), and a
          code of any other charset is emitted as a single byte masked
          with 0x7F.  A character found in no charset is replaced by
          CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode and
          by CODING->default_char otherwise.

          The result recorded is always CODING_RESULT_SUCCESS and the
          return value is always 0.  */
5019   bool multibytep = coding->dst_multibyte;
5020   int *charbuf = coding->charbuf;
5021   int *charbuf_end = charbuf + coding->charbuf_used;
5022   unsigned char *dst = coding->destination + coding->produced;
5023   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5024   int safe_room = 4;
5025   ptrdiff_t produced_chars = 0;
5026   Lisp_Object attrs, charset_list, val;
5027   bool ascii_compatible;
5028   struct charset *charset_big5;
5029   int c;
5030
5031   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first element of the list; the second is Big5.  */
5032   val = XCDR (charset_list);
5033   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5034   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5035
5036   while (charbuf < charbuf_end)
5037     {
5038       ASSURE_DESTINATION (safe_room);
5039       c = *charbuf++;
5040       /* Now encode the character C.  */
5041       if (ASCII_CHAR_P (c) && ascii_compatible)
5042         EMIT_ONE_ASCII_BYTE (c);
5043       else if (CHAR_BYTE8_P (c))
5044         {
5045           c = CHAR_TO_BYTE8 (c);
5046           EMIT_ONE_BYTE (c);
5047         }
5048       else
5049         {
5050           unsigned code;
5051           struct charset *charset;
5052           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5053                                &code, charset);
5054
5055           if (! charset)
5056             {
                   /* C belongs to none of the charsets: substitute.  */
5057               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5058                 {
5059                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5060                   charset = CHARSET_FROM_ID (charset_ascii);
5061                 }
5062               else
5063                 {
5064                   c = coding->default_char;
5065                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5066                                        charset_list, &code, charset);
5067                 }
5068             }
               /* NOTE(review): CHARSET is used here without a null check;
                  this assumes the lookup of the default character above
                  always finds a charset -- confirm.  */
5069           if (code == CHARSET_INVALID_CODE (charset))
5070             emacs_abort ();
5071           if (charset == charset_big5)
5072             {
5073               int c1, c2;
5074
5075               c1 = code >> 8, c2 = code & 0xFF;
5076               EMIT_TWO_BYTES (c1, c2);
5077             }
5078           else
5079             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5080         }
5081     }
5082   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5083   coding->produced_char += produced_chars;
5084   coding->produced = dst - coding->destination;
5085   return 0;
5086 }
5087
5088 \f
5089 /*** 10. CCL handlers ***/
5090
5091 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5092    Return true if a text is encoded in a coding system of which
5093    encoder/decoder are written in CCL program.

        The bytes to scan come from the CODING passed in, but the table of
        valid bytes and the attributes are taken from the coding system
        registered for the CCL category.  Every byte must have a non-zero
        entry in that table; the first byte without one (or a negative
        value from ONE_MORE_BYTE) marks CATEGORY_MASK_CCL as rejected in
        DETECT_INFO and makes the function return false.  If the source
        is exhausted without that happening, the function returns true and
        reports the category through DETECT_INFO->found when at least one
        byte had a table entry greater than 1.  */
5094
5095 static bool
5096 detect_coding_ccl (struct coding_system *coding,
5097                    struct coding_detection_info *detect_info)
5098 {
5099   const unsigned char *src = coding->source, *src_base;
5100   const unsigned char *src_end = coding->source + coding->src_bytes;
5101   bool multibytep = coding->src_multibyte;
5102   ptrdiff_t consumed_chars = 0;
5103   int found = 0;
5104   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5105   ptrdiff_t head_ascii = coding->head_ascii;
5106   Lisp_Object attrs;
5107
5108   detect_info->checked |= CATEGORY_MASK_CCL;
5109
       /* From here on CODING is the coding system of the CCL category,
          not the one passed in; SRC, SRC_END and MULTIBYTEP above still
          describe the caller's source.  */
5110   coding = &coding_categories[coding_category_ccl];
5111   valids = CODING_CCL_VALIDS (coding);
5112   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII bytes are skipped only for an ASCII-compatible
          coding system.  */
5113   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5114     src += head_ascii;
5115
5116   while (1)
5117     {
5118       int c;
5119
5120       src_base = src;
5121       ONE_MORE_BYTE (c);
5122       if (c < 0 || ! valids[c])
5123         break;
5124       if ((valids[c] > 1))
5125         found = CATEGORY_MASK_CCL;
5126     }
       /* Reached only through `break': an invalid byte was seen.  */
5127   detect_info->rejected |= CATEGORY_MASK_CCL;
5128   return 0;
5129
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm against its definition.  */
5130  no_more_source:
5131   detect_info->found |= found;
5132   return 1;
5133 }
5134
5135 static void
5136 decode_coding_ccl (struct coding_system *coding)
5137 {
       /* Decode CODING's source by running its CCL program.  The source
          is fed to ccl_driver in chunks of at most 1024 elements and the
          output goes straight into CODING->charbuf.  When the loop ends,
          the CCL status is translated into a conversion result and
          CODING->consumed, CODING->consumed_char and CODING->charbuf_used
          are updated.  */
5138   const unsigned char *src = coding->source + coding->consumed;
5139   const unsigned char *src_end = coding->source + coding->src_bytes;
5140   int *charbuf = coding->charbuf + coding->charbuf_used;
5141   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5142   ptrdiff_t consumed_chars = 0;
5143   bool multibytep = coding->src_multibyte;
5144   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for ccl_driver.  */
5145   int source_charbuf[1024];
       /* For a multibyte source, byte offset from SRC of each element of
          SOURCE_CHARBUF, plus one extra entry for the end of the chunk.  */
5146   int source_byteidx[1025];
5147   Lisp_Object attrs, charset_list;
5148
5149   CODING_GET_INFO (coding, attrs, charset_list);
5150
5151   while (1)
5152     {
5153       const unsigned char *p = src;
5154       ptrdiff_t offset;
5155       int i = 0;
5156
           /* Fill SOURCE_CHARBUF with up to 1024 elements: characters for
              a multibyte source, raw bytes otherwise.  */
5157       if (multibytep)
5158         {
5159           while (i < 1024 && p < src_end)
5160             {
5161               source_byteidx[i] = p - src;
5162               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5163             }
5164           source_byteidx[i] = p - src;
5165         }
5166       else
5167         while (i < 1024 && p < src_end)
5168           source_charbuf[i++] = *p++;
5169
           /* Tell the CCL program when this chunk ends the last block.  */
5170       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5171         ccl->last_block = true;
5172       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5173       charset_map_loaded = 0;
5174       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5175                   charset_list);
           /* If a charset map was loaded and the source moved, shift
              every source pointer by the same amount.  */
5176       if (charset_map_loaded
5177           && (offset = coding_change_source (coding)))
5178         {
5179           p += offset;
5180           src += offset;
5181           src_end += offset;
5182         }
5183       charbuf += ccl->produced;
           /* ccl->consumed counts elements of SOURCE_CHARBUF; for a
              multibyte source convert that count to a byte offset.  */
5184       if (multibytep)
5185         src += source_byteidx[ccl->consumed];
5186       else
5187         src += ccl->consumed;
5188       consumed_chars += ccl->consumed;
           /* Go round again only if the program wants more input and
              there is source left beyond this chunk.  */
5189       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5190         break;
5191     }
5192
5193   switch (ccl->status)
5194     {
5195     case CCL_STAT_SUSPEND_BY_SRC:
5196       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5197       break;
5198     case CCL_STAT_SUSPEND_BY_DST:
5199       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5200       break;
5201     case CCL_STAT_QUIT:
5202     case CCL_STAT_INVALID_CMD:
5203       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5204       break;
5205     default:
5206       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5207       break;
5208     }
5209   coding->consumed_char += consumed_chars;
5210   coding->consumed = src - coding->source;
5211   coding->charbuf_used = charbuf - coding->charbuf;
5212 }
5213
5214 static bool
5215 encode_coding_ccl (struct coding_system *coding)
5216 {
       /* Encode the characters in CODING->charbuf by running the coding
          system's CCL program.  Each call of ccl_driver writes at most
          1024 values into a local buffer, and the low eight bits of each
          value are then appended to CODING->destination.  The loop runs
          until all of CHARBUF is consumed or the program stops with
          CCL_STAT_QUIT or CCL_STAT_INVALID_CMD.  The final CCL status is
          translated into a conversion result; the return value is always
          0.  */
5217   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5218   bool multibytep = coding->dst_multibyte;
5219   int *charbuf = coding->charbuf;
5220   int *charbuf_end = charbuf + coding->charbuf_used;
5221   unsigned char *dst = coding->destination + coding->produced;
5222   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call.  */
5223   int destination_charbuf[1024];
5224   ptrdiff_t produced_chars = 0;
5225   int i;
5226   Lisp_Object attrs, charset_list;
5227
5228   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when it is handling the end of the last
          block.  */
5229   if (coding->consumed_char == coding->src_chars
5230       && coding->mode & CODING_MODE_LAST_BLOCK)
5231     ccl->last_block = true;
5232
5233   do
5234     {
5235       ptrdiff_t offset;
5236
5237       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5238       charset_map_loaded = 0;
5239       ccl_driver (ccl, charbuf, destination_charbuf,
5240                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded and the destination moved, shift
              DST by the same amount.  */
5241       if (charset_map_loaded
5242           && (offset = coding_change_destination (coding)))
5243         dst += offset;
5244       if (multibytep)
5245         {
               /* Two bytes of room are reserved per produced value.
                  EMIT_ONE_BYTE is relied upon to maintain PRODUCED_CHARS
                  here -- NOTE(review): confirm against its definition.  */
5246           ASSURE_DESTINATION (ccl->produced * 2);
5247           for (i = 0; i < ccl->produced; i++)
5248             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5249         }
5250       else
5251         {
5252           ASSURE_DESTINATION (ccl->produced);
5253           for (i = 0; i < ccl->produced; i++)
5254             *dst++ = destination_charbuf[i] & 0xFF;
5255           produced_chars += ccl->produced;
5256         }
5257       charbuf += ccl->consumed;
5258       if (ccl->status == CCL_STAT_QUIT
5259           || ccl->status == CCL_STAT_INVALID_CMD)
5260         break;
5261     }
5262   while (charbuf < charbuf_end);
5263
5264   switch (ccl->status)
5265     {
5266     case CCL_STAT_SUSPEND_BY_SRC:
5267       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5268       break;
5269     case CCL_STAT_SUSPEND_BY_DST:
5270       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5271       break;
5272     case CCL_STAT_QUIT:
5273     case CCL_STAT_INVALID_CMD:
5274       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5275       break;
5276     default:
5277       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5278       break;
5279     }
5280
5281   coding->produced_char += produced_chars;
5282   coding->produced = dst - coding->destination;
5283   return 0;
5284 }
5285
5286 \f
5287 /*** 10, 11. no-conversion handlers ***/
5288
5289 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text is not converted here: nothing is stored in
        CODING->charbuf.  The function sets CODING->chars_at_source and
        marks the whole source as consumed.  The one exception is a source
        that ends with a CR while DOS-style EOL conversion is in effect:
        that CR is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is
        recorded.  In every other case, including an empty source,
        CODING_RESULT_SUCCESS is recorded.  */
5290
5291 static void
5292 decode_coding_raw_text (struct coding_system *coding)
5293 {
5294   bool eol_dos
5295     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5296
5297   coding->chars_at_source = 1;
5298   coding->consumed_char = coding->src_chars;
5299   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one: with an
          empty source, index SRC_BYTES - 1 would be before the start of
          the source.  */
5300   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5301     {
5302       coding->consumed_char--;
5303       coding->consumed--;
5304       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5305     }
5306   else
5307     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5308 }
5309
5310 static bool
5311 encode_coding_raw_text (struct coding_system *coding)
5312 {
       /* Copy the contents of CODING->charbuf to CODING->destination
          without any charset conversion.  There are four cases, chosen by
          whether the destination and the source are multibyte:

            multibyte dst, multibyte src: ASCII and eight-bit characters
              are emitted as one byte; any other character is emitted as
              the bytes produced by CHAR_STRING_ADVANCE, one at a time;
            multibyte dst, unibyte src: each element is emitted with
              EMIT_ONE_BYTE;
            unibyte dst, multibyte src: ASCII and eight-bit characters
              are stored as one byte; any other character is stored by
              CHAR_STRING_ADVANCE;
            unibyte dst, unibyte src: elements are stored as bytes.

          For a unibyte destination the number of produced characters is
          the number of bytes written.  The result recorded is always
          CODING_RESULT_SUCCESS and the return value is always 0.  */
5313   bool multibytep = coding->dst_multibyte;
5314   int *charbuf = coding->charbuf;
5315   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5316   unsigned char *dst = coding->destination + coding->produced;
5317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5318   ptrdiff_t produced_chars = 0;
5319   int c;
5320
5321   if (multibytep)
5322     {
5323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5324
5325       if (coding->src_multibyte)
5326         while (charbuf < charbuf_end)
5327           {
5328             ASSURE_DESTINATION (safe_room);
5329             c = *charbuf++;
5330             if (ASCII_CHAR_P (c))
5331               EMIT_ONE_ASCII_BYTE (c);
5332             else if (CHAR_BYTE8_P (c))
5333               {
5334                 c = CHAR_TO_BYTE8 (c);
5335                 EMIT_ONE_BYTE (c);
5336               }
5337             else
5338               {
                     /* Build the multibyte form of C in STR, then emit it
                        byte by byte.  */
5339                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5340
5341                 CHAR_STRING_ADVANCE (c, p1);
5342                 do
5343                   {
5344                     EMIT_ONE_BYTE (*p0);
5345                     p0++;
5346                   }
5347                 while (p0 < p1);
5348               }
5349           }
5350       else
5351         while (charbuf < charbuf_end)
5352           {
5353             ASSURE_DESTINATION (safe_room);
5354             c = *charbuf++;
5355             EMIT_ONE_BYTE (c);
5356           }
5357     }
5358   else
5359     {
5360       if (coding->src_multibyte)
5361         {
5362           int safe_room = MAX_MULTIBYTE_LENGTH;
5363
5364           while (charbuf < charbuf_end)
5365             {
5366               ASSURE_DESTINATION (safe_room);
5367               c = *charbuf++;
5368               if (ASCII_CHAR_P (c))
5369                 *dst++ = c;
5370               else if (CHAR_BYTE8_P (c))
5371                 *dst++ = CHAR_TO_BYTE8 (c);
5372               else
5373                 CHAR_STRING_ADVANCE (c, dst);
5374             }
5375         }
5376       else
5377         {
               /* Room for all remaining elements is requested at once;
                  the copy loop still stops at DST_END.  */
5378           ASSURE_DESTINATION (charbuf_end - charbuf);
5379           while (charbuf < charbuf_end && dst < dst_end)
5380             *dst++ = *charbuf++;
5381         }
           /* Everything written in this call counts as one character per
              byte.  */
5382       produced_chars = dst - (coding->destination + coding->produced);
5383     }
5384   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5385   coding->produced_char += produced_chars;
5386   coding->produced = dst - coding->destination;
5387   return 0;
5388 }
5389
5390 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5391    Return true if a text is encoded in a charset-based coding system.

        The bytes to scan come from the CODING passed in, but the
        attributes, the table of valid bytes and the name are taken from
        the coding system registered for the charset category.

        For every code, the table entry of its first byte decides what is
        acceptable: nil rejects the byte; an integer names one charset;
        otherwise the entry is treated as a list of candidate charsets.
        The remaining bytes of a multi-dimensional charset must lie within
        the charset's code space.  When the coding system's name starts
        with "iso-8859-" or "iso-latin-", a byte in 0x80..0x9F is accepted
        only if `latin-extra-code-table' is a vector with a non-nil entry
        for it.

        A rejected byte, or a source that ends in the middle of a
        multibyte code, marks CATEGORY_MASK_CHARSET as rejected in
        DETECT_INFO and makes the function return false.  If the source
        is exhausted at a code boundary, the function returns true and
        reports the category through DETECT_INFO->found when at least one
        byte of 0x80 or more was accepted.  */
5392
5393 static bool
5394 detect_coding_charset (struct coding_system *coding,
5395                        struct coding_detection_info *detect_info)
5396 {
5397   const unsigned char *src = coding->source, *src_base;
5398   const unsigned char *src_end = coding->source + coding->src_bytes;
5399   bool multibytep = coding->src_multibyte;
5400   ptrdiff_t consumed_chars = 0;
5401   Lisp_Object attrs, valids, name;
5402   int found = 0;
       /* Saved now because CODING is redirected below.  */
5403   ptrdiff_t head_ascii = coding->head_ascii;
5404   bool check_latin_extra = 0;
5405
5406   detect_info->checked |= CATEGORY_MASK_CHARSET;
5407
       /* From here on CODING is the coding system of the charset
          category, not the one passed in; SRC, SRC_END and MULTIBYTEP
          above still describe the caller's source.  */
5408   coding = &coding_categories[coding_category_charset];
5409   attrs = CODING_ID_ATTRS (coding->id);
5410   valids = AREF (attrs, coding_attr_charset_valids);
5411   name = CODING_ID_NAME (coding->id);
5412   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5413                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5414       || strncmp (SSDATA (SYMBOL_NAME (name)),
5415                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5416     check_latin_extra = 1;
5417
       /* The leading ASCII bytes are skipped only for an ASCII-compatible
          coding system.  */
5418   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5419     src += head_ascii;
5420
5421   while (1)
5422     {
5423       int c;
5424       Lisp_Object val;
5425       struct charset *charset;
5426       int dim, idx;
5427
5428       src_base = src;
5429       ONE_MORE_BYTE (c);
5430       if (c < 0)
5431         continue;
5432       val = AREF (valids, c);
5433       if (NILP (val))
5434         break;
5435       if (c >= 0x80)
5436         {
               /* For the Latin coding systems, 0x80..0x9F needs explicit
                  permission from Vlatin_extra_code_table.  */
5437           if (c < 0xA0
5438               && check_latin_extra
5439               && (!VECTORP (Vlatin_extra_code_table)
5440                   || NILP (AREF (Vlatin_extra_code_table, c))))
5441             break;
5442           found = CATEGORY_MASK_CHARSET;
5443         }
5444       if (INTEGERP (val))
5445         {
               /* A single charset: check each remaining byte of the code
                  against that charset's code space.  */
5446           charset = CHARSET_FROM_ID (XFASTINT (val));
5447           dim = CHARSET_DIMENSION (charset);
5448           for (idx = 1; idx < dim; idx++)
5449             {
5450               if (src == src_end)
5451                 goto too_short;
5452               ONE_MORE_BYTE (c);
5453               if (c < charset->code_space[(dim - 1 - idx) * 4]
5454                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5455                 break;
5456             }
               /* IDX < DIM means some byte was out of range.  */
5457           if (idx < dim)
5458             break;
5459         }
5460       else
5461         {
               /* Several candidate charsets: try them in list order.  IDX
                  is not reset between candidates, so bytes that were
                  already read are not read again.  */
5462           idx = 1;
5463           for (; CONSP (val); val = XCDR (val))
5464             {
5465               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5466               dim = CHARSET_DIMENSION (charset);
5467               while (idx < dim)
5468                 {
5469                   if (src == src_end)
5470                     goto too_short;
5471                   ONE_MORE_BYTE (c);
5472                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5473                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5474                     break;
5475                   idx++;
5476                 }
                   /* This candidate matched; VAL is cleared to signal
                      success to the check after the loop.  */
5477               if (idx == dim)
5478                 {
5479                   val = Qnil;
5480                   break;
5481                 }
5482             }
               /* VAL is still a cons only if a candidate failed and the
                  loop was left through the inner `break' path without a
                  match being recorded.  */
5483           if (CONSP (val))
5484             break;
5485         }
5486     }
       /* Reached through `break' (unacceptable byte) or through the
          explicit jumps above (source ended inside a code).  */
5487  too_short:
5488   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5489   return 0;
5490
       /* NOTE(review): nothing in this function jumps here explicitly;
          presumably ONE_MORE_BYTE does so when the source is exhausted --
          confirm against its definition.  */
5491  no_more_source:
5492   detect_info->found |= found;
5493   return 1;
5494 }
5495
5496 static void
5497 decode_coding_charset (struct coding_system *coding)
5498 {
5499   const unsigned char *src = coding->source + coding->consumed;
5500   const unsigned char *src_end = coding->source + coding->src_bytes;
5501   const unsigned char *src_base;
5502   int *charbuf = coding->charbuf + coding->charbuf_used;
5503   /* We may produce one charset annotation in one loop and one more at
5504      the end.  */
5505   int *charbuf_end
5506     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5507   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5508   bool multibytep = coding->src_multibyte;
5509   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5510   Lisp_Object valids;
5511   ptrdiff_t char_offset = coding->produced_char;
5512   ptrdiff_t last_offset = char_offset;
5513   int last_id = charset_ascii;
5514   bool eol_dos
5515     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5516   int byte_after_cr = -1;
5517
5518   valids = AREF (attrs, coding_attr_charset_valids);
5519
5520   while (1)
5521     {
5522       int c;
5523       Lisp_Object val;
5524       struct charset *charset;
5525       int dim;
5526       int len = 1;
5527       unsigned code;
5528
5529       src_base = src;
5530       consumed_chars_base = consumed_chars;
5531
5532       if (charbuf >= charbuf_end)
5533         {
5534           if (byte_after_cr >= 0)
5535             src_base--;
5536           break;
5537         }
5538
5539       if (byte_after_cr >= 0)
5540         {
5541           c = byte_after_cr;
5542           byte_after_cr = -1;
5543         }
5544       else
5545         {
5546           ONE_MORE_BYTE (c);
5547           if (eol_dos && c == '\r')
5548             ONE_MORE_BYTE (byte_after_cr);
5549         }
5550       if (c < 0)
5551         goto invalid_code;
5552       code = c;
5553
5554       val = AREF (valids, c);
5555       if (! INTEGERP (val) && ! CONSP (val))
5556         goto invalid_code;
5557       if (INTEGERP (val))
5558         {
5559           charset = CHARSET_FROM_ID (XFASTINT (val));
5560           dim = CHARSET_DIMENSION (charset);
5561           while (len < dim)
5562             {
5563               ONE_MORE_BYTE (c);
5564               code = (code << 8) | c;
5565               len++;
5566             }
5567           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5568                               charset, code, c);
5569         }
5570       else
5571         {
5572           /* VAL is a list of charset IDs.  It is assured that the
5573              list is sorted by charset dimensions (smaller one
5574              comes first).  */
5575           while (CONSP (val))
5576             {
5577               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5578               dim = CHARSET_DIMENSION (charset);
5579               while (len < dim)
5580                 {
5581                   ONE_MORE_BYTE (c);
5582                   code = (code << 8) | c;
5583                   len++;
5584                 }
5585               CODING_DECODE_CHAR (coding, src, src_base,
5586                                   src_end, charset, code, c);
5587               if (c >= 0)
5588                 break;
5589               val = XCDR (val);
5590             }
5591         }
5592       if (c < 0)
5593         goto invalid_code;
5594       if (charset->id != charset_ascii
5595           && last_id != charset->id)
5596         {
5597           if (last_id != charset_ascii)
5598             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5599           last_id = charset->id;
5600           last_offset = char_offset;
5601         }
5602
5603       *charbuf++ = c;
5604       char_offset++;
5605       continue;
5606
5607     invalid_code:
5608       src = src_base;
5609       consumed_chars = consumed_chars_base;
5610       ONE_MORE_BYTE (c);
5611       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5612       char_offset++;
5613     }
5614
5615  no_more_source:
5616   if (last_id != charset_ascii)
5617     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5618   coding->consumed_char += consumed_chars_base;
5619   coding->consumed = src_base - coding->source;
5620   coding->charbuf_used = charbuf - coding->charbuf;
5621 }
5622
5623 static bool
5624 encode_coding_charset (struct coding_system *coding)
5625 {
5626   bool multibytep = coding->dst_multibyte;
5627   int *charbuf = coding->charbuf;
5628   int *charbuf_end = charbuf + coding->charbuf_used;
5629   unsigned char *dst = coding->destination + coding->produced;
5630   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5631   int safe_room = MAX_MULTIBYTE_LENGTH;
5632   ptrdiff_t produced_chars = 0;
5633   Lisp_Object attrs, charset_list;
5634   bool ascii_compatible;
5635   int c;
5636
5637   CODING_GET_INFO (coding, attrs, charset_list);
5638   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5639
5640   while (charbuf < charbuf_end)
5641     {
5642       struct charset *charset;
5643       unsigned code;
5644
5645       ASSURE_DESTINATION (safe_room);
5646       c = *charbuf++;
5647       if (ascii_compatible && ASCII_CHAR_P (c))
5648         EMIT_ONE_ASCII_BYTE (c);
5649       else if (CHAR_BYTE8_P (c))
5650         {
5651           c = CHAR_TO_BYTE8 (c);
5652           EMIT_ONE_BYTE (c);
5653         }
5654       else
5655         {
5656           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5657                                &code, charset);
5658
5659           if (charset)
5660             {
5661               if (CHARSET_DIMENSION (charset) == 1)
5662                 EMIT_ONE_BYTE (code);
5663               else if (CHARSET_DIMENSION (charset) == 2)
5664                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5665               else if (CHARSET_DIMENSION (charset) == 3)
5666                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5667               else
5668                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5669                                  (code >> 8) & 0xFF, code & 0xFF);
5670             }
5671           else
5672             {
5673               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5674                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5675               else
5676                 c = coding->default_char;
5677               EMIT_ONE_BYTE (c);
5678             }
5679         }
5680     }
5681
5682   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5683   coding->produced_char += produced_chars;
5684   coding->produced = dst - coding->destination;
5685   return 0;
5686 }
5687
5688 \f
/*** 10. C library functions ***/
5690
5691 /* Setup coding context CODING from information about CODING_SYSTEM.
5692    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5693    CODING_SYSTEM is invalid, signal an error.  */
5694
5695 void
5696 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5697 {
5698   Lisp_Object attrs;
5699   Lisp_Object eol_type;
5700   Lisp_Object coding_type;
5701   Lisp_Object val;
5702
5703   if (NILP (coding_system))
5704     coding_system = Qundecided;
5705
5706   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5707
5708   attrs = CODING_ID_ATTRS (coding->id);
5709   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5710
5711   coding->mode = 0;
5712   if (VECTORP (eol_type))
5713     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5714                             | CODING_REQUIRE_DETECTION_MASK);
5715   else if (! EQ (eol_type, Qunix))
5716     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5717                             | CODING_REQUIRE_ENCODING_MASK);
5718   else
5719     coding->common_flags = 0;
5720   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5721     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5722   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5723     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5724   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5725     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5726
5727   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5728   coding->max_charset_id = SCHARS (val) - 1;
5729   coding->safe_charsets = SDATA (val);
5730   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5731   coding->carryover_bytes = 0;
5732   coding->raw_destination = 0;
5733
5734   coding_type = CODING_ATTR_TYPE (attrs);
5735   if (EQ (coding_type, Qundecided))
5736     {
5737       coding->detector = NULL;
5738       coding->decoder = decode_coding_raw_text;
5739       coding->encoder = encode_coding_raw_text;
5740       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5741       coding->spec.undecided.inhibit_nbd
5742         = (encode_inhibit_flag
5743            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5744       coding->spec.undecided.inhibit_ied
5745         = (encode_inhibit_flag
5746            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5747       coding->spec.undecided.prefer_utf_8
5748         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5749     }
5750   else if (EQ (coding_type, Qiso_2022))
5751     {
5752       int i;
5753       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5754
5755       /* Invoke graphic register 0 to plane 0.  */
5756       CODING_ISO_INVOCATION (coding, 0) = 0;
5757       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5758       CODING_ISO_INVOCATION (coding, 1)
5759         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5760       /* Setup the initial status of designation.  */
5761       for (i = 0; i < 4; i++)
5762         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5763       /* Not single shifting initially.  */
5764       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5765       /* Beginning of buffer should also be regarded as bol. */
5766       CODING_ISO_BOL (coding) = 1;
5767       coding->detector = detect_coding_iso_2022;
5768       coding->decoder = decode_coding_iso_2022;
5769       coding->encoder = encode_coding_iso_2022;
5770       if (flags & CODING_ISO_FLAG_SAFE)
5771         coding->mode |= CODING_MODE_SAFE_ENCODING;
5772       coding->common_flags
5773         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5774             | CODING_REQUIRE_FLUSHING_MASK);
5775       if (flags & CODING_ISO_FLAG_COMPOSITION)
5776         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5777       if (flags & CODING_ISO_FLAG_DESIGNATION)
5778         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5779       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5780         {
5781           setup_iso_safe_charsets (attrs);
5782           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5783           coding->max_charset_id = SCHARS (val) - 1;
5784           coding->safe_charsets = SDATA (val);
5785         }
5786       CODING_ISO_FLAGS (coding) = flags;
5787       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5788       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5789       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5790       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5791     }
5792   else if (EQ (coding_type, Qcharset))
5793     {
5794       coding->detector = detect_coding_charset;
5795       coding->decoder = decode_coding_charset;
5796       coding->encoder = encode_coding_charset;
5797       coding->common_flags
5798         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5799     }
5800   else if (EQ (coding_type, Qutf_8))
5801     {
5802       val = AREF (attrs, coding_attr_utf_bom);
5803       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5804                                    : EQ (val, Qt) ? utf_with_bom
5805                                    : utf_without_bom);
5806       coding->detector = detect_coding_utf_8;
5807       coding->decoder = decode_coding_utf_8;
5808       coding->encoder = encode_coding_utf_8;
5809       coding->common_flags
5810         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5811       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5812         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5813     }
5814   else if (EQ (coding_type, Qutf_16))
5815     {
5816       val = AREF (attrs, coding_attr_utf_bom);
5817       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5818                                     : EQ (val, Qt) ? utf_with_bom
5819                                     : utf_without_bom);
5820       val = AREF (attrs, coding_attr_utf_16_endian);
5821       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5822                                        : utf_16_little_endian);
5823       CODING_UTF_16_SURROGATE (coding) = 0;
5824       coding->detector = detect_coding_utf_16;
5825       coding->decoder = decode_coding_utf_16;
5826       coding->encoder = encode_coding_utf_16;
5827       coding->common_flags
5828         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5829       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5830         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5831     }
5832   else if (EQ (coding_type, Qccl))
5833     {
5834       coding->detector = detect_coding_ccl;
5835       coding->decoder = decode_coding_ccl;
5836       coding->encoder = encode_coding_ccl;
5837       coding->common_flags
5838         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5839             | CODING_REQUIRE_FLUSHING_MASK);
5840     }
5841   else if (EQ (coding_type, Qemacs_mule))
5842     {
5843       coding->detector = detect_coding_emacs_mule;
5844       coding->decoder = decode_coding_emacs_mule;
5845       coding->encoder = encode_coding_emacs_mule;
5846       coding->common_flags
5847         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5848       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5849           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5850         {
5851           Lisp_Object tail, safe_charsets;
5852           int max_charset_id = 0;
5853
5854           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5855                tail = XCDR (tail))
5856             if (max_charset_id < XFASTINT (XCAR (tail)))
5857               max_charset_id = XFASTINT (XCAR (tail));
5858           safe_charsets = make_uninit_string (max_charset_id + 1);
5859           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5860           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5861                tail = XCDR (tail))
5862             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5863           coding->max_charset_id = max_charset_id;
5864           coding->safe_charsets = SDATA (safe_charsets);
5865         }
5866       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5867       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5868     }
5869   else if (EQ (coding_type, Qshift_jis))
5870     {
5871       coding->detector = detect_coding_sjis;
5872       coding->decoder = decode_coding_sjis;
5873       coding->encoder = encode_coding_sjis;
5874       coding->common_flags
5875         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5876     }
5877   else if (EQ (coding_type, Qbig5))
5878     {
5879       coding->detector = detect_coding_big5;
5880       coding->decoder = decode_coding_big5;
5881       coding->encoder = encode_coding_big5;
5882       coding->common_flags
5883         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5884     }
5885   else                          /* EQ (coding_type, Qraw_text) */
5886     {
5887       coding->detector = NULL;
5888       coding->decoder = decode_coding_raw_text;
5889       coding->encoder = encode_coding_raw_text;
5890       if (! EQ (eol_type, Qunix))
5891         {
5892           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5893           if (! VECTORP (eol_type))
5894             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5895         }
5896
5897     }
5898
5899   return;
5900 }
5901
5902 /* Return a list of charsets supported by CODING.  */
5903
5904 Lisp_Object
5905 coding_charset_list (struct coding_system *coding)
5906 {
5907   Lisp_Object attrs, charset_list;
5908
5909   CODING_GET_INFO (coding, attrs, charset_list);
5910   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5911     {
5912       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5913
5914       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5915         charset_list = Viso_2022_charset_list;
5916     }
5917   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5918     {
5919       charset_list = Vemacs_mule_charset_list;
5920     }
5921   return charset_list;
5922 }
5923
5924
5925 /* Return a list of charsets supported by CODING-SYSTEM.  */
5926
5927 Lisp_Object
5928 coding_system_charset_list (Lisp_Object coding_system)
5929 {
5930   ptrdiff_t id;
5931   Lisp_Object attrs, charset_list;
5932
5933   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5934   attrs = CODING_ID_ATTRS (id);
5935
5936   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5937     {
5938       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5939
5940       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5941         charset_list = Viso_2022_charset_list;
5942       else
5943         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5944     }
5945   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5946     {
5947       charset_list = Vemacs_mule_charset_list;
5948     }
5949   else
5950     {
5951       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5952     }
5953   return charset_list;
5954 }
5955
5956
5957 /* Return raw-text or one of its subsidiaries that has the same
5958    eol_type as CODING-SYSTEM.  */
5959
5960 Lisp_Object
5961 raw_text_coding_system (Lisp_Object coding_system)
5962 {
5963   Lisp_Object spec, attrs;
5964   Lisp_Object eol_type, raw_text_eol_type;
5965
5966   if (NILP (coding_system))
5967     return Qraw_text;
5968   spec = CODING_SYSTEM_SPEC (coding_system);
5969   attrs = AREF (spec, 0);
5970
5971   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5972     return coding_system;
5973
5974   eol_type = AREF (spec, 2);
5975   if (VECTORP (eol_type))
5976     return Qraw_text;
5977   spec = CODING_SYSTEM_SPEC (Qraw_text);
5978   raw_text_eol_type = AREF (spec, 2);
5979   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5980           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5981           : AREF (raw_text_eol_type, 2));
5982 }
5983
5984 /* Return true if CODING corresponds to raw-text coding-system.  */
5985
5986 bool
5987 raw_text_coding_system_p (struct coding_system *coding)
5988 {
5989   return (coding->decoder == decode_coding_raw_text
5990           && coding->encoder == encode_coding_raw_text) ? true : false;
5991 }
5992
5993
5994 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5995    the subsidiary that has the same eol-spec as PARENT (if it is not
5996    nil and specifies end-of-line format) or the system's setting
5997    (system_eol_type).  */
5998
5999 Lisp_Object
6000 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6001 {
6002   Lisp_Object spec, eol_type;
6003
6004   if (NILP (coding_system))
6005     coding_system = Qraw_text;
6006   else
6007     CHECK_CODING_SYSTEM (coding_system);
6008   spec = CODING_SYSTEM_SPEC (coding_system);
6009   eol_type = AREF (spec, 2);
6010   if (VECTORP (eol_type))
6011     {
6012       Lisp_Object parent_eol_type;
6013
6014       if (! NILP (parent))
6015         {
6016           Lisp_Object parent_spec;
6017
6018           CHECK_CODING_SYSTEM (parent);
6019           parent_spec = CODING_SYSTEM_SPEC (parent);
6020           parent_eol_type = AREF (parent_spec, 2);
6021           if (VECTORP (parent_eol_type))
6022             parent_eol_type = system_eol_type;
6023         }
6024       else
6025         parent_eol_type = system_eol_type;
6026       if (EQ (parent_eol_type, Qunix))
6027         coding_system = AREF (eol_type, 0);
6028       else if (EQ (parent_eol_type, Qdos))
6029         coding_system = AREF (eol_type, 1);
6030       else if (EQ (parent_eol_type, Qmac))
6031         coding_system = AREF (eol_type, 2);
6032     }
6033   return coding_system;
6034 }
6035
6036
6037 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6038    decided for writing to a process.  If not, complement them, and
6039    return a new coding system.  */
6040
6041 Lisp_Object
6042 complement_process_encoding_system (Lisp_Object coding_system)
6043 {
6044   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6045   Lisp_Object spec, attrs;
6046   int i;
6047
6048   for (i = 0; i < 3; i++)
6049     {
6050       if (i == 1)
6051         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6052       else if (i == 2)
6053         coding_system = preferred_coding_system ();
6054       spec = CODING_SYSTEM_SPEC (coding_system);
6055       if (NILP (spec))
6056         continue;
6057       attrs = AREF (spec, 0);
6058       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6059         coding_base = CODING_ATTR_BASE_NAME (attrs);
6060       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6061         eol_base = coding_system;
6062       if (! NILP (coding_base) && ! NILP (eol_base))
6063         break;
6064     }
6065
6066   if (i > 0)
6067     /* The original CODING_SYSTEM didn't specify text-conversion or
6068        eol-conversion.  Be sure that we return a fully complemented
6069        coding system.  */
6070     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6071   return coding_system;
6072 }
6073
6074
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  However,
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, coding systems are
   first grouped into the following categories:
6080
6081    o coding-category-emacs-mule
6082
6083         The category for a coding system which has the same code range
6084         as Emacs' internal format.  Assigned the coding-system (Lisp
6085         symbol) `emacs-mule' by default.
6086
6087    o coding-category-sjis
6088
6089         The category for a coding system which has the same code range
6090         as SJIS.  Assigned the coding-system (Lisp
6091         symbol) `japanese-shift-jis' by default.
6092
6093    o coding-category-iso-7
6094
6095         The category for a coding system which has the same code range
6096         as ISO2022 of 7-bit environment.  This doesn't use any locking
6097         shift and single shift functions.  This can encode/decode all
6098         charsets.  Assigned the coding-system (Lisp symbol)
6099         `iso-2022-7bit' by default.
6100
6101    o coding-category-iso-7-tight
6102
6103         Same as coding-category-iso-7 except that this can
6104         encode/decode only the specified charsets.
6105
6106    o coding-category-iso-8-1
6107
6108         The category for a coding system which has the same code range
6109         as ISO2022 of 8-bit environment and graphic plane 1 used only
6110         for DIMENSION1 charset.  This doesn't use any locking shift
6111         and single shift functions.  Assigned the coding-system (Lisp
6112         symbol) `iso-latin-1' by default.
6113
6114    o coding-category-iso-8-2
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 8-bit environment and graphic plane 1 used only
6118         for DIMENSION2 charset.  This doesn't use any locking shift
6119         and single shift functions.  Assigned the coding-system (Lisp
6120         symbol) `japanese-iso-8bit' by default.
6121
6122    o coding-category-iso-7-else
6123
6124         The category for a coding system which has the same code range
6125         as ISO2022 of 7-bit environment but uses locking shift or
6126         single shift functions.  Assigned the coding-system (Lisp
6127         symbol) `iso-2022-7bit-lock' by default.
6128
6129    o coding-category-iso-8-else
6130
6131         The category for a coding system which has the same code range
6132         as ISO2022 of 8-bit environment but uses locking shift or
6133         single shift functions.  Assigned the coding-system (Lisp
6134         symbol) `iso-2022-8bit-ss2' by default.
6135
6136    o coding-category-big5
6137
6138         The category for a coding system which has the same code range
6139         as BIG5.  Assigned the coding-system (Lisp symbol)
6140         `cn-big5' by default.
6141
6142    o coding-category-utf-8
6143
6144         The category for a coding system which has the same code range
6145         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6146         symbol) `utf-8' by default.
6147
6148    o coding-category-utf-16-be
6149
6150         The category for a coding system in which a text has an
6151         Unicode signature (cf. Unicode Standard) in the order of BIG
6152         endian at the head.  Assigned the coding-system (Lisp symbol)
6153         `utf-16-be' by default.
6154
6155    o coding-category-utf-16-le
6156
6157         The category for a coding system in which a text has an
6158         Unicode signature (cf. Unicode Standard) in the order of
6159         LITTLE endian at the head.  Assigned the coding-system (Lisp
6160         symbol) `utf-16-le' by default.
6161
6162    o coding-category-ccl
6163
6164         The category for a coding system of which encoder/decoder is
6165         written in CCL programs.  The default value is nil, i.e., no
6166         coding system is assigned.
6167
6168    o coding-category-binary
6169
6170         The category for a coding system not categorized in any of the
6171         above.  Assigned the coding-system (Lisp symbol)
6172         `no-conversion' by default.
6173
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually does is detect a category of coding system, and
   then use the `coding-system' assigned to that category.  If Emacs
   can't narrow the candidates down to a single category, it selects
   the category of the highest priority.  Priorities of categories
   are also specified by a user in the Lisp variable
   `coding-category-list'.

6181
6182 */
6183
6184 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6185                                            int eol_seen);
6186
6187
6188 /* Return the number of ASCII characters at the head of the source.
6189    By side effects, set coding->head_ascii and update
6190    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6191    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6192    reliable only when all the source bytes are ASCII.  */
6193
6194 static ptrdiff_t
6195 check_ascii (struct coding_system *coding)
6196 {
6197   const unsigned char *src, *end;
6198   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6199   int eol_seen = coding->eol_seen;
6200
6201   coding_set_source (coding);
6202   src = coding->source;
6203   end = src + coding->src_bytes;
6204
6205   if (inhibit_eol_conversion
6206       || SYMBOLP (eol_type))
6207     {
6208       /* We don't have to check EOL format.  */
6209       while (src < end && !( *src & 0x80))
6210         {
6211           if (*src++ == '\n')
6212             eol_seen |= EOL_SEEN_LF;
6213         }
6214     }
6215   else
6216     {
6217       end--;                /* We look ahead one byte for "CR LF".  */
6218       while (src < end)
6219         {
6220           int c = *src;
6221
6222           if (c & 0x80)
6223             break;
6224           src++;
6225           if (c == '\r')
6226             {
6227               if (*src == '\n')
6228                 {
6229                   eol_seen |= EOL_SEEN_CRLF;
6230                   src++;
6231                 }
6232               else
6233                 eol_seen |= EOL_SEEN_CR;
6234             }
6235           else if (c == '\n')
6236             eol_seen |= EOL_SEEN_LF;
6237         }
6238       if (src == end)
6239         {
6240           int c = *src;
6241
6242           /* All bytes but the last one C are ASCII.  */
6243           if (! (c & 0x80))
6244             {
6245               if (c == '\r')
6246                 eol_seen |= EOL_SEEN_CR;
6247               else if (c  == '\n')
6248                 eol_seen |= EOL_SEEN_LF;
6249               src++;
6250             }
6251         }
6252     }
6253   coding->head_ascii = src - coding->source;
6254   coding->eol_seen = eol_seen;
6255   return (coding->head_ascii);
6256 }
6257
6258
6259 /* Return the number of characters at the source if all the bytes are
6260    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6261    effects, update coding->eol_seen.  The value of coding->eol_seen is
6262    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6263    the value is reliable only when all the source bytes are valid
6264    UTF-8.  */
6265
6266 static ptrdiff_t
6267 check_utf_8 (struct coding_system *coding)
6268 {
6269   const unsigned char *src, *end;
6270   int eol_seen;
6271   ptrdiff_t nchars = coding->head_ascii;
6272
6273   if (coding->head_ascii < 0)
6274     check_ascii (coding);
6275   else
6276     coding_set_source (coding);
6277   src = coding->source + coding->head_ascii;
6278   /* We look ahead one byte for CR LF.  */
6279   end = coding->source + coding->src_bytes - 1;
6280   eol_seen = coding->eol_seen;
6281   while (src < end)
6282     {
6283       int c = *src;
6284
6285       if (UTF_8_1_OCTET_P (*src))
6286         {
6287           src++;
6288           if (c < 0x20)
6289             {
6290               if (c == '\r')
6291                 {
6292                   if (*src == '\n')
6293                     {
6294                       eol_seen |= EOL_SEEN_CRLF;
6295                       src++;
6296                       nchars++;
6297                     }
6298                   else
6299                     eol_seen |= EOL_SEEN_CR;
6300                 }
6301               else if (c == '\n')
6302                 eol_seen |= EOL_SEEN_LF;
6303             }
6304         }
6305       else if (UTF_8_2_OCTET_LEADING_P (c))
6306         {
6307           if (c < 0xC2          /* overlong sequence */
6308               || src + 1 >= end
6309               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6310             return -1;
6311           src += 2;
6312         }
6313       else if (UTF_8_3_OCTET_LEADING_P (c))
6314         {
6315           if (src + 2 >= end
6316               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6317                     && UTF_8_EXTRA_OCTET_P (src[2])))
6318             return -1;
6319           c = (((c & 0xF) << 12)
6320                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6321           if (c < 0x800                       /* overlong sequence */
6322               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6323             return -1;
6324           src += 3;
6325         }
6326       else if (UTF_8_4_OCTET_LEADING_P (c))
6327         {
6328           if (src + 3 >= end
6329               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6330                     && UTF_8_EXTRA_OCTET_P (src[2])
6331                     && UTF_8_EXTRA_OCTET_P (src[3])))
6332             return -1;
6333           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6334                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6335           if (c < 0x10000       /* overlong sequence */
6336               || c >= 0x110000) /* non-Unicode character  */
6337             return -1;
6338           src += 4;
6339         }
6340       else
6341         return -1;
6342       nchars++;
6343     }
6344
6345   if (src == end)
6346     {
6347       if (! UTF_8_1_OCTET_P (*src))
6348         return -1;
6349       nchars++;
6350       if (*src == '\r')
6351         eol_seen |= EOL_SEEN_CR;
6352       else if (*src  == '\n')
6353         eol_seen |= EOL_SEEN_LF;
6354     }
6355   coding->eol_seen = eol_seen;
6356   return nchars;
6357 }
6358
6359
6360 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6361    SOURCE is encoded.  If CATEGORY is one of
6362    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6363    two-byte, else they are encoded by one-byte.
6364
6365    Return one of EOL_SEEN_XXX.  */
6366
6367 #define MAX_EOL_CHECK_COUNT 3
6368
6369 static int
6370 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6371             enum coding_category category)
6372 {
6373   const unsigned char *src = source, *src_end = src + src_bytes;
6374   unsigned char c;
6375   int total  = 0;
6376   int eol_seen = EOL_SEEN_NONE;
6377
6378   if ((1 << category) & CATEGORY_MASK_UTF_16)
6379     {
6380       bool msb = category == (coding_category_utf_16_le
6381                               | coding_category_utf_16_le_nosig);
6382       bool lsb = !msb;
6383
6384       while (src + 1 < src_end)
6385         {
6386           c = src[lsb];
6387           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6388             {
6389               int this_eol;
6390
6391               if (c == '\n')
6392                 this_eol = EOL_SEEN_LF;
6393               else if (src + 3 >= src_end
6394                        || src[msb + 2] != 0
6395                        || src[lsb + 2] != '\n')
6396                 this_eol = EOL_SEEN_CR;
6397               else
6398                 {
6399                   this_eol = EOL_SEEN_CRLF;
6400                   src += 2;
6401                 }
6402
6403               if (eol_seen == EOL_SEEN_NONE)
6404                 /* This is the first end-of-line.  */
6405                 eol_seen = this_eol;
6406               else if (eol_seen != this_eol)
6407                 {
6408                   /* The found type is different from what found before.
6409                      Allow for stray ^M characters in DOS EOL files.  */
6410                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6411                       || (eol_seen == EOL_SEEN_CRLF
6412                           && this_eol == EOL_SEEN_CR))
6413                     eol_seen = EOL_SEEN_CRLF;
6414                   else
6415                     {
6416                       eol_seen = EOL_SEEN_LF;
6417                       break;
6418                     }
6419                 }
6420               if (++total == MAX_EOL_CHECK_COUNT)
6421                 break;
6422             }
6423           src += 2;
6424         }
6425     }
6426   else
6427     while (src < src_end)
6428       {
6429         c = *src++;
6430         if (c == '\n' || c == '\r')
6431           {
6432             int this_eol;
6433
6434             if (c == '\n')
6435               this_eol = EOL_SEEN_LF;
6436             else if (src >= src_end || *src != '\n')
6437               this_eol = EOL_SEEN_CR;
6438             else
6439               this_eol = EOL_SEEN_CRLF, src++;
6440
6441             if (eol_seen == EOL_SEEN_NONE)
6442               /* This is the first end-of-line.  */
6443               eol_seen = this_eol;
6444             else if (eol_seen != this_eol)
6445               {
6446                 /* The found type is different from what found before.
6447                    Allow for stray ^M characters in DOS EOL files.  */
6448                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6449                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6450                   eol_seen = EOL_SEEN_CRLF;
6451                 else
6452                   {
6453                     eol_seen = EOL_SEEN_LF;
6454                     break;
6455                   }
6456               }
6457             if (++total == MAX_EOL_CHECK_COUNT)
6458               break;
6459           }
6460       }
6461   return eol_seen;
6462 }
6463
6464
6465 static Lisp_Object
6466 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6467 {
6468   Lisp_Object eol_type;
6469
6470   eol_type = CODING_ID_EOL_TYPE (coding->id);
6471   if (! VECTORP (eol_type))
6472     /* Already adjusted.  */
6473     return eol_type;
6474   if (eol_seen & EOL_SEEN_LF)
6475     {
6476       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6477       eol_type = Qunix;
6478     }
6479   else if (eol_seen & EOL_SEEN_CRLF)
6480     {
6481       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6482       eol_type = Qdos;
6483     }
6484   else if (eol_seen & EOL_SEEN_CR)
6485     {
6486       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6487       eol_type = Qmac;
6488     }
6489   return eol_type;
6490 }
6491
6492 /* Detect how a text specified in CODING is encoded.  If a coding
6493    system is detected, update fields of CODING by the detected coding
6494    system.  */
6495
6496 static void
6497 detect_coding (struct coding_system *coding)
6498 {
6499   const unsigned char *src, *src_end;
6500   unsigned int saved_mode = coding->mode;
6501   Lisp_Object found = Qnil;
6502   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6503
6504   coding->consumed = coding->consumed_char = 0;
6505   coding->produced = coding->produced_char = 0;
6506   coding_set_source (coding);
6507
6508   src_end = coding->source + coding->src_bytes;
6509
6510   coding->eol_seen = EOL_SEEN_NONE;
6511   /* If we have not yet decided the text encoding type, detect it
6512      now.  */
6513   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6514     {
6515       int c, i;
6516       struct coding_detection_info detect_info;
6517       bool null_byte_found = 0, eight_bit_found = 0;
6518       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6519                                        inhibit_null_byte_detection);
6520       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6521                                        inhibit_iso_escape_detection);
6522       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6523
6524       coding->head_ascii = 0;
6525       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6526       for (src = coding->source; src < src_end; src++)
6527         {
6528           c = *src;
6529           if (c & 0x80)
6530             {
6531               eight_bit_found = 1;
6532               if (null_byte_found)
6533                 break;
6534             }
6535           else if (c < 0x20)
6536             {
6537               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6538                   && ! inhibit_ied
6539                   && ! detect_info.checked)
6540                 {
6541                   if (detect_coding_iso_2022 (coding, &detect_info))
6542                     {
6543                       /* We have scanned the whole data.  */
6544                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6545                         {
6546                           /* We didn't find an 8-bit code.  We may
6547                              have found a null-byte, but it's very
6548                              rare that a binary file conforms to
6549                              ISO-2022.  */
6550                           src = src_end;
6551                           coding->head_ascii = src - coding->source;
6552                         }
6553                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6554                       break;
6555                     }
6556                 }
6557               else if (! c && !inhibit_nbd)
6558                 {
6559                   null_byte_found = 1;
6560                   if (eight_bit_found)
6561                     break;
6562                 }
6563               else if (! disable_ascii_optimization
6564                        && ! inhibit_eol_conversion)
6565                 {
6566                   if (c == '\r')
6567                     {
6568                       if (src < src_end && src[1] == '\n')
6569                         {
6570                           coding->eol_seen |= EOL_SEEN_CRLF;
6571                           src++;
6572                           if (! eight_bit_found)
6573                             coding->head_ascii++;
6574                         }
6575                       else
6576                         coding->eol_seen |= EOL_SEEN_CR;
6577                     }
6578                   else if (c == '\n')
6579                     {
6580                       coding->eol_seen |= EOL_SEEN_LF;
6581                     }
6582                 }
6583
6584               if (! eight_bit_found)
6585                 coding->head_ascii++;
6586             }
6587           else if (! eight_bit_found)
6588             coding->head_ascii++;
6589         }
6590
6591       if (null_byte_found || eight_bit_found
6592           || coding->head_ascii < coding->src_bytes
6593           || detect_info.found)
6594         {
6595           enum coding_category category;
6596           struct coding_system *this;
6597
6598           if (coding->head_ascii == coding->src_bytes)
6599             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6600             for (i = 0; i < coding_category_raw_text; i++)
6601               {
6602                 category = coding_priorities[i];
6603                 this = coding_categories + category;
6604                 if (detect_info.found & (1 << category))
6605                   break;
6606               }
6607           else
6608             {
6609               if (null_byte_found)
6610                 {
6611                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6612                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6613                 }
6614               else if (prefer_utf_8
6615                        && detect_coding_utf_8 (coding, &detect_info))
6616                 {
6617                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6618                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6619                 }
6620               for (i = 0; i < coding_category_raw_text; i++)
6621                 {
6622                   category = coding_priorities[i];
6623                   this = coding_categories + category;
6624                   /* Some of this->detector (e.g. detect_coding_sjis)
6625                      require this information.  */
6626                   coding->id = this->id;
6627                   if (this->id < 0)
6628                     {
6629                       /* No coding system of this category is defined.  */
6630                       detect_info.rejected |= (1 << category);
6631                     }
6632                   else if (category >= coding_category_raw_text)
6633                     continue;
6634                   else if (detect_info.checked & (1 << category))
6635                     {
6636                       if (detect_info.found & (1 << category))
6637                         break;
6638                     }
6639                   else if ((*(this->detector)) (coding, &detect_info)
6640                            && detect_info.found & (1 << category))
6641                     break;
6642                 }
6643             }
6644
6645           if (i < coding_category_raw_text)
6646             {
6647               if (category == coding_category_utf_8_auto)
6648                 {
6649                   Lisp_Object coding_systems;
6650
6651                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6652                                          coding_attr_utf_bom);
6653                   if (CONSP (coding_systems))
6654                     {
6655                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6656                         found = XCAR (coding_systems);
6657                       else
6658                         found = XCDR (coding_systems);
6659                     }
6660                   else
6661                     found = CODING_ID_NAME (this->id);
6662                 }
6663               else if (category == coding_category_utf_16_auto)
6664                 {
6665                   Lisp_Object coding_systems;
6666
6667                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6668                                          coding_attr_utf_bom);
6669                   if (CONSP (coding_systems))
6670                     {
6671                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6672                         found = XCAR (coding_systems);
6673                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6674                         found = XCDR (coding_systems);
6675                     }
6676                   else
6677                     found = CODING_ID_NAME (this->id);
6678                 }
6679               else
6680                 found = CODING_ID_NAME (this->id);
6681             }
6682           else if (null_byte_found)
6683             found = Qno_conversion;
6684           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6685                    == CATEGORY_MASK_ANY)
6686             found = Qraw_text;
6687           else if (detect_info.rejected)
6688             for (i = 0; i < coding_category_raw_text; i++)
6689               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6690                 {
6691                   this = coding_categories + coding_priorities[i];
6692                   found = CODING_ID_NAME (this->id);
6693                   break;
6694                 }
6695         }
6696     }
6697   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6698            == coding_category_utf_8_auto)
6699     {
6700       Lisp_Object coding_systems;
6701       struct coding_detection_info detect_info;
6702
6703       coding_systems
6704         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6705       detect_info.found = detect_info.rejected = 0;
6706       if (check_ascii (coding) == coding->src_bytes)
6707         {
6708           if (CONSP (coding_systems))
6709             found = XCDR (coding_systems);
6710         }
6711       else
6712         {
6713           if (CONSP (coding_systems)
6714               && detect_coding_utf_8 (coding, &detect_info))
6715             {
6716               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6717                 found = XCAR (coding_systems);
6718               else
6719                 found = XCDR (coding_systems);
6720             }
6721         }
6722     }
6723   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6724            == coding_category_utf_16_auto)
6725     {
6726       Lisp_Object coding_systems;
6727       struct coding_detection_info detect_info;
6728
6729       coding_systems
6730         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6731       detect_info.found = detect_info.rejected = 0;
6732       coding->head_ascii = 0;
6733       if (CONSP (coding_systems)
6734           && detect_coding_utf_16 (coding, &detect_info))
6735         {
6736           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6737             found = XCAR (coding_systems);
6738           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6739             found = XCDR (coding_systems);
6740         }
6741     }
6742
6743   if (! NILP (found))
6744     {
6745       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6746                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6747                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6748                            : EOL_SEEN_LF);
6749
6750       setup_coding_system (found, coding);
6751       if (specified_eol != EOL_SEEN_NONE)
6752         adjust_coding_eol_type (coding, specified_eol);
6753     }
6754
6755   coding->mode = saved_mode;
6756 }
6757
6758
6759 static void
6760 decode_eol (struct coding_system *coding)
6761 {
6762   Lisp_Object eol_type;
6763   unsigned char *p, *pbeg, *pend;
6764
6765   eol_type = CODING_ID_EOL_TYPE (coding->id);
6766   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6767     return;
6768
6769   if (NILP (coding->dst_object))
6770     pbeg = coding->destination;
6771   else
6772     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6773   pend = pbeg + coding->produced;
6774
6775   if (VECTORP (eol_type))
6776     {
6777       int eol_seen = EOL_SEEN_NONE;
6778
6779       for (p = pbeg; p < pend; p++)
6780         {
6781           if (*p == '\n')
6782             eol_seen |= EOL_SEEN_LF;
6783           else if (*p == '\r')
6784             {
6785               if (p + 1 < pend && *(p + 1) == '\n')
6786                 {
6787                   eol_seen |= EOL_SEEN_CRLF;
6788                   p++;
6789                 }
6790               else
6791                 eol_seen |= EOL_SEEN_CR;
6792             }
6793         }
6794       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6795       if ((eol_seen & EOL_SEEN_CRLF) != 0
6796           && (eol_seen & EOL_SEEN_CR) != 0
6797           && (eol_seen & EOL_SEEN_LF) == 0)
6798         eol_seen = EOL_SEEN_CRLF;
6799       else if (eol_seen != EOL_SEEN_NONE
6800           && eol_seen != EOL_SEEN_LF
6801           && eol_seen != EOL_SEEN_CRLF
6802           && eol_seen != EOL_SEEN_CR)
6803         eol_seen = EOL_SEEN_LF;
6804       if (eol_seen != EOL_SEEN_NONE)
6805         eol_type = adjust_coding_eol_type (coding, eol_seen);
6806     }
6807
6808   if (EQ (eol_type, Qmac))
6809     {
6810       for (p = pbeg; p < pend; p++)
6811         if (*p == '\r')
6812           *p = '\n';
6813     }
6814   else if (EQ (eol_type, Qdos))
6815     {
6816       ptrdiff_t n = 0;
6817       ptrdiff_t pos = coding->dst_pos;
6818       ptrdiff_t pos_byte = coding->dst_pos_byte;
6819       ptrdiff_t pos_end = pos_byte + coding->produced - 1;
6820
6821       /* This assertion is here instead of code, now deleted, that
6822          handled the NILP case, which no longer happens with the
6823          current codebase.  */
6824       eassert (!NILP (coding->dst_object));
6825
6826       while (pos_byte < pos_end)
6827         {
6828           p = BYTE_POS_ADDR (pos_byte);
6829           if (*p == '\r' && p[1] == '\n')
6830             {
6831               del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6832               n++;
6833               pos_end--;
6834             }
6835           pos++;
6836           if (coding->dst_multibyte)
6837             pos_byte += BYTES_BY_CHAR_HEAD (*p);
6838           else
6839             pos_byte++;
6840         }
6841       coding->produced -= n;
6842       coding->produced_char -= n;
6843     }
6844 }
6845
6846
6847 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6848    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6849    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6850 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6851
6852 /* Return a translation table (or list of them) from coding system
6853    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6854    not ENCODEP). */
6855
6856 static Lisp_Object
6857 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6858 {
6859   Lisp_Object standard, translation_table;
6860   Lisp_Object val;
6861
6862   if (NILP (Venable_character_translation))
6863     {
6864       if (max_lookup)
6865         *max_lookup = 0;
6866       return Qnil;
6867     }
6868   if (encodep)
6869     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6870       standard = Vstandard_translation_table_for_encode;
6871   else
6872     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6873       standard = Vstandard_translation_table_for_decode;
6874   if (NILP (translation_table))
6875     translation_table = standard;
6876   else
6877     {
6878       if (SYMBOLP (translation_table))
6879         translation_table = Fget (translation_table, Qtranslation_table);
6880       else if (CONSP (translation_table))
6881         {
6882           translation_table = Fcopy_sequence (translation_table);
6883           for (val = translation_table; CONSP (val); val = XCDR (val))
6884             if (SYMBOLP (XCAR (val)))
6885               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6886         }
6887       if (CHAR_TABLE_P (standard))
6888         {
6889           if (CONSP (translation_table))
6890             translation_table = nconc2 (translation_table, list1 (standard));
6891           else
6892             translation_table = list2 (translation_table, standard);
6893         }
6894     }
6895
6896   if (max_lookup)
6897     {
6898       *max_lookup = 1;
6899       if (CHAR_TABLE_P (translation_table)
6900           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6901         {
6902           val = XCHAR_TABLE (translation_table)->extras[1];
6903           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6904             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6905         }
6906       else if (CONSP (translation_table))
6907         {
6908           Lisp_Object tail;
6909
6910           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6911             if (CHAR_TABLE_P (XCAR (tail))
6912                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6913               {
6914                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6915                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6916                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6917               }
6918         }
6919     }
6920   return translation_table;
6921 }
6922
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  Whenever a table maps C to a character, C is replaced
   by that character and TRANS is reset to Qnil; with a list, lookup
   then continues in the next table using the new C.  Otherwise TRANS
   is left holding the value found: in a list, the first non-nil
   non-character value stops the search.  C and TRANS must be
   lvalues; TABLE and C are evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        tail = table;                                           \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6947
6948
6949 /* Return a translation of character(s) at BUF according to TRANS.
6950    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6951    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6952    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6953    translation is found, and Qnil if not found..
6954    If BUF is too short to lookup characters in FROM, return Qt.  */
6955
6956 static Lisp_Object
6957 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6958 {
6959
6960   if (INTEGERP (trans))
6961     return trans;
6962   for (; CONSP (trans); trans = XCDR (trans))
6963     {
6964       Lisp_Object val = XCAR (trans);
6965       Lisp_Object from = XCAR (val);
6966       ptrdiff_t len = ASIZE (from);
6967       ptrdiff_t i;
6968
6969       for (i = 0; i < len; i++)
6970         {
6971           if (buf + i == buf_end)
6972             return Qt;
6973           if (XINT (AREF (from, i)) != buf[i])
6974             break;
6975         }
6976       if (i == len)
6977         return val;
6978     }
6979   return Qnil;
6980 }
6981
6982
6983 static int
6984 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6985                bool last_block)
6986 {
6987   unsigned char *dst = coding->destination + coding->produced;
6988   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6989   ptrdiff_t produced;
6990   ptrdiff_t produced_chars = 0;
6991   int carryover = 0;
6992
6993   if (! coding->chars_at_source)
6994     {
6995       /* Source characters are in coding->charbuf.  */
6996       int *buf = coding->charbuf;
6997       int *buf_end = buf + coding->charbuf_used;
6998
6999       if (EQ (coding->src_object, coding->dst_object)
7000           && ! NILP (coding->dst_object))
7001         {
7002           eassert (growable_destination (coding));
7003           coding_set_source (coding);
7004           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7005         }
7006
7007       while (buf < buf_end)
7008         {
7009           int c = *buf;
7010           ptrdiff_t i;
7011
7012           if (c >= 0)
7013             {
7014               ptrdiff_t from_nchars = 1, to_nchars = 1;
7015               Lisp_Object trans = Qnil;
7016
7017               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7018               if (! NILP (trans))
7019                 {
7020                   trans = get_translation (trans, buf, buf_end);
7021                   if (INTEGERP (trans))
7022                     c = XINT (trans);
7023                   else if (CONSP (trans))
7024                     {
7025                       from_nchars = ASIZE (XCAR (trans));
7026                       trans = XCDR (trans);
7027                       if (INTEGERP (trans))
7028                         c = XINT (trans);
7029                       else
7030                         {
7031                           to_nchars = ASIZE (trans);
7032                           c = XINT (AREF (trans, 0));
7033                         }
7034                     }
7035                   else if (EQ (trans, Qt) && ! last_block)
7036                     break;
7037                 }
7038
7039               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7040                 {
7041                   eassert (growable_destination (coding));
7042                   ptrdiff_t dst_size;
7043                   if (INT_MULTIPLY_WRAPV (to_nchars, MAX_MULTIBYTE_LENGTH,
7044                                           &dst_size)
7045                       || INT_ADD_WRAPV (buf_end - buf, dst_size, &dst_size))
7046                     memory_full (SIZE_MAX);
7047                   dst = alloc_destination (coding, dst_size, dst);
7048                   if (EQ (coding->src_object, coding->dst_object))
7049                     {
7050                       coding_set_source (coding);
7051                       dst_end = (((unsigned char *) coding->source)
7052                                  + coding->consumed);
7053                     }
7054                   else
7055                     dst_end = coding->destination + coding->dst_bytes;
7056                 }
7057
7058               for (i = 0; i < to_nchars; i++)
7059                 {
7060                   if (i > 0)
7061                     c = XINT (AREF (trans, i));
7062                   if (coding->dst_multibyte
7063                       || ! CHAR_BYTE8_P (c))
7064                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7065                   else
7066                     *dst++ = CHAR_TO_BYTE8 (c);
7067                 }
7068               produced_chars += to_nchars;
7069               buf += from_nchars;
7070             }
7071           else
7072             /* This is an annotation datum.  (-C) is the length.  */
7073             buf += -c;
7074         }
7075       carryover = buf_end - buf;
7076     }
7077   else
7078     {
7079       /* Source characters are at coding->source.  */
7080       const unsigned char *src = coding->source;
7081       const unsigned char *src_end = src + coding->consumed;
7082
7083       if (EQ (coding->dst_object, coding->src_object))
7084         {
7085           eassert (growable_destination (coding));
7086           dst_end = (unsigned char *) src;
7087         }
7088       if (coding->src_multibyte != coding->dst_multibyte)
7089         {
7090           if (coding->src_multibyte)
7091             {
7092               bool multibytep = 1;
7093               ptrdiff_t consumed_chars = 0;
7094
7095               while (1)
7096                 {
7097                   const unsigned char *src_base = src;
7098                   int c;
7099
7100                   ONE_MORE_BYTE (c);
7101                   if (dst == dst_end)
7102                     {
7103                       eassert (growable_destination (coding));
7104                       if (EQ (coding->src_object, coding->dst_object))
7105                         dst_end = (unsigned char *) src;
7106                       if (dst == dst_end)
7107                         {
7108                           ptrdiff_t offset = src - coding->source;
7109
7110                           dst = alloc_destination (coding, src_end - src + 1,
7111                                                    dst);
7112                           dst_end = coding->destination + coding->dst_bytes;
7113                           coding_set_source (coding);
7114                           src = coding->source + offset;
7115                           src_end = coding->source + coding->consumed;
7116                           if (EQ (coding->src_object, coding->dst_object))
7117                             dst_end = (unsigned char *) src;
7118                         }
7119                     }
7120                   *dst++ = c;
7121                   produced_chars++;
7122                 }
7123             no_more_source:
7124               ;
7125             }
7126           else
7127             while (src < src_end)
7128               {
7129                 bool multibytep = 1;
7130                 int c = *src++;
7131
7132                 if (dst >= dst_end - 1)
7133                   {
7134                     eassert (growable_destination (coding));
7135                     if (EQ (coding->src_object, coding->dst_object))
7136                       dst_end = (unsigned char *) src;
7137                     if (dst >= dst_end - 1)
7138                       {
7139                         ptrdiff_t offset = src - coding->source;
7140                         ptrdiff_t more_bytes;
7141
7142                         if (EQ (coding->src_object, coding->dst_object))
7143                           more_bytes = ((src_end - src) / 2) + 2;
7144                         else
7145                           more_bytes = src_end - src + 2;
7146                         dst = alloc_destination (coding, more_bytes, dst);
7147                         dst_end = coding->destination + coding->dst_bytes;
7148                         coding_set_source (coding);
7149                         src = coding->source + offset;
7150                         src_end = coding->source + coding->consumed;
7151                         if (EQ (coding->src_object, coding->dst_object))
7152                           dst_end = (unsigned char *) src;
7153                       }
7154                   }
7155                 EMIT_ONE_BYTE (c);
7156               }
7157         }
7158       else
7159         {
7160           if (!EQ (coding->src_object, coding->dst_object))
7161             {
7162               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7163
7164               if (require > 0)
7165                 {
7166                   ptrdiff_t offset = src - coding->source;
7167
7168                   dst = alloc_destination (coding, require, dst);
7169                   coding_set_source (coding);
7170                   src = coding->source + offset;
7171                   src_end = coding->source + coding->consumed;
7172                 }
7173             }
7174           produced_chars = coding->consumed_char;
7175           while (src < src_end)
7176             *dst++ = *src++;
7177         }
7178     }
7179
7180   produced = dst - (coding->destination + coding->produced);
7181   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7182     insert_from_gap (produced_chars, produced, 0);
7183   coding->produced += produced;
7184   coding->produced_char += produced_chars;
7185   return carryover;
7186 }
7187
7188 /* Compose text in CODING->object according to the annotation data at
7189    CHARBUF.  CHARBUF is an array:
7190      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7191  */
7192
7193 static void
7194 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7195 {
7196   int len;
7197   ptrdiff_t to;
7198   enum composition_method method;
7199   Lisp_Object components;
7200
7201   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7202   to = pos + charbuf[2];
7203   method = (enum composition_method) (charbuf[4]);
7204
7205   if (method == COMPOSITION_RELATIVE)
7206     components = Qnil;
7207   else
7208     {
7209       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7210       int i, j;
7211
7212       if (method == COMPOSITION_WITH_RULE)
7213         len = charbuf[2] * 3 - 2;
7214       charbuf += MAX_ANNOTATION_LENGTH;
7215       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7216       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7217         {
7218           if (charbuf[i] >= 0)
7219             args[j] = make_number (charbuf[i]);
7220           else
7221             {
7222               i++;
7223               args[j] = make_number (charbuf[i] % 0x100);
7224             }
7225         }
7226       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7227     }
7228   compose_text (pos, to, components, Qnil, coding->dst_object);
7229 }
7230
7231
7232 /* Put `charset' property on text in CODING->object according to
7233    the annotation data at CHARBUF.  CHARBUF is an array:
7234      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7235  */
7236
7237 static void
7238 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7239 {
7240   ptrdiff_t from = pos - charbuf[2];
7241   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7242
7243   Fput_text_property (make_number (from), make_number (pos),
7244                       Qcharset, CHARSET_NAME (charset),
7245                       coding->dst_object);
7246 }
7247
/* Upper bound, in `int' units, on the size of the work area that
   ALLOC_CONVERSION_WORK_AREA allocates for CODING->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING: SIZE units plus
   MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.  The result is
   stored in CODING->charbuf and its length in CODING->charbuf_size.
   The memory comes from SAFE_ALLOCA, so the calling function must
   have USE_SAFE_ALLOCA in scope and call SAFE_FREE when done.
   CODING is parenthesized in the expansion so that any pointer
   expression, not just a plain identifier, may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7261
7262 static void
7263 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7264 {
7265   int *charbuf = coding->charbuf;
7266   int *charbuf_end = charbuf + coding->charbuf_used;
7267
7268   if (NILP (coding->dst_object))
7269     return;
7270
7271   while (charbuf < charbuf_end)
7272     {
7273       if (*charbuf >= 0)
7274         pos++, charbuf++;
7275       else
7276         {
7277           int len = -*charbuf;
7278
7279           if (len > 2)
7280             switch (charbuf[1])
7281               {
7282               case CODING_ANNOTATE_COMPOSITION_MASK:
7283                 produce_composition (coding, charbuf, pos);
7284                 break;
7285               case CODING_ANNOTATE_CHARSET_MASK:
7286                 produce_charset (coding, charbuf, pos);
7287                 break;
7288               default:
7289                 break;
7290               }
7291           charbuf += len;
7292         }
7293     }
7294 }
7295
7296 /* Decode the data at CODING->src_object into CODING->dst_object.
7297    CODING->src_object is a buffer, a string, or nil.
7298    CODING->dst_object is a buffer.
7299
7300    If CODING->src_object is a buffer, it must be the current buffer.
7301    In this case, if CODING->src_pos is positive, it is a position of
7302    the source text in the buffer, otherwise, the source text is in the
7303    gap area of the buffer, and CODING->src_pos specifies the offset of
7304    the text from GPT (which must be the same as PT).  If this is the
7305    same buffer as CODING->dst_object, CODING->src_pos must be
7306    negative.
7307
7308    If CODING->src_object is a string, CODING->src_pos is an index to
7309    that string.
7310
7311    If CODING->src_object is nil, CODING->source must already point to
7312    the non-relocatable memory area.  In this case, CODING->src_pos is
7313    an offset from CODING->source.
7314
7315    The decoded data is inserted at the current point of the buffer
7316    CODING->dst_object.
7317 */
7318
7319 static void
7320 decode_coding (struct coding_system *coding)
7321 {
7322   Lisp_Object attrs;
7323   Lisp_Object undo_list;
7324   Lisp_Object translation_table;
7325   struct ccl_spec cclspec;
7326   int carryover;
7327   int i;
7328
7329   USE_SAFE_ALLOCA;
7330
7331   if (BUFFERP (coding->src_object)
7332       && coding->src_pos > 0
7333       && coding->src_pos < GPT
7334       && coding->src_pos + coding->src_chars > GPT)
7335     move_gap_both (coding->src_pos, coding->src_pos_byte);
7336
7337   undo_list = Qt;
7338   if (BUFFERP (coding->dst_object))
7339     {
7340       set_buffer_internal (XBUFFER (coding->dst_object));
7341       if (GPT != PT)
7342         move_gap_both (PT, PT_BYTE);
7343
7344       /* We must disable undo_list in order to record the whole insert
7345          transaction via record_insert at the end.  But doing so also
7346          disables the recording of the first change to the undo_list.
7347          Therefore we check for first change here and record it via
7348          record_first_change if needed.  */
7349       if (MODIFF <= SAVE_MODIFF)
7350         record_first_change ();
7351
7352       undo_list = BVAR (current_buffer, undo_list);
7353       bset_undo_list (current_buffer, Qt);
7354     }
7355
7356   coding->consumed = coding->consumed_char = 0;
7357   coding->produced = coding->produced_char = 0;
7358   coding->chars_at_source = 0;
7359   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7360
7361   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7362
7363   attrs = CODING_ID_ATTRS (coding->id);
7364   translation_table = get_translation_table (attrs, 0, NULL);
7365
7366   carryover = 0;
7367   if (coding->decoder == decode_coding_ccl)
7368     {
7369       coding->spec.ccl = &cclspec;
7370       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7371     }
7372   do
7373     {
7374       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7375
7376       coding_set_source (coding);
7377       coding->annotated = 0;
7378       coding->charbuf_used = carryover;
7379       (*(coding->decoder)) (coding);
7380       coding_set_destination (coding);
7381       carryover = produce_chars (coding, translation_table, 0);
7382       if (coding->annotated)
7383         produce_annotation (coding, pos);
7384       for (i = 0; i < carryover; i++)
7385         coding->charbuf[i]
7386           = coding->charbuf[coding->charbuf_used - carryover + i];
7387     }
7388   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7389          || (coding->consumed < coding->src_bytes
7390              && (coding->result == CODING_RESULT_SUCCESS
7391                  || coding->result == CODING_RESULT_INVALID_SRC)));
7392
7393   if (carryover > 0)
7394     {
7395       coding_set_destination (coding);
7396       coding->charbuf_used = carryover;
7397       produce_chars (coding, translation_table, 1);
7398     }
7399
7400   coding->carryover_bytes = 0;
7401   if (coding->consumed < coding->src_bytes)
7402     {
7403       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7404       const unsigned char *src;
7405
7406       coding_set_source (coding);
7407       coding_set_destination (coding);
7408       src = coding->source + coding->consumed;
7409
7410       if (coding->mode & CODING_MODE_LAST_BLOCK)
7411         {
7412           /* Flush out unprocessed data as binary chars.  We are sure
7413              that the number of data is less than the size of
7414              coding->charbuf.  */
7415           coding->charbuf_used = 0;
7416           coding->chars_at_source = 0;
7417
7418           while (nbytes-- > 0)
7419             {
7420               int c = *src++;
7421
7422               if (c & 0x80)
7423                 c = BYTE8_TO_CHAR (c);
7424               coding->charbuf[coding->charbuf_used++] = c;
7425             }
7426           produce_chars (coding, Qnil, 1);
7427         }
7428       else
7429         {
7430           /* Record unprocessed bytes in coding->carryover.  We are
7431              sure that the number of data is less than the size of
7432              coding->carryover.  */
7433           unsigned char *p = coding->carryover;
7434
7435           if (nbytes > sizeof coding->carryover)
7436             nbytes = sizeof coding->carryover;
7437           coding->carryover_bytes = nbytes;
7438           while (nbytes-- > 0)
7439             *p++ = *src++;
7440         }
7441       coding->consumed = coding->src_bytes;
7442     }
7443
7444   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7445       && !inhibit_eol_conversion)
7446     decode_eol (coding);
7447   if (BUFFERP (coding->dst_object))
7448     {
7449       bset_undo_list (current_buffer, undo_list);
7450       record_insert (coding->dst_pos, coding->produced_char);
7451     }
7452
7453   SAFE_FREE ();
7454 }
7455
7456
7457 /* Extract an annotation datum from a composition starting at POS and
7458    ending before LIMIT of CODING->src_object (buffer or string), store
7459    the data in BUF, set *STOP to a starting position of the next
7460    composition (if any) or to LIMIT, and return the address of the
7461    next element of BUF.
7462
7463    If such an annotation is not found, set *STOP to a starting
7464    position of a composition after POS (if any) or to LIMIT, and
7465    return BUF.  */
7466
7467 static int *
7468 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7469                                struct coding_system *coding, int *buf,
7470                                ptrdiff_t *stop)
7471 {
7472   ptrdiff_t start, end;
7473   Lisp_Object prop;
7474
7475   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7476       || end > limit)
7477     *stop = limit;
7478   else if (start > pos)
7479     *stop = start;
7480   else
7481     {
7482       if (start == pos)
7483         {
7484           /* We found a composition.  Store the corresponding
7485              annotation data in BUF.  */
7486           int *head = buf;
7487           enum composition_method method = composition_method (prop);
7488           int nchars = COMPOSITION_LENGTH (prop);
7489
7490           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7491           if (method != COMPOSITION_RELATIVE)
7492             {
7493               Lisp_Object components;
7494               ptrdiff_t i, len, i_byte;
7495
7496               components = COMPOSITION_COMPONENTS (prop);
7497               if (VECTORP (components))
7498                 {
7499                   len = ASIZE (components);
7500                   for (i = 0; i < len; i++)
7501                     *buf++ = XINT (AREF (components, i));
7502                 }
7503               else if (STRINGP (components))
7504                 {
7505                   len = SCHARS (components);
7506                   i = i_byte = 0;
7507                   while (i < len)
7508                     {
7509                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7510                       buf++;
7511                     }
7512                 }
7513               else if (INTEGERP (components))
7514                 {
7515                   len = 1;
7516                   *buf++ = XINT (components);
7517                 }
7518               else if (CONSP (components))
7519                 {
7520                   for (len = 0; CONSP (components);
7521                        len++, components = XCDR (components))
7522                     *buf++ = XINT (XCAR (components));
7523                 }
7524               else
7525                 emacs_abort ();
7526               *head -= len;
7527             }
7528         }
7529
7530       if (find_composition (end, limit, &start, &end, &prop,
7531                             coding->src_object)
7532           && end <= limit)
7533         *stop = start;
7534       else
7535         *stop = limit;
7536     }
7537   return buf;
7538 }
7539
7540
7541 /* Extract an annotation datum from a text property `charset' at POS of
7542    CODING->src_object (buffer of string), store the data in BUF, set
7543    *STOP to the position where the value of `charset' property changes
7544    (limiting by LIMIT), and return the address of the next element of
7545    BUF.
7546
7547    If the property value is nil, set *STOP to the position where the
7548    property value is non-nil (limiting by LIMIT), and return BUF.  */
7549
7550 static int *
7551 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7552                            struct coding_system *coding, int *buf,
7553                            ptrdiff_t *stop)
7554 {
7555   Lisp_Object val, next;
7556   int id;
7557
7558   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7559   if (! NILP (val) && CHARSETP (val))
7560     id = XINT (CHARSET_SYMBOL_ID (val));
7561   else
7562     id = -1;
7563   ADD_CHARSET_DATA (buf, 0, id);
7564   next = Fnext_single_property_change (make_number (pos), Qcharset,
7565                                        coding->src_object,
7566                                        make_number (limit));
7567   *stop = XINT (next);
7568   return buf;
7569 }
7570
7571
7572 static void
7573 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7574                int max_lookup)
7575 {
7576   int *buf = coding->charbuf;
7577   int *buf_end = coding->charbuf + coding->charbuf_size;
7578   const unsigned char *src = coding->source + coding->consumed;
7579   const unsigned char *src_end = coding->source + coding->src_bytes;
7580   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7581   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7582   bool multibytep = coding->src_multibyte;
7583   Lisp_Object eol_type;
7584   int c;
7585   ptrdiff_t stop, stop_composition, stop_charset;
7586   int *lookup_buf = NULL;
7587
7588   if (! NILP (translation_table))
7589     lookup_buf = alloca (sizeof (int) * max_lookup);
7590
7591   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7592   if (VECTORP (eol_type))
7593     eol_type = Qunix;
7594
7595   /* Note: composition handling is not yet implemented.  */
7596   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7597
7598   if (NILP (coding->src_object))
7599     stop = stop_composition = stop_charset = end_pos;
7600   else
7601     {
7602       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7603         stop = stop_composition = pos;
7604       else
7605         stop = stop_composition = end_pos;
7606       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7607         stop = stop_charset = pos;
7608       else
7609         stop_charset = end_pos;
7610     }
7611
7612   /* Compensate for CRLF and conversion.  */
7613   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7614   while (buf < buf_end)
7615     {
7616       Lisp_Object trans;
7617
7618       if (pos == stop)
7619         {
7620           if (pos == end_pos)
7621             break;
7622           if (pos == stop_composition)
7623             buf = handle_composition_annotation (pos, end_pos, coding,
7624                                                  buf, &stop_composition);
7625           if (pos == stop_charset)
7626             buf = handle_charset_annotation (pos, end_pos, coding,
7627                                              buf, &stop_charset);
7628           stop = (stop_composition < stop_charset
7629                   ? stop_composition : stop_charset);
7630         }
7631
7632       if (! multibytep)
7633         {
7634           int bytes;
7635
7636           if (coding->encoder == encode_coding_raw_text
7637               || coding->encoder == encode_coding_ccl)
7638             c = *src++, pos++;
7639           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7640             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7641           else
7642             c = BYTE8_TO_CHAR (*src), src++, pos++;
7643         }
7644       else
7645         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7646       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7647         c = '\n';
7648       if (! EQ (eol_type, Qunix))
7649         {
7650           if (c == '\n')
7651             {
7652               if (EQ (eol_type, Qdos))
7653                 *buf++ = '\r';
7654               else
7655                 c = '\r';
7656             }
7657         }
7658
7659       trans = Qnil;
7660       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7661       if (NILP (trans))
7662         *buf++ = c;
7663       else
7664         {
7665           ptrdiff_t from_nchars = 1, to_nchars = 1;
7666           int *lookup_buf_end;
7667           const unsigned char *p = src;
7668           int i;
7669
7670           lookup_buf[0] = c;
7671           for (i = 1; i < max_lookup && p < src_end; i++)
7672             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7673           lookup_buf_end = lookup_buf + i;
7674           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7675           if (INTEGERP (trans))
7676             c = XINT (trans);
7677           else if (CONSP (trans))
7678             {
7679               from_nchars = ASIZE (XCAR (trans));
7680               trans = XCDR (trans);
7681               if (INTEGERP (trans))
7682                 c = XINT (trans);
7683               else
7684                 {
7685                   to_nchars = ASIZE (trans);
7686                   if (buf_end - buf < to_nchars)
7687                     break;
7688                   c = XINT (AREF (trans, 0));
7689                 }
7690             }
7691           else
7692             break;
7693           *buf++ = c;
7694           for (i = 1; i < to_nchars; i++)
7695             *buf++ = XINT (AREF (trans, i));
7696           for (i = 1; i < from_nchars; i++, pos++)
7697             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7698         }
7699     }
7700
7701   coding->consumed = src - coding->source;
7702   coding->consumed_char = pos - coding->src_pos;
7703   coding->charbuf_used = buf - coding->charbuf;
7704   coding->chars_at_source = 0;
7705 }
7706
7707
7708 /* Encode the text at CODING->src_object into CODING->dst_object.
7709    CODING->src_object is a buffer or a string.
7710    CODING->dst_object is a buffer or nil.
7711
7712    If CODING->src_object is a buffer, it must be the current buffer.
7713    In this case, if CODING->src_pos is positive, it is a position of
7714    the source text in the buffer, otherwise. the source text is in the
7715    gap area of the buffer, and coding->src_pos specifies the offset of
7716    the text from GPT (which must be the same as PT).  If this is the
7717    same buffer as CODING->dst_object, CODING->src_pos must be
7718    negative and CODING should not have `pre-write-conversion'.
7719
7720    If CODING->src_object is a string, CODING should not have
7721    `pre-write-conversion'.
7722
7723    If CODING->dst_object is a buffer, the encoded data is inserted at
7724    the current point of that buffer.
7725
7726    If CODING->dst_object is nil, the encoded data is placed at the
7727    memory area specified by CODING->destination.  */
7728
7729 static void
7730 encode_coding (struct coding_system *coding)
7731 {
7732   Lisp_Object attrs;
7733   Lisp_Object translation_table;
7734   int max_lookup;
7735   struct ccl_spec cclspec;
7736
7737   USE_SAFE_ALLOCA;
7738
7739   attrs = CODING_ID_ATTRS (coding->id);
7740   if (coding->encoder == encode_coding_raw_text)
7741     translation_table = Qnil, max_lookup = 0;
7742   else
7743     translation_table = get_translation_table (attrs, 1, &max_lookup);
7744
7745   if (BUFFERP (coding->dst_object))
7746     {
7747       set_buffer_internal (XBUFFER (coding->dst_object));
7748       coding->dst_multibyte
7749         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7750     }
7751
7752   coding->consumed = coding->consumed_char = 0;
7753   coding->produced = coding->produced_char = 0;
7754   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7755
7756   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7757
7758   if (coding->encoder == encode_coding_ccl)
7759     {
7760       coding->spec.ccl = &cclspec;
7761       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7762     }
7763   do {
7764     coding_set_source (coding);
7765     consume_chars (coding, translation_table, max_lookup);
7766     coding_set_destination (coding);
7767     (*(coding->encoder)) (coding);
7768   } while (coding->consumed_char < coding->src_chars);
7769
7770   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7771     insert_from_gap (coding->produced_char, coding->produced, 0);
7772
7773   SAFE_FREE ();
7774 }
7775
7776
7777 /* Name (or base name) of work buffer for code conversion.  */
7778 static Lisp_Object Vcode_conversion_workbuf_name;
7779
7780 /* A working buffer used by the top level conversion.  Once it is
7781    created, it is never destroyed.  It has the name
7782    Vcode_conversion_workbuf_name.  The other working buffers are
7783    destroyed after the use is finished, and their names are modified
7784    versions of Vcode_conversion_workbuf_name.  */
7785 static Lisp_Object Vcode_conversion_reused_workbuf;
7786
7787 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7788 static bool reused_workbuf_in_use;
7789
7790
7791 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7792    multibyteness of returning buffer.  */
7793
7794 static Lisp_Object
7795 make_conversion_work_buffer (bool multibyte)
7796 {
7797   Lisp_Object name, workbuf;
7798   struct buffer *current;
7799
7800   if (reused_workbuf_in_use)
7801     {
7802       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7803       workbuf = Fget_buffer_create (name);
7804     }
7805   else
7806     {
7807       reused_workbuf_in_use = 1;
7808       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7809         Vcode_conversion_reused_workbuf
7810           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7811       workbuf = Vcode_conversion_reused_workbuf;
7812     }
7813   current = current_buffer;
7814   set_buffer_internal (XBUFFER (workbuf));
7815   /* We can't allow modification hooks to run in the work buffer.  For
7816      instance, directory_files_internal assumes that file decoding
7817      doesn't compile new regexps.  */
7818   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7819   Ferase_buffer ();
7820   bset_undo_list (current_buffer, Qt);
7821   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7822   set_buffer_internal (current);
7823   return workbuf;
7824 }
7825
7826
7827 static void
7828 code_conversion_restore (Lisp_Object arg)
7829 {
7830   Lisp_Object current, workbuf;
7831
7832   current = XCAR (arg);
7833   workbuf = XCDR (arg);
7834   if (! NILP (workbuf))
7835     {
7836       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7837         reused_workbuf_in_use = 0;
7838       else
7839         Fkill_buffer (workbuf);
7840     }
7841   set_buffer_internal (XBUFFER (current));
7842 }
7843
7844 Lisp_Object
7845 code_conversion_save (bool with_work_buf, bool multibyte)
7846 {
7847   Lisp_Object workbuf = Qnil;
7848
7849   if (with_work_buf)
7850     workbuf = make_conversion_work_buffer (multibyte);
7851   record_unwind_protect (code_conversion_restore,
7852                          Fcons (Fcurrent_buffer (), workbuf));
7853   return workbuf;
7854 }
7855
7856 void
7857 decode_coding_gap (struct coding_system *coding,
7858                    ptrdiff_t chars, ptrdiff_t bytes)
7859 {
7860   ptrdiff_t count = SPECPDL_INDEX ();
7861   Lisp_Object attrs;
7862
7863   coding->src_object = Fcurrent_buffer ();
7864   coding->src_chars = chars;
7865   coding->src_bytes = bytes;
7866   coding->src_pos = -chars;
7867   coding->src_pos_byte = -bytes;
7868   coding->src_multibyte = chars < bytes;
7869   coding->dst_object = coding->src_object;
7870   coding->dst_pos = PT;
7871   coding->dst_pos_byte = PT_BYTE;
7872   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7873
7874   coding->head_ascii = -1;
7875   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7876   coding->eol_seen = EOL_SEEN_NONE;
7877   if (CODING_REQUIRE_DETECTION (coding))
7878     detect_coding (coding);
7879   attrs = CODING_ID_ATTRS (coding->id);
7880   if (! disable_ascii_optimization
7881       && ! coding->src_multibyte
7882       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7883       && NILP (CODING_ATTR_POST_READ (attrs))
7884       && NILP (get_translation_table (attrs, 0, NULL)))
7885     {
7886       chars = coding->head_ascii;
7887       if (chars < 0)
7888         chars = check_ascii (coding);
7889       if (chars != bytes)
7890         {
7891           /* There exists a non-ASCII byte.  */
7892           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7893               && coding->detected_utf8_bytes == coding->src_bytes)
7894             {
7895               if (coding->detected_utf8_chars >= 0)
7896                 chars = coding->detected_utf8_chars;
7897               else
7898                 chars = check_utf_8 (coding);
7899               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7900                   && coding->head_ascii == 0
7901                   && coding->source[0] == UTF_8_BOM_1
7902                   && coding->source[1] == UTF_8_BOM_2
7903                   && coding->source[2] == UTF_8_BOM_3)
7904                 {
7905                   chars--;
7906                   bytes -= 3;
7907                   coding->src_bytes -= 3;
7908                 }
7909             }
7910           else
7911             chars = -1;
7912         }
7913       if (chars >= 0)
7914         {
7915           Lisp_Object eol_type;
7916
7917           eol_type = CODING_ID_EOL_TYPE (coding->id);
7918           if (VECTORP (eol_type))
7919             {
7920               if (coding->eol_seen != EOL_SEEN_NONE)
7921                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7922             }
7923           if (EQ (eol_type, Qmac))
7924             {
7925               unsigned char *src_end = GAP_END_ADDR;
7926               unsigned char *src = src_end - coding->src_bytes;
7927
7928               while (src < src_end)
7929                 {
7930                   if (*src++ == '\r')
7931                     src[-1] = '\n';
7932                 }
7933             }
7934           else if (EQ (eol_type, Qdos))
7935             {
7936               unsigned char *src = GAP_END_ADDR;
7937               unsigned char *src_beg = src - coding->src_bytes;
7938               unsigned char *dst = src;
7939               ptrdiff_t diff;
7940
7941               while (src_beg < src)
7942                 {
7943                   *--dst = *--src;
7944                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7945                     src--;
7946                 }
7947               diff = dst - src;
7948               bytes -= diff;
7949               chars -= diff;
7950             }
7951           coding->produced = bytes;
7952           coding->produced_char = chars;
7953           insert_from_gap (chars, bytes, 1);
7954           return;
7955         }
7956     }
7957   code_conversion_save (0, 0);
7958
7959   coding->mode |= CODING_MODE_LAST_BLOCK;
7960   current_buffer->text->inhibit_shrinking = 1;
7961   decode_coding (coding);
7962   current_buffer->text->inhibit_shrinking = 0;
7963
7964   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7965     {
7966       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7967       Lisp_Object val;
7968
7969       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7970       val = call1 (CODING_ATTR_POST_READ (attrs),
7971                    make_number (coding->produced_char));
7972       CHECK_NATNUM (val);
7973       coding->produced_char += Z - prev_Z;
7974       coding->produced += Z_BYTE - prev_Z_BYTE;
7975     }
7976
7977   unbind_to (count, Qnil);
7978 }
7979
7980
7981 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7982    SRC_OBJECT into DST_OBJECT by coding context CODING.
7983
7984    SRC_OBJECT is a buffer, a string, or Qnil.
7985
7986    If it is a buffer, the text is at point of the buffer.  FROM and TO
7987    are positions in the buffer.
7988
7989    If it is a string, the text is at the beginning of the string.
7990    FROM and TO are indices to the string.
7991
7992    If it is nil, the text is at coding->source.  FROM and TO are
7993    indices to coding->source.
7994
7995    DST_OBJECT is a buffer, Qt, or Qnil.
7996
7997    If it is a buffer, the decoded text is inserted at point of the
7998    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7999    is deleted.
8000
8001    If it is Qt, a string is made from the decoded text, and
8002    set in CODING->dst_object.
8003
8004    If it is Qnil, the decoded text is stored at CODING->destination.
8005    The caller must allocate CODING->dst_bytes bytes at
8006    CODING->destination by xmalloc.  If the decoded text is longer than
8007    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
        
        If the coding system has a post-read function
        (CODING_ATTR_POST_READ), it is called after decoding, and
        CODING->produced and CODING->produced_char are corrected by the
        change in buffer size that the call caused.
8008  */
8009
8010 void
8011 decode_coding_object (struct coding_system *coding,
8012                       Lisp_Object src_object,
8013                       ptrdiff_t from, ptrdiff_t from_byte,
8014                       ptrdiff_t to, ptrdiff_t to_byte,
8015                       Lisp_Object dst_object)
8016 {
8017   ptrdiff_t count = SPECPDL_INDEX ();
8018   unsigned char *destination IF_LINT (= NULL);
8019   ptrdiff_t dst_bytes IF_LINT (= 0);
8020   ptrdiff_t chars = to - from;
8021   ptrdiff_t bytes = to_byte - from_byte;
8022   Lisp_Object attrs;
8023   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8024   bool need_marker_adjustment = 0;
8025   Lisp_Object old_deactivate_mark;
8026
       /* Saved here and restored just before returning.  */
8027   old_deactivate_mark = Vdeactivate_mark;
8028
       /* For a raw-memory destination, remember the caller's area and
          its size; both are consulted again after decoding.  */
8029   if (NILP (dst_object))
8030     {
8031       destination = coding->destination;
8032       dst_bytes = coding->dst_bytes;
8033     }
8034
8035   coding->src_object = src_object;
8036   coding->src_chars = chars;
8037   coding->src_bytes = bytes;
       /* More bytes than characters marks the source as multibyte.  */
8038   coding->src_multibyte = chars < bytes;
8039
8040   if (STRINGP (src_object))
8041     {
8042       coding->src_pos = from;
8043       coding->src_pos_byte = from_byte;
8044     }
8045   else if (BUFFERP (src_object))
8046     {
8047       set_buffer_internal (XBUFFER (src_object));
           /* Put the gap at FROM.  */
8048       if (from != GPT)
8049         move_gap_both (from, from_byte);
8050       if (EQ (src_object, dst_object))
8051         {
8052           struct Lisp_Marker *tail;
8053
               /* Decoding in place.  Flag the markers sitting exactly at
                  an edge of the region (FROM for insertion-type markers,
                  TO for the others); they are repositioned by hand at
                  the end of this function.  */
8054           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8055             {
8056               tail->need_adjustment
8057                 = tail->charpos == (tail->insertion_type ? from : to);
8058               need_marker_adjustment |= tail->need_adjustment;
8059             }
               /* Remember point, then delete the source text with point
                  at FROM.  Shrinking stays inhibited until point is
                  restored below.
                  NOTE(review): the negative src_pos/src_pos_byte
                  presumably tell the decoder to read the deleted text
                  from the gap -- confirm against coding_set_source.  */
8060           saved_pt = PT, saved_pt_byte = PT_BYTE;
8061           TEMP_SET_PT_BOTH (from, from_byte);
8062           current_buffer->text->inhibit_shrinking = 1;
8063           del_range_both (from, from_byte, to, to_byte, 1);
8064           coding->src_pos = -chars;
8065           coding->src_pos_byte = -bytes;
8066         }
8067       else
8068         {
8069           coding->src_pos = from;
8070           coding->src_pos_byte = from_byte;
8071         }
8072     }
8073
       /* ATTRS is fetched only after any detection step.
          NOTE(review): presumably because detect_coding may change
          coding->id -- confirm.  */
8074   if (CODING_REQUIRE_DETECTION (coding))
8075     detect_coding (coding);
8076   attrs = CODING_ID_ATTRS (coding->id);
8077
       /* A string result, or a post-read function combined with a
          raw-memory destination: decode into the object returned by
          code_conversion_save (1, ...) (treated as a buffer below),
          starting at BEG.  */
8078   if (EQ (dst_object, Qt)
8079       || (! NILP (CODING_ATTR_POST_READ (attrs))
8080           && NILP (dst_object)))
8081     {
8082       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8083       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8084       coding->dst_pos = BEG;
8085       coding->dst_pos_byte = BEG_BYTE;
8086     }
8087   else if (BUFFERP (dst_object))
8088     {
           /* Decode at point of DST_OBJECT, following that buffer's
              multibyteness.  */
8089       code_conversion_save (0, 0);
8090       coding->dst_object = dst_object;
8091       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8092       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8093       coding->dst_multibyte
8094         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8095     }
8096   else
8097     {
8098       code_conversion_save (0, 0);
8099       coding->dst_object = Qnil;
8100       /* Most callers presume this will return a multibyte result, and they
8101          won't use `binary' or `raw-text' anyway, so let's not worry about
8102          CODING_FOR_UNIBYTE.  */
8103       coding->dst_multibyte = 1;
8104     }
8105
8106   decode_coding (coding);
8107
       /* Make the destination buffer, if any, current for what follows.  */
8108   if (BUFFERP (coding->dst_object))
8109     set_buffer_internal (XBUFFER (coding->dst_object));
8110
       /* Call the post-read function with point at the start of the
          decoded text and the number of decoded characters as its
          argument.  The value must pass CHECK_NATNUM; the produced
          counts are then corrected by however much Z and Z_BYTE
          changed during the call.  */
8111   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8112     {
8113       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8114       Lisp_Object val;
8115
8116       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8117       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8118                         make_number (coding->produced_char));
8119       CHECK_NATNUM (val);
8120       coding->produced_char += Z - prev_Z;
8121       coding->produced += Z_BYTE - prev_Z_BYTE;
8122     }
8123
8124   if (EQ (dst_object, Qt))
8125     {
           /* The result is the text of the current buffer as a string.  */
8126       coding->dst_object = Fbuffer_string ();
8127     }
8128   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8129     {
           /* The text was decoded into a buffer but the caller wants it
              in raw memory.  If the caller's area is too small, grow it
              and copy the text out, first moving the gap to BEGV when
              it lies strictly inside the decoded text.
              NOTE(review): nothing is copied when
              dst_bytes >= coding->produced -- confirm this is intended.  */
8130       set_buffer_internal (XBUFFER (coding->dst_object));
8131       if (dst_bytes < coding->produced)
8132         {
8133           eassert (coding->produced > 0);
8134           destination = xrealloc (destination, coding->produced);
8135           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8136             move_gap_both (BEGV, BEGV_BYTE);
8137           memcpy (destination, BEGV_ADDR, coding->produced);
8138           coding->destination = destination;
8139         }
8140     }
8141
8142   if (saved_pt >= 0)
8143     {
8144       /* This is the case of:
8145          (BUFFERP (src_object) && EQ (src_object, dst_object))
8146          As we have moved PT while replacing the original buffer
8147          contents, we must recover it now.  */
8148       set_buffer_internal (XBUFFER (src_object));
8149       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is kept as is; point inside the
              region goes to FROM; point after it is shifted by the size
              change.  In a unibyte buffer the byte delta is used for
              the character position as well.  */
8150       if (saved_pt < from)
8151         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8152       else if (saved_pt < from + chars)
8153         TEMP_SET_PT_BOTH (from, from_byte);
8154       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8155         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8156                           saved_pt_byte + (coding->produced - bytes));
8157       else
8158         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8159                           saved_pt_byte + (coding->produced - bytes));
8160
           /* Reposition the markers flagged before the deletion:
              insertion-type markers go to FROM, the others to the end
              of the decoded text.  */
8161       if (need_marker_adjustment)
8162         {
8163           struct Lisp_Marker *tail;
8164
8165           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8166             if (tail->need_adjustment)
8167               {
8168                 tail->need_adjustment = 0;
8169                 if (tail->insertion_type)
8170                   {
8171                     tail->bytepos = from_byte;
8172                     tail->charpos = from;
8173                   }
8174                 else
8175                   {
8176                     tail->bytepos = from_byte + coding->produced;
8177                     tail->charpos
8178                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8179                          ? tail->bytepos : from + coding->produced_char);
8180                   }
8181               }
8182         }
8183     }
8184
8185   Vdeactivate_mark = old_deactivate_mark;
8186   unbind_to (count, coding->dst_object);
8187 }
8188
8189
     /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
        SRC_OBJECT into DST_OBJECT by coding context CODING.

        SRC_OBJECT is a string, a buffer, or anything else; in the last
        case the text is taken from CODING->source.

        If the coding system has a pre-write function
        (CODING_ATTR_PRE_WRITE), the source text is first copied into a
        work buffer, the function is called with BEG and Z of that
        buffer, and the text of whatever buffer is current afterwards
        is encoded instead.

        DST_OBJECT is a buffer, Qt, or anything else.

        If it is a buffer, CODING->dst_pos is set to FROM when the buffer
        is the same as SRC_OBJECT (the source text is deleted in that
        case), and to that buffer's point otherwise.

        If it is Qt, a unibyte string is made from the encoded bytes and
        set in CODING->dst_object, unless CODING->raw_destination is
        set, in which case CODING->dst_object is Qnil and the caller
        owns CODING->destination.

        Otherwise CODING->dst_object is set to Qnil and no destination
        memory is allocated here.  */
8190 void
8191 encode_coding_object (struct coding_system *coding,
8192                       Lisp_Object src_object,
8193                       ptrdiff_t from, ptrdiff_t from_byte,
8194                       ptrdiff_t to, ptrdiff_t to_byte,
8195                       Lisp_Object dst_object)
8196 {
8197   ptrdiff_t count = SPECPDL_INDEX ();
8198   ptrdiff_t chars = to - from;
8199   ptrdiff_t bytes = to_byte - from_byte;
8200   Lisp_Object attrs;
8201   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8202   bool need_marker_adjustment = 0;
8203   bool kill_src_buffer = 0;
8204   Lisp_Object old_deactivate_mark;
8205
       /* Saved here and restored just before returning.  */
8206   old_deactivate_mark = Vdeactivate_mark;
8207
8208   coding->src_object = src_object;
8209   coding->src_chars = chars;
8210   coding->src_bytes = bytes;
       /* More bytes than characters marks the source as multibyte.  */
8211   coding->src_multibyte = chars < bytes;
8212
8213   attrs = CODING_ID_ATTRS (coding->id);
8214
       /* Encoding in place.  Flag the markers sitting exactly at an edge
          of the region (FROM for insertion-type markers, TO for the
          others); they are repositioned by hand at the end.
          NOTE(review): unlike decode_coding_object, this test does not
          check BUFFERP (src_object) and walks the markers of
          current_buffer -- confirm callers make SRC_OBJECT current.  */
8215   if (EQ (src_object, dst_object))
8216     {
8217       struct Lisp_Marker *tail;
8218
8219       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8220         {
8221           tail->need_adjustment
8222             = tail->charpos == (tail->insertion_type ? from : to);
8223           need_marker_adjustment |= tail->need_adjustment;
8224         }
8225     }
8226
8227   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8228     {
           /* Copy the source text into the buffer returned by
              code_conversion_save (1, ...) so that the pre-write
              function can edit it.  */
8229       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8230       set_buffer_internal (XBUFFER (coding->src_object));
8231       if (STRINGP (src_object))
8232         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8233       else if (BUFFERP (src_object))
8234         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8235       else
8236         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8237
           /* When encoding in place, delete the original text now,
              remembering point, then return to the work buffer.  */
8238       if (EQ (src_object, dst_object))
8239         {
8240           set_buffer_internal (XBUFFER (src_object));
8241           saved_pt = PT, saved_pt_byte = PT_BYTE;
8242           del_range_both (from, from_byte, to, to_byte, 1);
8243           set_buffer_internal (XBUFFER (coding->src_object));
8244         }
8245
8246       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8247                   make_number (BEG), make_number (Z));
           /* If the function left a different buffer current, that
              buffer becomes the source and is killed once encoding is
              done.  The whole of the current buffer is encoded, with
              the gap moved to BEG.  */
8248       if (XBUFFER (coding->src_object) != current_buffer)
8249         kill_src_buffer = 1;
8250       coding->src_object = Fcurrent_buffer ();
8251       if (BEG != GPT)
8252         move_gap_both (BEG, BEG_BYTE);
8253       coding->src_chars = Z - BEG;
8254       coding->src_bytes = Z_BYTE - BEG_BYTE;
8255       coding->src_pos = BEG;
8256       coding->src_pos_byte = BEG_BYTE;
8257       coding->src_multibyte = Z < Z_BYTE;
8258     }
8259   else if (STRINGP (src_object))
8260     {
8261       code_conversion_save (0, 0);
8262       coding->src_pos = from;
8263       coding->src_pos_byte = from_byte;
8264     }
8265   else if (BUFFERP (src_object))
8266     {
8267       code_conversion_save (0, 0);
8268       set_buffer_internal (XBUFFER (src_object));
8269       if (EQ (src_object, dst_object))
8270         {
               /* In place without a pre-write function: remember point,
                  delete the region, and use the value returned by
                  del_range_1 as the source, read from position 0.  */
8271           saved_pt = PT, saved_pt_byte = PT_BYTE;
8272           coding->src_object = del_range_1 (from, to, 1, 1);
8273           coding->src_pos = 0;
8274           coding->src_pos_byte = 0;
8275         }
8276       else
8277         {
               /* Move the gap to FROM when it lies after FROM and not
                  beyond TO.  */
8278           if (from < GPT && to >= GPT)
8279             move_gap_both (from, from_byte);
8280           coding->src_pos = from;
8281           coding->src_pos_byte = from_byte;
8282         }
8283     }
8284   else
8285     {
8286       code_conversion_save (0, 0);
8287       coding->src_pos = from;
8288       coding->src_pos_byte = from_byte;
8289     }
8290
8291   if (BUFFERP (dst_object))
8292     {
8293       coding->dst_object = dst_object;
8294       if (EQ (src_object, dst_object))
8295         {
8296           coding->dst_pos = from;
8297           coding->dst_pos_byte = from_byte;
8298         }
8299       else
8300         {
8301           struct buffer *current = current_buffer;
8302
               /* Temporarily switch to the destination buffer to read
                  its point and to move its gap there.  */
8303           set_buffer_temp (XBUFFER (dst_object));
8304           coding->dst_pos = PT;
8305           coding->dst_pos_byte = PT_BYTE;
8306           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8307           set_buffer_temp (current);
8308         }
8309       coding->dst_multibyte
8310         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8311     }
8312   else if (EQ (dst_object, Qt))
8313     {
           /* String result: start with a raw destination area of one
              byte per source character, and at least one byte.  */
8314       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8315       coding->dst_object = Qnil;
8316       coding->destination = xmalloc (dst_bytes);
8317       coding->dst_bytes = dst_bytes;
8318       coding->dst_multibyte = 0;
8319     }
8320   else
8321     {
8322       coding->dst_object = Qnil;
8323       coding->dst_multibyte = 0;
8324     }
8325
8326   encode_coding (coding);
8327
8328   if (EQ (dst_object, Qt))
8329     {
8330       if (BUFFERP (coding->dst_object))
8331         coding->dst_object = Fbuffer_string ();
8332       else if (coding->raw_destination)
8333         /* This is used to avoid creating huge Lisp string.
8334            NOTE: caller who sets `raw_destination' is also
8335            responsible for freeing `destination' buffer.  */
8336         coding->dst_object = Qnil;
8337       else
8338         {
               /* Wrap the encoded bytes in a unibyte string and release
                  the raw area allocated above.  */
8339           coding->dst_object
8340             = make_unibyte_string ((char *) coding->destination,
8341                                    coding->produced);
8342           xfree (coding->destination);
8343         }
8344     }
8345
8346   if (saved_pt >= 0)
8347     {
8348       /* This is the case of:
8349          (BUFFERP (src_object) && EQ (src_object, dst_object))
8350          As we have moved PT while replacing the original buffer
8351          contents, we must recover it now.  */
8352       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is kept as is; point inside the
              region goes to FROM; point after it is shifted by the size
              change.  In a unibyte buffer the byte delta is used for
              the character position as well.  */
8353       if (saved_pt < from)
8354         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8355       else if (saved_pt < from + chars)
8356         TEMP_SET_PT_BOTH (from, from_byte);
8357       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8358         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8359                           saved_pt_byte + (coding->produced - bytes));
8360       else
8361         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8362                           saved_pt_byte + (coding->produced - bytes));
8363
           /* Reposition the markers flagged on entry: insertion-type
              markers go to FROM, the others to the end of the encoded
              text.  */
8364       if (need_marker_adjustment)
8365         {
8366           struct Lisp_Marker *tail;
8367
8368           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8369             if (tail->need_adjustment)
8370               {
8371                 tail->need_adjustment = 0;
8372                 if (tail->insertion_type)
8373                   {
8374                     tail->bytepos = from_byte;
8375                     tail->charpos = from;
8376                   }
8377                 else
8378                   {
8379                     tail->bytepos = from_byte + coding->produced;
8380                     tail->charpos
8381                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8382                          ? tail->bytepos : from + coding->produced_char);
8383                   }
8384               }
8385         }
8386     }
8387
       /* Dispose of the buffer the pre-write function switched to.  */
8388   if (kill_src_buffer)
8389     Fkill_buffer (coding->src_object);
8390
8391   Vdeactivate_mark = old_deactivate_mark;
8392   unbind_to (count, Qnil);
8393 }
8394
8395
8396 Lisp_Object
8397 preferred_coding_system (void)
8398 {
8399   int id = coding_categories[coding_priorities[0]].id;
8400
8401   return CODING_ID_NAME (id);
8402 }
8403
8404 #if defined (WINDOWSNT) || defined (CYGWIN)
8405
8406 Lisp_Object
8407 from_unicode (Lisp_Object str)
8408 {
8409   CHECK_STRING (str);
8410   if (!STRING_MULTIBYTE (str) &&
8411       SBYTES (str) & 1)
8412     {
8413       str = Fsubstring (str, make_number (0), make_number (-1));
8414     }
8415
8416   return code_convert_string_norecord (str, Qutf_16le, 0);
8417 }
8418
8419 Lisp_Object
8420 from_unicode_buffer (const wchar_t *wstr)
8421 {
8422     return from_unicode (
8423         make_unibyte_string (
8424             (char *) wstr,
8425             /* we get one of the two final 0 bytes for free. */
8426             1 + sizeof (wchar_t) * wcslen (wstr)));
8427 }
8428
8429 wchar_t *
8430 to_unicode (Lisp_Object str, Lisp_Object *buf)
8431 {
8432   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8433   /* We need to make another copy (in addition to the one made by
8434      code_convert_string_norecord) to ensure that the final string is
8435      _doubly_ zero terminated --- that is, that the string is
8436      terminated by two zero bytes and one utf-16le null character.
8437      Because strings are already terminated with a single zero byte,
8438      we just add one additional zero. */
8439   str = make_uninit_string (SBYTES (*buf) + 1);
8440   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8441   SDATA (str) [SBYTES (*buf)] = '\0';
8442   *buf = str;
8443   return WCSDATA (*buf);
8444 }
8445
8446 #endif /* WINDOWSNT || CYGWIN */
8447
8448 \f
8449 #ifdef emacs
8450 /*** 11. Emacs Lisp library functions ***/
8451
8452 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8453        doc: /* Return t if OBJECT is nil or a coding-system.
8454 See the documentation of `define-coding-system' for information
8455 about coding-system objects.  */)
8456   (Lisp_Object object)
8457 {
8458   if (NILP (object)
8459       || CODING_SYSTEM_ID (object) >= 0)
8460     return Qt;
8461   if (! SYMBOLP (object)
8462       || NILP (Fget (object, Qcoding_system_define_form)))
8463     return Qnil;
8464   return Qt;
8465 }
8466
8467 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8468        Sread_non_nil_coding_system, 1, 1, 0,
8469        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8470   (Lisp_Object prompt)
8471 {
8472   Lisp_Object val;
8473   do
8474     {
8475       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8476                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8477     }
8478   while (SCHARS (val) == 0);
8479   return (Fintern (val, Qnil));
8480 }
8481
8482 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8483        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8484 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8485 Ignores case when completing coding systems (all Emacs coding systems
8486 are lower-case).  */)
8487   (Lisp_Object prompt, Lisp_Object default_coding_system)
8488 {
8489   Lisp_Object val;
8490   ptrdiff_t count = SPECPDL_INDEX ();
8491
8492   if (SYMBOLP (default_coding_system))
8493     default_coding_system = SYMBOL_NAME (default_coding_system);
8494   specbind (Qcompletion_ignore_case, Qt);
8495   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8496                           Qt, Qnil, Qcoding_system_history,
8497                           default_coding_system, Qnil);
8498   unbind_to (count, Qnil);
8499   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8500 }
8501
8502 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8503        1, 1, 0,
8504        doc: /* Check validity of CODING-SYSTEM.
8505 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8506 It is valid if it is nil or a symbol defined as a coding system by the
8507 function `define-coding-system'.  */)
8508   (Lisp_Object coding_system)
8509 {
8510   Lisp_Object define_form;
8511
8512   define_form = Fget (coding_system, Qcoding_system_define_form);
8513   if (! NILP (define_form))
8514     {
8515       Fput (coding_system, Qcoding_system_define_form, Qnil);
8516       safe_eval (define_form);
8517     }
8518   if (!NILP (Fcoding_system_p (coding_system)))
8519     return coding_system;
8520   xsignal1 (Qcoding_system_error, coding_system);
8521 }
8522
8523 \f
8524 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8525    HIGHEST, return the coding system of the highest
8526    priority among the detected coding systems.  Otherwise return a
8527    list of detected coding systems sorted by their priorities.  If
8528    MULTIBYTEP, it is assumed that the bytes are in correct
8529    multibyte form but contains only ASCII and eight-bit chars.
8530    Otherwise, the bytes are raw bytes.
8531
8532    CODING-SYSTEM controls the detection as below:
8533
8534    If it is nil, detect both text-format and eol-format.  If the
8535    text-format part of CODING-SYSTEM is already specified
8536    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8537    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8538    detect only text-format.
        
        When HIGHEST is set and nothing was detected, the value is
        Qnil.  */
8539
8540 Lisp_Object
8541 detect_coding_system (const unsigned char *src,
8542                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8543                       bool highest, bool multibytep,
8544                       Lisp_Object coding_system)
8545 {
8546   const unsigned char *src_end = src + src_bytes;
8547   Lisp_Object attrs, eol_type;
8548   Lisp_Object val = Qnil;
8549   struct coding_system coding;
8550   ptrdiff_t id;
8551   struct coding_detection_info detect_info;
8552   enum coding_category base_category;
8553   bool null_byte_found = 0, eight_bit_found = 0;
8554
       /* A nil CODING_SYSTEM is treated as Qundecided.  */
8555   if (NILP (coding_system))
8556     coding_system = Qundecided;
8557   setup_coding_system (coding_system, &coding);
8558   attrs = CODING_ID_ATTRS (coding.id);
8559   eol_type = CODING_ID_EOL_TYPE (coding.id);
8560   coding_system = CODING_ATTR_BASE_NAME (attrs);
8561
       /* Describe the whole of SRC as one final block of input.  */
8562   coding.source = src;
8563   coding.src_chars = src_chars;
8564   coding.src_bytes = src_bytes;
8565   coding.src_multibyte = multibytep;
8566   coding.consumed = 0;
8567   coding.mode |= CODING_MODE_LAST_BLOCK;
8568   coding.head_ascii = 0;
8569
       /* Each of these is a bit mask indexed by coding category.  */
8570   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8571
8572   /* At first, detect text-format if necessary.  */
8573   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8574   if (base_category == coding_category_undecided)
8575     {
8576       enum coding_category category IF_LINT (= 0);
8577       struct coding_system *this IF_LINT (= NULL);
8578       int c, i;
8579       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8580                                        inhibit_null_byte_detection);
8581       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8582                                        inhibit_iso_escape_detection);
8583       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8584
8585       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8586       for (; src < src_end; src++)
8587         {
8588           c = *src;
8589           if (c & 0x80)
8590             {
                   /* Stop scanning once both an eight-bit byte and a
                      null byte have been seen.  */
8591               eight_bit_found = 1;
8592               if (null_byte_found)
8593                 break;
8594             }
8595           else if (c < 0x20)
8596             {
                   /* The ISO-2022 detector is tried at most once, on
                      the first ESC, SI or SO, unless escape detection
                      is inhibited.  */
8597               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8598                   && ! inhibit_ied
8599                   && ! detect_info.checked)
8600                 {
8601                   if (detect_coding_iso_2022 (&coding, &detect_info))
8602                     {
8603                       /* We have scanned the whole data.  */
8604                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8605                         {
8606                           /* We didn't find an 8-bit code.  We may
8607                              have found a null-byte, but it's very
8608                              rare that a binary file conform to
8609                              ISO-2022.  */
8610                           src = src_end;
8611                           coding.head_ascii = src - coding.source;
8612                         }
                           /* Only the ISO escape categories remain as
                              candidates.  */
8613                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8614                       break;
8615                     }
8616                 }
8617               else if (! c && !inhibit_nbd)
8618                 {
8619                   null_byte_found = 1;
8620                   if (eight_bit_found)
8621                     break;
8622                 }
                   /* head_ascii counts the bytes scanned before the
                      first eight-bit byte.  */
8623               if (! eight_bit_found)
8624                 coding.head_ascii++;
8625             }
8626           else if (! eight_bit_found)
8627             coding.head_ascii++;
8628         }
8629
8630       if (null_byte_found || eight_bit_found
8631           || coding.head_ascii < coding.src_bytes
8632           || detect_info.found)
8633         {
8634           if (coding.head_ascii == coding.src_bytes)
8635             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8636             for (i = 0; i < coding_category_raw_text; i++)
8637               {
8638                 category = coding_priorities[i];
8639                 this = coding_categories + category;
8640                 if (detect_info.found & (1 << category))
8641                   break;
8642               }
8643           else
8644             {
8645               if (null_byte_found)
8646                 {
                       /* With a null byte, mark every category other
                          than the UTF-16 ones as checked and rejected.  */
8647                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8648                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8649                 }
8650               else if (prefer_utf_8
8651                        && detect_coding_utf_8 (&coding, &detect_info))
8652                 {
                       /* UTF-8 is preferred and its detector succeeded:
                          mark every category other than the UTF-8 ones
                          as checked and rejected.  */
8653                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8654                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8655                 }
                   /* Visit the categories in priority order, running
                      the detector of each one not checked yet.  With
                      HIGHEST, stop at the first category found.  */
8656               for (i = 0; i < coding_category_raw_text; i++)
8657                 {
8658                   category = coding_priorities[i];
8659                   this = coding_categories + category;
8660
8661                   if (this->id < 0)
8662                     {
8663                       /* No coding system of this category is defined.  */
8664                       detect_info.rejected |= (1 << category);
8665                     }
8666                   else if (category >= coding_category_raw_text)
8667                     continue;
8668                   else if (detect_info.checked & (1 << category))
8669                     {
8670                       if (highest
8671                           && (detect_info.found & (1 << category)))
8672                         break;
8673                     }
8674                   else if ((*(this->detector)) (&coding, &detect_info)
8675                            && highest
8676                            && (detect_info.found & (1 << category)))
8677                     {
                           /* Replace utf-16-auto by the category of the
                              byte order that was found.  */
8678                       if (category == coding_category_utf_16_auto)
8679                         {
8680                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8681                             category = coding_category_utf_16_le;
8682                           else
8683                             category = coding_category_utf_16_be;
8684                         }
8685                       break;
8686                     }
8687                 }
8688             }
8689         }
8690
           /* Turn the detection masks into VAL, a list of coding system
              ids (converted to names further below).  */
8691       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8692           || null_byte_found)
8693         {
               /* Every category was rejected, or a null byte was seen:
                  report Qno_conversion.  */
8694           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8695           id = CODING_SYSTEM_ID (Qno_conversion);
8696           val = list1 (make_number (id));
8697         }
8698       else if (! detect_info.rejected && ! detect_info.found)
8699         {
               /* Nothing found and nothing rejected: report the coding
                  system of the undecided category.  */
8700           detect_info.found = CATEGORY_MASK_ANY;
8701           id = coding_categories[coding_category_undecided].id;
8702           val = list1 (make_number (id));
8703         }
8704       else if (highest)
8705         {
8706           if (detect_info.found)
8707             {
                   /* CATEGORY and THIS are those at which the loops
                      above stopped.  */
8708               detect_info.found = 1 << category;
8709               val = list1 (make_number (this->id));
8710             }
8711           else
                 /* Nothing was positively found: take the first
                    category, in priority order, that was not rejected.  */
8712             for (i = 0; i < coding_category_raw_text; i++)
8713               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8714                 {
8715                   detect_info.found = 1 << coding_priorities[i];
8716                   id = coding_categories[coding_priorities[i]].id;
8717                   val = list1 (make_number (id));
8718                   break;
8719                 }
8720         }
8721       else
8722         {
8723           int mask = detect_info.rejected | detect_info.found;
8724           int found = 0;
8725
               /* Walk from the lowest priority to the highest.  First
                  the categories neither rejected nor found; VAL is
                  replaced on every hit, so it ends up holding only the
                  highest-priority such category whose id is not
                  negative.  */
8726           for (i = coding_category_raw_text - 1; i >= 0; i--)
8727             {
8728               category = coding_priorities[i];
8729               if (! (mask & (1 << category)))
8730                 {
8731                   found |= 1 << category;
8732                   id = coding_categories[category].id;
8733                   if (id >= 0)
8734                     val = list1 (make_number (id));
8735                 }
8736             }
               /* Then push the categories actually found in front, so
                  that the highest-priority one comes first.  */
8737           for (i = coding_category_raw_text - 1; i >= 0; i--)
8738             {
8739               category = coding_priorities[i];
8740               if (detect_info.found & (1 << category))
8741                 {
8742                   id = coding_categories[category].id;
8743                   val = Fcons (make_number (id), val);
8744                 }
8745             }
8746           detect_info.found |= found;
8747         }
8748     }
8749   else if (base_category == coding_category_utf_8_auto)
8750     {
           /* Only decide between UTF-8 with and without signature.  */
8751       if (detect_coding_utf_8 (&coding, &detect_info))
8752         {
8753           struct coding_system *this;
8754
8755           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8756             this = coding_categories + coding_category_utf_8_sig;
8757           else
8758             this = coding_categories + coding_category_utf_8_nosig;
8759           val = list1 (make_number (this->id));
8760         }
8761     }
8762   else if (base_category == coding_category_utf_16_auto)
8763     {
           /* Only decide which UTF-16 variant applies.  */
8764       if (detect_coding_utf_16 (&coding, &detect_info))
8765         {
8766           struct coding_system *this;
8767
8768           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8769             this = coding_categories + coding_category_utf_16_le;
8770           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8771             this = coding_categories + coding_category_utf_16_be;
8772           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8773             this = coding_categories + coding_category_utf_16_be_nosig;
8774           else
8775             this = coding_categories + coding_category_utf_16_le_nosig;
8776           val = list1 (make_number (this->id));
8777         }
8778     }
8779   else
8780     {
           /* The text format is already specified: report it as is.  */
8781       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8782       val = list1 (make_number (coding.id));
8783     }
8784
8785   /* Then, detect eol-format if necessary.  */
8786   {
8787     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8788     Lisp_Object tail;
8789
         /* A vector EOL_TYPE means the eol format must be detected,
            separately for the non-UTF-16 categories and for each
            UTF-16 byte order that was found.  */
8790     if (VECTORP (eol_type))
8791       {
8792         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8793           {
8794             if (null_byte_found)
8795               normal_eol = EOL_SEEN_LF;
8796             else
8797               normal_eol = detect_eol (coding.source, src_bytes,
8798                                        coding_category_raw_text);
8799           }
8800         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8801                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8802           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8803                                       coding_category_utf_16_be);
8804         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8805                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8806           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8807                                       coding_category_utf_16_le);
8808       }
8809     else
8810       {
             /* The eol format is already specified: use it for every
                category.  */
8811         if (EQ (eol_type, Qunix))
8812           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8813         else if (EQ (eol_type, Qdos))
8814           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8815         else
8816           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8817       }
8818
         /* Replace each id in VAL by a coding system: the element of the
            candidate's eol-type vector selected by the detected eol
            format (index 0 for LF, 1 for CRLF, 2 for CR), or
            CODING_ID_NAME of the id when there is no such vector or no
            single format was seen.  */
8819     for (tail = val; CONSP (tail); tail = XCDR (tail))
8820       {
8821         enum coding_category category;
8822         int this_eol;
8823
8824         id = XINT (XCAR (tail));
8825         attrs = CODING_ID_ATTRS (id);
8826         category = XINT (CODING_ATTR_CATEGORY (attrs));
8827         eol_type = CODING_ID_EOL_TYPE (id);
8828         if (VECTORP (eol_type))
8829           {
8830             if (category == coding_category_utf_16_be
8831                 || category == coding_category_utf_16_be_nosig)
8832               this_eol = utf_16_be_eol;
8833             else if (category == coding_category_utf_16_le
8834                      || category == coding_category_utf_16_le_nosig)
8835               this_eol = utf_16_le_eol;
8836             else
8837               this_eol = normal_eol;
8838
8839             if (this_eol == EOL_SEEN_LF)
8840               XSETCAR (tail, AREF (eol_type, 0));
8841             else if (this_eol == EOL_SEEN_CRLF)
8842               XSETCAR (tail, AREF (eol_type, 1));
8843             else if (this_eol == EOL_SEEN_CR)
8844               XSETCAR (tail, AREF (eol_type, 2));
8845             else
8846               XSETCAR (tail, CODING_ID_NAME (id));
8847           }
8848         else
8849           XSETCAR (tail, CODING_ID_NAME (id));
8850       }
8851   }
8852
       /* With HIGHEST, return only the first element of VAL, or Qnil
          when VAL is empty.  */
8853   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8854 }
8855
8856
8857 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8858        2, 3, 0,
8859        doc: /* Detect coding system of the text in the region between START and END.
8860 Return a list of possible coding systems ordered by priority.
8861 The coding systems to try and their priorities follows what
8862 the function `coding-system-priority-list' (which see) returns.
8863
8864 If only ASCII characters are found (except for such ISO-2022 control
8865 characters as ESC), it returns a list of single element `undecided'
8866 or its subsidiary coding system according to a detected end-of-line
8867 format.
8868
8869 If optional argument HIGHEST is non-nil, return the coding system of
8870 highest priority.  */)
8871   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8872 {
8873   ptrdiff_t from, to;
8874   ptrdiff_t from_byte, to_byte;
8875
8876   validate_region (&start, &end);
8877   from = XINT (start), to = XINT (end);
8878   from_byte = CHAR_TO_BYTE (from);
8879   to_byte = CHAR_TO_BYTE (to);
8880
8881   if (from < GPT && to >= GPT)
8882     move_gap_both (to, to_byte);
8883
8884   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8885                                to - from, to_byte - from_byte,
8886                                !NILP (highest),
8887                                !NILP (BVAR (current_buffer
8888                                       , enable_multibyte_characters)),
8889                                Qnil);
8890 }
8891
8892 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8893        1, 2, 0,
8894        doc: /* Detect coding system of the text in STRING.
8895 Return a list of possible coding systems ordered by priority.
8896 The coding systems to try and their priorities follows what
8897 the function `coding-system-priority-list' (which see) returns.
8898
8899 If only ASCII characters are found (except for such ISO-2022 control
8900 characters as ESC), it returns a list of single element `undecided'
8901 or its subsidiary coding system according to a detected end-of-line
8902 format.
8903
8904 If optional argument HIGHEST is non-nil, return the coding system of
8905 highest priority.  */)
8906   (Lisp_Object string, Lisp_Object highest)
8907 {
8908   CHECK_STRING (string);
8909
8910   return detect_coding_system (SDATA (string),
8911                                SCHARS (string), SBYTES (string),
8912                                !NILP (highest), STRING_MULTIBYTE (string),
8913                                Qnil);
8914 }
8915
8916
8917 static bool
8918 char_encodable_p (int c, Lisp_Object attrs)
8919 {
8920   Lisp_Object tail;
8921   struct charset *charset;
8922   Lisp_Object translation_table;
8923
8924   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8925   if (! NILP (translation_table))
8926     c = translate_char (translation_table, c);
8927   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8928        CONSP (tail); tail = XCDR (tail))
8929     {
8930       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8931       if (CHAR_CHARSET_P (c, charset))
8932         break;
8933     }
8934   return (! NILP (tail));
8935 }
8936
8937
8938 /* Return a list of coding systems that safely encode the text between
8939    START and END.  If EXCLUDE is non-nil, it is a list of coding
8940    systems not to check.  The returned list doesn't contain any such
8941    coding systems.  In any case, if the text contains only ASCII or is
8942    unibyte, return t.  */
8943
8944 DEFUN ("find-coding-systems-region-internal",
8945        Ffind_coding_systems_region_internal,
8946        Sfind_coding_systems_region_internal, 2, 3, 0,
8947        doc: /* Internal use only.  */)
8948   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8949 {
8950   Lisp_Object coding_attrs_list, safe_codings;
8951   ptrdiff_t start_byte, end_byte;
8952   const unsigned char *p, *pbeg, *pend;
8953   int c;
8954   Lisp_Object tail, elt, work_table;
8955
8956   if (STRINGP (start))
8957     {
8958       if (!STRING_MULTIBYTE (start)
8959           || SCHARS (start) == SBYTES (start))
8960         return Qt;
8961       start_byte = 0;
8962       end_byte = SBYTES (start);
8963     }
8964   else
8965     {
8966       CHECK_NUMBER_COERCE_MARKER (start);
8967       CHECK_NUMBER_COERCE_MARKER (end);
8968       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8969         args_out_of_range (start, end);
8970       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8971         return Qt;
8972       start_byte = CHAR_TO_BYTE (XINT (start));
8973       end_byte = CHAR_TO_BYTE (XINT (end));
8974       if (XINT (end) - XINT (start) == end_byte - start_byte)
8975         return Qt;
8976
8977       if (XINT (start) < GPT && XINT (end) > GPT)
8978         {
8979           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8980             move_gap_both (XINT (start), start_byte);
8981           else
8982             move_gap_both (XINT (end), end_byte);
8983         }
8984     }
8985
8986   coding_attrs_list = Qnil;
8987   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8988     if (NILP (exclude)
8989         || NILP (Fmemq (XCAR (tail), exclude)))
8990       {
8991         Lisp_Object attrs;
8992
8993         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8994         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8995           {
8996             ASET (attrs, coding_attr_trans_tbl,
8997                   get_translation_table (attrs, 1, NULL));
8998             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8999           }
9000       }
9001
9002   if (STRINGP (start))
9003     p = pbeg = SDATA (start);
9004   else
9005     p = pbeg = BYTE_POS_ADDR (start_byte);
9006   pend = p + (end_byte - start_byte);
9007
9008   while (p < pend && ASCII_CHAR_P (*p)) p++;
9009   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9010
9011   work_table = Fmake_char_table (Qnil, Qnil);
9012   while (p < pend)
9013     {
9014       if (ASCII_CHAR_P (*p))
9015         p++;
9016       else
9017         {
9018           c = STRING_CHAR_ADVANCE (p);
9019           if (!NILP (char_table_ref (work_table, c)))
9020             /* This character was already checked.  Ignore it.  */
9021             continue;
9022
9023           charset_map_loaded = 0;
9024           for (tail = coding_attrs_list; CONSP (tail);)
9025             {
9026               elt = XCAR (tail);
9027               if (NILP (elt))
9028                 tail = XCDR (tail);
9029               else if (char_encodable_p (c, elt))
9030                 tail = XCDR (tail);
9031               else if (CONSP (XCDR (tail)))
9032                 {
9033                   XSETCAR (tail, XCAR (XCDR (tail)));
9034                   XSETCDR (tail, XCDR (XCDR (tail)));
9035                 }
9036               else
9037                 {
9038                   XSETCAR (tail, Qnil);
9039                   tail = XCDR (tail);
9040                 }
9041             }
9042           if (charset_map_loaded)
9043             {
9044               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9045
9046               if (STRINGP (start))
9047                 pbeg = SDATA (start);
9048               else
9049                 pbeg = BYTE_POS_ADDR (start_byte);
9050               p = pbeg + p_offset;
9051               pend = pbeg + pend_offset;
9052             }
9053           char_table_set (work_table, c, Qt);
9054         }
9055     }
9056
9057   safe_codings = list2 (Qraw_text, Qno_conversion);
9058   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9059     if (! NILP (XCAR (tail)))
9060       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9061
9062   return safe_codings;
9063 }
9064
9065
9066 DEFUN ("unencodable-char-position", Funencodable_char_position,
9067        Sunencodable_char_position, 3, 5, 0,
9068        doc: /* Return position of first un-encodable character in a region.
9069 START and END specify the region and CODING-SYSTEM specifies the
9070 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9071
9072 If optional 4th argument COUNT is non-nil, it specifies at most how
9073 many un-encodable characters to search.  In this case, the value is a
9074 list of positions.
9075
9076 If optional 5th argument STRING is non-nil, it is a string to search
9077 for un-encodable characters.  In that case, START and END are indexes
9078 to the string and treated as in `substring'.  */)
9079   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9080    Lisp_Object count, Lisp_Object string)
9081 {
9082   EMACS_INT n;
9083   struct coding_system coding;
9084   Lisp_Object attrs, charset_list, translation_table;
9085   Lisp_Object positions;
9086   ptrdiff_t from, to;
9087   const unsigned char *p, *stop, *pend;
9088   bool ascii_compatible;
9089
9090   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9091   attrs = CODING_ID_ATTRS (coding.id);
9092   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9093     return Qnil;
9094   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9095   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9096   translation_table = get_translation_table (attrs, 1, NULL);
9097
9098   if (NILP (string))
9099     {
9100       validate_region (&start, &end);
9101       from = XINT (start);
9102       to = XINT (end);
9103       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9104           || (ascii_compatible
9105               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9106         return Qnil;
9107       p = CHAR_POS_ADDR (from);
9108       pend = CHAR_POS_ADDR (to);
9109       if (from < GPT && to >= GPT)
9110         stop = GPT_ADDR;
9111       else
9112         stop = pend;
9113     }
9114   else
9115     {
9116       CHECK_STRING (string);
9117       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9118       if (! STRING_MULTIBYTE (string))
9119         return Qnil;
9120       p = SDATA (string) + string_char_to_byte (string, from);
9121       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9122       if (ascii_compatible && (to - from) == (pend - p))
9123         return Qnil;
9124     }
9125
9126   if (NILP (count))
9127     n = 1;
9128   else
9129     {
9130       CHECK_NATNUM (count);
9131       n = XINT (count);
9132     }
9133
9134   positions = Qnil;
9135   charset_map_loaded = 0;
9136   while (1)
9137     {
9138       int c;
9139
9140       if (ascii_compatible)
9141         while (p < stop && ASCII_CHAR_P (*p))
9142           p++, from++;
9143       if (p >= stop)
9144         {
9145           if (p >= pend)
9146             break;
9147           stop = pend;
9148           p = GAP_END_ADDR;
9149         }
9150
9151       c = STRING_CHAR_ADVANCE (p);
9152       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9153           && ! char_charset (translate_char (translation_table, c),
9154                              charset_list, NULL))
9155         {
9156           positions = Fcons (make_number (from), positions);
9157           n--;
9158           if (n == 0)
9159             break;
9160         }
9161
9162       from++;
9163       if (charset_map_loaded && NILP (string))
9164         {
9165           p = CHAR_POS_ADDR (from);
9166           pend = CHAR_POS_ADDR (to);
9167           if (from < GPT && to >= GPT)
9168             stop = GPT_ADDR;
9169           else
9170             stop = pend;
9171           charset_map_loaded = 0;
9172         }
9173     }
9174
9175   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9176 }
9177
9178
9179 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9180        Scheck_coding_systems_region, 3, 3, 0,
9181        doc: /* Check if the region is encodable by coding systems.
9182
9183 START and END are buffer positions specifying the region.
9184 CODING-SYSTEM-LIST is a list of coding systems to check.
9185
9186 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9187 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9188 whole region, POS0, POS1, ... are buffer positions where non-encodable
9189 characters are found.
9190
9191 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9192 value is nil.
9193
9194 START may be a string.  In that case, check if the string is
9195 encodable, and the value contains indices to the string instead of
9196 buffer positions.  END is ignored.
9197
9198 If the current buffer (or START if it is a string) is unibyte, the value
9199 is nil.  */)
9200   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9201 {
9202   Lisp_Object list;
9203   ptrdiff_t start_byte, end_byte;
9204   ptrdiff_t pos;
9205   const unsigned char *p, *pbeg, *pend;
9206   int c;
9207   Lisp_Object tail, elt, attrs;
9208
9209   if (STRINGP (start))
9210     {
9211       if (!STRING_MULTIBYTE (start)
9212           || SCHARS (start) == SBYTES (start))
9213         return Qnil;
9214       start_byte = 0;
9215       end_byte = SBYTES (start);
9216       pos = 0;
9217     }
9218   else
9219     {
9220       CHECK_NUMBER_COERCE_MARKER (start);
9221       CHECK_NUMBER_COERCE_MARKER (end);
9222       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9223         args_out_of_range (start, end);
9224       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9225         return Qnil;
9226       start_byte = CHAR_TO_BYTE (XINT (start));
9227       end_byte = CHAR_TO_BYTE (XINT (end));
9228       if (XINT (end) - XINT (start) == end_byte - start_byte)
9229         return Qnil;
9230
9231       if (XINT (start) < GPT && XINT (end) > GPT)
9232         {
9233           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9234             move_gap_both (XINT (start), start_byte);
9235           else
9236             move_gap_both (XINT (end), end_byte);
9237         }
9238       pos = XINT (start);
9239     }
9240
9241   list = Qnil;
9242   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9243     {
9244       elt = XCAR (tail);
9245       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9246       ASET (attrs, coding_attr_trans_tbl,
9247             get_translation_table (attrs, 1, NULL));
9248       list = Fcons (list2 (elt, attrs), list);
9249     }
9250
9251   if (STRINGP (start))
9252     p = pbeg = SDATA (start);
9253   else
9254     p = pbeg = BYTE_POS_ADDR (start_byte);
9255   pend = p + (end_byte - start_byte);
9256
9257   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9258   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9259
9260   while (p < pend)
9261     {
9262       if (ASCII_CHAR_P (*p))
9263         p++;
9264       else
9265         {
9266           c = STRING_CHAR_ADVANCE (p);
9267
9268           charset_map_loaded = 0;
9269           for (tail = list; CONSP (tail); tail = XCDR (tail))
9270             {
9271               elt = XCDR (XCAR (tail));
9272               if (! char_encodable_p (c, XCAR (elt)))
9273                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9274             }
9275           if (charset_map_loaded)
9276             {
9277               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9278
9279               if (STRINGP (start))
9280                 pbeg = SDATA (start);
9281               else
9282                 pbeg = BYTE_POS_ADDR (start_byte);
9283               p = pbeg + p_offset;
9284               pend = pbeg + pend_offset;
9285             }
9286         }
9287       pos++;
9288     }
9289
9290   tail = list;
9291   list = Qnil;
9292   for (; CONSP (tail); tail = XCDR (tail))
9293     {
9294       elt = XCAR (tail);
9295       if (CONSP (XCDR (XCDR (elt))))
9296         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9297                       list);
9298     }
9299
9300   return list;
9301 }
9302
9303
9304 static Lisp_Object
9305 code_convert_region (Lisp_Object start, Lisp_Object end,
9306                      Lisp_Object coding_system, Lisp_Object dst_object,
9307                      bool encodep, bool norecord)
9308 {
9309   struct coding_system coding;
9310   ptrdiff_t from, from_byte, to, to_byte;
9311   Lisp_Object src_object;
9312
9313   if (NILP (coding_system))
9314     coding_system = Qno_conversion;
9315   else
9316     CHECK_CODING_SYSTEM (coding_system);
9317   src_object = Fcurrent_buffer ();
9318   if (NILP (dst_object))
9319     dst_object = src_object;
9320   else if (! EQ (dst_object, Qt))
9321     CHECK_BUFFER (dst_object);
9322
9323   validate_region (&start, &end);
9324   from = XFASTINT (start);
9325   from_byte = CHAR_TO_BYTE (from);
9326   to = XFASTINT (end);
9327   to_byte = CHAR_TO_BYTE (to);
9328
9329   setup_coding_system (coding_system, &coding);
9330   coding.mode |= CODING_MODE_LAST_BLOCK;
9331
9332   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9333     {
9334       struct buffer *buf = XBUFFER (dst_object);
9335       ptrdiff_t buf_pt = BUF_PT (buf);
9336
9337       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9338     }
9339
9340   if (encodep)
9341     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9342                           dst_object);
9343   else
9344     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9345                           dst_object);
9346   if (! norecord)
9347     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9348
9349   return (BUFFERP (dst_object)
9350           ? make_number (coding.produced_char)
9351           : coding.dst_object);
9352 }
9353
9354
9355 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9356        3, 4, "r\nzCoding system: ",
9357        doc: /* Decode the current region from the specified coding system.
9358 When called from a program, takes four arguments:
9359         START, END, CODING-SYSTEM, and DESTINATION.
9360 START and END are buffer positions.
9361
9362 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9363 If nil, the region between START and END is replaced by the decoded text.
9364 If buffer, the decoded text is inserted in that buffer after point (point
9365 does not move).
9366 In those cases, the length of the decoded text is returned.
9367 If DESTINATION is t, the decoded text is returned.
9368
9369 This function sets `last-coding-system-used' to the precise coding system
9370 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9371 not fully specified.)  */)
9372   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9373 {
9374   return code_convert_region (start, end, coding_system, destination, 0, 0);
9375 }
9376
9377 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9378        3, 4, "r\nzCoding system: ",
9379        doc: /* Encode the current region by specified coding system.
9380 When called from a program, takes four arguments:
9381         START, END, CODING-SYSTEM and DESTINATION.
9382 START and END are buffer positions.
9383
9384 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9385 If nil, the region between START and END is replace by the encoded text.
9386 If buffer, the encoded text is inserted in that buffer after point (point
9387 does not move).
9388 In those cases, the length of the encoded text is returned.
9389 If DESTINATION is t, the encoded text is returned.
9390
9391 This function sets `last-coding-system-used' to the precise coding system
9392 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9393 not fully specified.)  */)
9394   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9395 {
9396   return code_convert_region (start, end, coding_system, destination, 1, 0);
9397 }
9398
9399 Lisp_Object
9400 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9401                      Lisp_Object dst_object, bool encodep, bool nocopy,
9402                      bool norecord)
9403 {
9404   struct coding_system coding;
9405   ptrdiff_t chars, bytes;
9406
9407   CHECK_STRING (string);
9408   if (NILP (coding_system))
9409     {
9410       if (! norecord)
9411         Vlast_coding_system_used = Qno_conversion;
9412       if (NILP (dst_object))
9413         return (nocopy ? Fcopy_sequence (string) : string);
9414     }
9415
9416   if (NILP (coding_system))
9417     coding_system = Qno_conversion;
9418   else
9419     CHECK_CODING_SYSTEM (coding_system);
9420   if (NILP (dst_object))
9421     dst_object = Qt;
9422   else if (! EQ (dst_object, Qt))
9423     CHECK_BUFFER (dst_object);
9424
9425   setup_coding_system (coding_system, &coding);
9426   coding.mode |= CODING_MODE_LAST_BLOCK;
9427   chars = SCHARS (string);
9428   bytes = SBYTES (string);
9429
9430   if (BUFFERP (dst_object))
9431     {
9432       struct buffer *buf = XBUFFER (dst_object);
9433       ptrdiff_t buf_pt = BUF_PT (buf);
9434
9435       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9436     }
9437
9438   if (encodep)
9439     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9440   else
9441     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9442   if (! norecord)
9443     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9444
9445   return (BUFFERP (dst_object)
9446           ? make_number (coding.produced_char)
9447           : coding.dst_object);
9448 }
9449
9450
9451 /* Encode or decode STRING according to CODING_SYSTEM.
9452    Do not set Vlast_coding_system_used.
9453
9454    This function is called only from macros DECODE_FILE and
9455    ENCODE_FILE, thus we ignore character composition.  */
9456
9457 Lisp_Object
9458 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9459                               bool encodep)
9460 {
9461   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9462 }
9463
9464 /* Encode or decode a file name, to or from a unibyte string suitable
9465    for passing to C library functions.  */
9466 Lisp_Object
9467 decode_file_name (Lisp_Object fname)
9468 {
9469 #ifdef WINDOWSNT
9470   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9471      converts the file names either to UTF-16LE or to the system ANSI
9472      codepage internally, depending on the underlying OS; see w32.c.  */
9473   if (! NILP (Fcoding_system_p (Qutf_8)))
9474     return code_convert_string_norecord (fname, Qutf_8, 0);
9475   return fname;
9476 #else  /* !WINDOWSNT */
9477   if (! NILP (Vfile_name_coding_system))
9478     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9479   else if (! NILP (Vdefault_file_name_coding_system))
9480     return code_convert_string_norecord (fname,
9481                                          Vdefault_file_name_coding_system, 0);
9482   else
9483     return fname;
9484 #endif
9485 }
9486
9487 Lisp_Object
9488 encode_file_name (Lisp_Object fname)
9489 {
9490   /* This is especially important during bootstrap and dumping, when
9491      file-name encoding is not yet known, and therefore any non-ASCII
9492      file names are unibyte strings, and could only be thrashed if we
9493      try to encode them.  */
9494   if (!STRING_MULTIBYTE (fname))
9495     return fname;
9496 #ifdef WINDOWSNT
9497   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9498      converts the file names either to UTF-16LE or to the system ANSI
9499      codepage internally, depending on the underlying OS; see w32.c.  */
9500   if (! NILP (Fcoding_system_p (Qutf_8)))
9501     return code_convert_string_norecord (fname, Qutf_8, 1);
9502   return fname;
9503 #else  /* !WINDOWSNT */
9504   if (! NILP (Vfile_name_coding_system))
9505     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9506   else if (! NILP (Vdefault_file_name_coding_system))
9507     return code_convert_string_norecord (fname,
9508                                          Vdefault_file_name_coding_system, 1);
9509   else
9510     return fname;
9511 #endif
9512 }
9513
9514 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9515        2, 4, 0,
9516        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9517
9518 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9519 if the decoding operation is trivial.
9520
9521 Optional fourth arg BUFFER non-nil means that the decoded text is
9522 inserted in that buffer after point (point does not move).  In this
9523 case, the return value is the length of the decoded text.
9524
9525 This function sets `last-coding-system-used' to the precise coding system
9526 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9527 not fully specified.)  */)
9528   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9529 {
9530   return code_convert_string (string, coding_system, buffer,
9531                               0, ! NILP (nocopy), 0);
9532 }
9533
9534 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9535        2, 4, 0,
9536        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9537
9538 Optional third arg NOCOPY non-nil means it is OK to return STRING
9539 itself if the encoding operation is trivial.
9540
9541 Optional fourth arg BUFFER non-nil means that the encoded text is
9542 inserted in that buffer after point (point does not move).  In this
9543 case, the return value is the length of the encoded text.
9544
9545 This function sets `last-coding-system-used' to the precise coding system
9546 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9547 not fully specified.)  */)
9548   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9549 {
9550   return code_convert_string (string, coding_system, buffer,
9551                               1, ! NILP (nocopy), 0);
9552 }
9553
9554 \f
9555 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9556        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9557 Return the corresponding character.  */)
9558   (Lisp_Object code)
9559 {
9560   Lisp_Object spec, attrs, val;
9561   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9562   EMACS_INT ch;
9563   int c;
9564
9565   CHECK_NATNUM (code);
9566   ch = XFASTINT (code);
9567   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9568   attrs = AREF (spec, 0);
9569
9570   if (ASCII_CHAR_P (ch)
9571       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9572     return code;
9573
9574   val = CODING_ATTR_CHARSET_LIST (attrs);
9575   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9576   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9577   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9578
9579   if (ch <= 0x7F)
9580     {
9581       c = ch;
9582       charset = charset_roman;
9583     }
9584   else if (ch >= 0xA0 && ch < 0xDF)
9585     {
9586       c = ch - 0x80;
9587       charset = charset_kana;
9588     }
9589   else
9590     {
9591       EMACS_INT c1 = ch >> 8;
9592       int c2 = ch & 0xFF;
9593
9594       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9595           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9596         error ("Invalid code: %"pI"d", ch);
9597       c = ch;
9598       SJIS_TO_JIS (c);
9599       charset = charset_kanji;
9600     }
9601   c = DECODE_CHAR (charset, c);
9602   if (c < 0)
9603     error ("Invalid code: %"pI"d", ch);
9604   return make_number (c);
9605 }
9606
9607
9608 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9609        doc: /* Encode a Japanese character CH to shift_jis encoding.
9610 Return the corresponding code in SJIS.  */)
9611   (Lisp_Object ch)
9612 {
9613   Lisp_Object spec, attrs, charset_list;
9614   int c;
9615   struct charset *charset;
9616   unsigned code;
9617
9618   CHECK_CHARACTER (ch);
9619   c = XFASTINT (ch);
9620   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9621   attrs = AREF (spec, 0);
9622
9623   if (ASCII_CHAR_P (c)
9624       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9625     return ch;
9626
9627   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9628   charset = char_charset (c, charset_list, &code);
9629   if (code == CHARSET_INVALID_CODE (charset))
9630     error ("Can't encode by shift_jis encoding: %c", c);
9631   JIS_TO_SJIS (code);
9632
9633   return make_number (code);
9634 }
9635
9636 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9637        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9638 Return the corresponding character.  */)
9639   (Lisp_Object code)
9640 {
9641   Lisp_Object spec, attrs, val;
9642   struct charset *charset_roman, *charset_big5, *charset;
9643   EMACS_INT ch;
9644   int c;
9645
9646   CHECK_NATNUM (code);
9647   ch = XFASTINT (code);
9648   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9649   attrs = AREF (spec, 0);
9650
9651   if (ASCII_CHAR_P (ch)
9652       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9653     return code;
9654
9655   val = CODING_ATTR_CHARSET_LIST (attrs);
9656   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9657   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9658
9659   if (ch <= 0x7F)
9660     {
9661       c = ch;
9662       charset = charset_roman;
9663     }
9664   else
9665     {
9666       EMACS_INT b1 = ch >> 8;
9667       int b2 = ch & 0x7F;
9668       if (b1 < 0xA1 || b1 > 0xFE
9669           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9670         error ("Invalid code: %"pI"d", ch);
9671       c = ch;
9672       charset = charset_big5;
9673     }
9674   c = DECODE_CHAR (charset, c);
9675   if (c < 0)
9676     error ("Invalid code: %"pI"d", ch);
9677   return make_number (c);
9678 }
9679
9680 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9681        doc: /* Encode the Big5 character CH to BIG5 coding system.
9682 Return the corresponding character code in Big5.  */)
9683   (Lisp_Object ch)
9684 {
9685   Lisp_Object spec, attrs, charset_list;
9686   struct charset *charset;
9687   int c;
9688   unsigned code;
9689
9690   CHECK_CHARACTER (ch);
9691   c = XFASTINT (ch);
9692   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9693   attrs = AREF (spec, 0);
9694   if (ASCII_CHAR_P (c)
9695       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9696     return ch;
9697
9698   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9699   charset = char_charset (c, charset_list, &code);
9700   if (code == CHARSET_INVALID_CODE (charset))
9701     error ("Can't encode by Big5 encoding: %c", c);
9702
9703   return make_number (code);
9704 }
9705
9706 \f
9707 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9708        Sset_terminal_coding_system_internal, 1, 2, 0,
9709        doc: /* Internal use only.  */)
9710   (Lisp_Object coding_system, Lisp_Object terminal)
9711 {
9712   struct terminal *term = decode_live_terminal (terminal);
9713   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9714   CHECK_SYMBOL (coding_system);
9715   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9716   /* We had better not send unsafe characters to terminal.  */
9717   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9718   /* Character composition should be disabled.  */
9719   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9720   terminal_coding->src_multibyte = 1;
9721   terminal_coding->dst_multibyte = 0;
9722   tset_charset_list
9723     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9724             ? coding_charset_list (terminal_coding)
9725             : list1 (make_number (charset_ascii))));
9726   return Qnil;
9727 }
9728
9729 DEFUN ("set-safe-terminal-coding-system-internal",
9730        Fset_safe_terminal_coding_system_internal,
9731        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9732        doc: /* Internal use only.  */)
9733   (Lisp_Object coding_system)
9734 {
9735   CHECK_SYMBOL (coding_system);
9736   setup_coding_system (Fcheck_coding_system (coding_system),
9737                        &safe_terminal_coding);
9738   /* Character composition should be disabled.  */
9739   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9740   safe_terminal_coding.src_multibyte = 1;
9741   safe_terminal_coding.dst_multibyte = 0;
9742   return Qnil;
9743 }
9744
9745 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9746        Sterminal_coding_system, 0, 1, 0,
9747        doc: /* Return coding system specified for terminal output on the given terminal.
9748 TERMINAL may be a terminal object, a frame, or nil for the selected
9749 frame's terminal device.  */)
9750   (Lisp_Object terminal)
9751 {
9752   struct coding_system *terminal_coding
9753     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9754   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9755
9756   /* For backward compatibility, return nil if it is `undecided'.  */
9757   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9758 }
9759
9760 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9761        Sset_keyboard_coding_system_internal, 1, 2, 0,
9762        doc: /* Internal use only.  */)
9763   (Lisp_Object coding_system, Lisp_Object terminal)
9764 {
9765   struct terminal *t = decode_live_terminal (terminal);
9766   CHECK_SYMBOL (coding_system);
9767   if (NILP (coding_system))
9768     coding_system = Qno_conversion;
9769   else
9770     Fcheck_coding_system (coding_system);
9771   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9772   /* Character composition should be disabled.  */
9773   TERMINAL_KEYBOARD_CODING (t)->common_flags
9774     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9775   return Qnil;
9776 }
9777
9778 DEFUN ("keyboard-coding-system",
9779        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9780        doc: /* Return coding system specified for decoding keyboard input.  */)
9781   (Lisp_Object terminal)
9782 {
9783   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9784                          (decode_live_terminal (terminal))->id);
9785 }
9786
9787 \f
9788 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9789        Sfind_operation_coding_system,  1, MANY, 0,
9790        doc: /* Choose a coding system for an operation based on the target name.
9791 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9792 DECODING-SYSTEM is the coding system to use for decoding
9793 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9794 for encoding (in case OPERATION does encoding).
9795
9796 The first argument OPERATION specifies an I/O primitive:
9797   For file I/O, `insert-file-contents' or `write-region'.
9798   For process I/O, `call-process', `call-process-region', or `start-process'.
9799   For network I/O, `open-network-stream'.
9800
9801 The remaining arguments should be the same arguments that were passed
9802 to the primitive.  Depending on which primitive, one of those arguments
9803 is selected as the TARGET.  For example, if OPERATION does file I/O,
9804 whichever argument specifies the file name is TARGET.
9805
9806 TARGET has a meaning which depends on OPERATION:
9807   For file I/O, TARGET is a file name (except for the special case below).
9808   For process I/O, TARGET is a process name.
9809   For network I/O, TARGET is a service name or a port number.
9810
9811 This function looks up what is specified for TARGET in
9812 `file-coding-system-alist', `process-coding-system-alist',
9813 or `network-coding-system-alist' depending on OPERATION.
9814 They may specify a coding system, a cons of coding systems,
9815 or a function symbol to call.
9816 In the last case, we call the function with one argument,
9817 which is a list of all the arguments given to this function.
9818 If the function can't decide a coding system, it can return
9819 `undecided' so that the normal code-detection is performed.
9820
9821 If OPERATION is `insert-file-contents', the argument corresponding to
9822 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9823 file name to look up, and BUFFER is a buffer that contains the file's
9824 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9825 function to call for FILENAME, that function should examine the
9826 contents of BUFFER instead of reading the file.
9827
9828 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9829   (ptrdiff_t nargs, Lisp_Object *args)
9830 {
9831   Lisp_Object operation, target_idx, target, val;
9832   register Lisp_Object chain;
9833
9834   if (nargs < 2)
9835     error ("Too few arguments");
9836   operation = args[0];
9837   if (!SYMBOLP (operation)
9838       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9839     error ("Invalid first argument");
9840   if (nargs <= 1 + XFASTINT (target_idx))
9841     error ("Too few arguments for operation `%s'",
9842            SDATA (SYMBOL_NAME (operation)));
9843   target = args[XFASTINT (target_idx) + 1];
9844   if (!(STRINGP (target)
9845         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9846             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9847         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9848     error ("Invalid argument %"pI"d of operation `%s'",
9849            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9850   if (CONSP (target))
9851     target = XCAR (target);
9852
9853   chain = ((EQ (operation, Qinsert_file_contents)
9854             || EQ (operation, Qwrite_region))
9855            ? Vfile_coding_system_alist
9856            : (EQ (operation, Qopen_network_stream)
9857               ? Vnetwork_coding_system_alist
9858               : Vprocess_coding_system_alist));
9859   if (NILP (chain))
9860     return Qnil;
9861
9862   for (; CONSP (chain); chain = XCDR (chain))
9863     {
9864       Lisp_Object elt;
9865
9866       elt = XCAR (chain);
9867       if (CONSP (elt)
9868           && ((STRINGP (target)
9869                && STRINGP (XCAR (elt))
9870                && fast_string_match (XCAR (elt), target) >= 0)
9871               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9872         {
9873           val = XCDR (elt);
9874           /* Here, if VAL is both a valid coding system and a valid
9875              function symbol, we return VAL as a coding system.  */
9876           if (CONSP (val))
9877             return val;
9878           if (! SYMBOLP (val))
9879             return Qnil;
9880           if (! NILP (Fcoding_system_p (val)))
9881             return Fcons (val, val);
9882           if (! NILP (Ffboundp (val)))
9883             {
9884               /* We use call1 rather than safe_call1
9885                  so as to get bug reports about functions called here
9886                  which don't handle the current interface.  */
9887               val = call1 (val, Flist (nargs, args));
9888               if (CONSP (val))
9889                 return val;
9890               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9891                 return Fcons (val, val);
9892             }
9893           return Qnil;
9894         }
9895     }
9896   return Qnil;
9897 }
9898
9899 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9900        Sset_coding_system_priority, 0, MANY, 0,
9901        doc: /* Assign higher priority to the coding systems given as arguments.
9902 If multiple coding systems belong to the same category,
9903 all but the first one are ignored.
9904
9905 usage: (set-coding-system-priority &rest coding-systems)  */)
9906   (ptrdiff_t nargs, Lisp_Object *args)
9907 {
9908   ptrdiff_t i, j;
9909   bool changed[coding_category_max];
9910   enum coding_category priorities[coding_category_max];
9911
9912   memset (changed, 0, sizeof changed);
9913
9914   for (i = j = 0; i < nargs; i++)
9915     {
9916       enum coding_category category;
9917       Lisp_Object spec, attrs;
9918
9919       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9920       attrs = AREF (spec, 0);
9921       category = XINT (CODING_ATTR_CATEGORY (attrs));
9922       if (changed[category])
9923         /* Ignore this coding system because a coding system of the
9924            same category already had a higher priority.  */
9925         continue;
9926       changed[category] = 1;
9927       priorities[j++] = category;
9928       if (coding_categories[category].id >= 0
9929           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9930         setup_coding_system (args[i], &coding_categories[category]);
9931       Fset (AREF (Vcoding_category_table, category), args[i]);
9932     }
9933
9934   /* Now we have decided top J priorities.  Reflect the order of the
9935      original priorities to the remaining priorities.  */
9936
9937   for (i = j, j = 0; i < coding_category_max; i++, j++)
9938     {
9939       while (j < coding_category_max
9940              && changed[coding_priorities[j]])
9941         j++;
9942       if (j == coding_category_max)
9943         emacs_abort ();
9944       priorities[i] = coding_priorities[j];
9945     }
9946
9947   memcpy (coding_priorities, priorities, sizeof priorities);
9948
9949   /* Update `coding-category-list'.  */
9950   Vcoding_category_list = Qnil;
9951   for (i = coding_category_max; i-- > 0; )
9952     Vcoding_category_list
9953       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9954                Vcoding_category_list);
9955
9956   return Qnil;
9957 }
9958
9959 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9960        Scoding_system_priority_list, 0, 1, 0,
9961        doc: /* Return a list of coding systems ordered by their priorities.
9962 The list contains a subset of coding systems; i.e. coding systems
9963 assigned to each coding category (see `coding-category-list').
9964
9965 HIGHESTP non-nil means just return the highest priority one.  */)
9966   (Lisp_Object highestp)
9967 {
9968   int i;
9969   Lisp_Object val;
9970
9971   for (i = 0, val = Qnil; i < coding_category_max; i++)
9972     {
9973       enum coding_category category = coding_priorities[i];
9974       int id = coding_categories[category].id;
9975       Lisp_Object attrs;
9976
9977       if (id < 0)
9978         continue;
9979       attrs = CODING_ID_ATTRS (id);
9980       if (! NILP (highestp))
9981         return CODING_ATTR_BASE_NAME (attrs);
9982       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9983     }
9984   return Fnreverse (val);
9985 }
9986
9987 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9988
9989 static Lisp_Object
9990 make_subsidiaries (Lisp_Object base)
9991 {
9992   Lisp_Object subsidiaries;
9993   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9994   USE_SAFE_ALLOCA;
9995   char *buf = SAFE_ALLOCA (base_name_len + 6);
9996   int i;
9997
9998   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9999   subsidiaries = make_uninit_vector (3);
10000   for (i = 0; i < 3; i++)
10001     {
10002       strcpy (buf + base_name_len, suffixes[i]);
10003       ASET (subsidiaries, i, intern (buf));
10004     }
10005   SAFE_FREE ();
10006   return subsidiaries;
10007 }
10008
10009
10010 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10011        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10012        doc: /* For internal use only.
10013 usage: (define-coding-system-internal ...)  */)
10014   (ptrdiff_t nargs, Lisp_Object *args)
10015 {
10016   Lisp_Object name;
10017   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10018   Lisp_Object attrs;            /* Vector of attributes.  */
10019   Lisp_Object eol_type;
10020   Lisp_Object aliases;
10021   Lisp_Object coding_type, charset_list, safe_charsets;
10022   enum coding_category category;
10023   Lisp_Object tail, val;
10024   int max_charset_id = 0;
10025   int i;
10026
10027   if (nargs < coding_arg_max)
10028     goto short_args;
10029
10030   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10031
10032   name = args[coding_arg_name];
10033   CHECK_SYMBOL (name);
10034   ASET (attrs, coding_attr_base_name, name);
10035
10036   val = args[coding_arg_mnemonic];
10037   if (! STRINGP (val))
10038     CHECK_CHARACTER (val);
10039   ASET (attrs, coding_attr_mnemonic, val);
10040
10041   coding_type = args[coding_arg_coding_type];
10042   CHECK_SYMBOL (coding_type);
10043   ASET (attrs, coding_attr_type, coding_type);
10044
10045   charset_list = args[coding_arg_charset_list];
10046   if (SYMBOLP (charset_list))
10047     {
10048       if (EQ (charset_list, Qiso_2022))
10049         {
10050           if (! EQ (coding_type, Qiso_2022))
10051             error ("Invalid charset-list");
10052           charset_list = Viso_2022_charset_list;
10053         }
10054       else if (EQ (charset_list, Qemacs_mule))
10055         {
10056           if (! EQ (coding_type, Qemacs_mule))
10057             error ("Invalid charset-list");
10058           charset_list = Vemacs_mule_charset_list;
10059         }
10060       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10061         {
10062           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10063             error ("Invalid charset-list");
10064           if (max_charset_id < XFASTINT (XCAR (tail)))
10065             max_charset_id = XFASTINT (XCAR (tail));
10066         }
10067     }
10068   else
10069     {
10070       charset_list = Fcopy_sequence (charset_list);
10071       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10072         {
10073           struct charset *charset;
10074
10075           val = XCAR (tail);
10076           CHECK_CHARSET_GET_CHARSET (val, charset);
10077           if (EQ (coding_type, Qiso_2022)
10078               ? CHARSET_ISO_FINAL (charset) < 0
10079               : EQ (coding_type, Qemacs_mule)
10080               ? CHARSET_EMACS_MULE_ID (charset) < 0
10081               : 0)
10082             error ("Can't handle charset `%s'",
10083                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10084
10085           XSETCAR (tail, make_number (charset->id));
10086           if (max_charset_id < charset->id)
10087             max_charset_id = charset->id;
10088         }
10089     }
10090   ASET (attrs, coding_attr_charset_list, charset_list);
10091
10092   safe_charsets = make_uninit_string (max_charset_id + 1);
10093   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10094   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10095     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10096   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10097
10098   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10099
10100   val = args[coding_arg_decode_translation_table];
10101   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10102     CHECK_SYMBOL (val);
10103   ASET (attrs, coding_attr_decode_tbl, val);
10104
10105   val = args[coding_arg_encode_translation_table];
10106   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10107     CHECK_SYMBOL (val);
10108   ASET (attrs, coding_attr_encode_tbl, val);
10109
10110   val = args[coding_arg_post_read_conversion];
10111   CHECK_SYMBOL (val);
10112   ASET (attrs, coding_attr_post_read, val);
10113
10114   val = args[coding_arg_pre_write_conversion];
10115   CHECK_SYMBOL (val);
10116   ASET (attrs, coding_attr_pre_write, val);
10117
10118   val = args[coding_arg_default_char];
10119   if (NILP (val))
10120     ASET (attrs, coding_attr_default_char, make_number (' '));
10121   else
10122     {
10123       CHECK_CHARACTER (val);
10124       ASET (attrs, coding_attr_default_char, val);
10125     }
10126
10127   val = args[coding_arg_for_unibyte];
10128   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10129
10130   val = args[coding_arg_plist];
10131   CHECK_LIST (val);
10132   ASET (attrs, coding_attr_plist, val);
10133
10134   if (EQ (coding_type, Qcharset))
10135     {
10136       /* Generate a lisp vector of 256 elements.  Each element is nil,
10137          integer, or a list of charset IDs.
10138
10139          If Nth element is nil, the byte code N is invalid in this
10140          coding system.
10141
10142          If Nth element is a number NUM, N is the first byte of a
10143          charset whose ID is NUM.
10144
10145          If Nth element is a list of charset IDs, N is the first byte
10146          of one of them.  The list is sorted by dimensions of the
10147          charsets.  A charset of smaller dimension comes first. */
10148       val = Fmake_vector (make_number (256), Qnil);
10149
10150       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10151         {
10152           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10153           int dim = CHARSET_DIMENSION (charset);
10154           int idx = (dim - 1) * 4;
10155
10156           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10157             ASET (attrs, coding_attr_ascii_compat, Qt);
10158
10159           for (i = charset->code_space[idx];
10160                i <= charset->code_space[idx + 1]; i++)
10161             {
10162               Lisp_Object tmp, tmp2;
10163               int dim2;
10164
10165               tmp = AREF (val, i);
10166               if (NILP (tmp))
10167                 tmp = XCAR (tail);
10168               else if (NUMBERP (tmp))
10169                 {
10170                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10171                   if (dim < dim2)
10172                     tmp = list2 (XCAR (tail), tmp);
10173                   else
10174                     tmp = list2 (tmp, XCAR (tail));
10175                 }
10176               else
10177                 {
10178                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10179                     {
10180                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10181                       if (dim < dim2)
10182                         break;
10183                     }
10184                   if (NILP (tmp2))
10185                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10186                   else
10187                     {
10188                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10189                       XSETCAR (tmp2, XCAR (tail));
10190                     }
10191                 }
10192               ASET (val, i, tmp);
10193             }
10194         }
10195       ASET (attrs, coding_attr_charset_valids, val);
10196       category = coding_category_charset;
10197     }
10198   else if (EQ (coding_type, Qccl))
10199     {
10200       Lisp_Object valids;
10201
10202       if (nargs < coding_arg_ccl_max)
10203         goto short_args;
10204
10205       val = args[coding_arg_ccl_decoder];
10206       CHECK_CCL_PROGRAM (val);
10207       if (VECTORP (val))
10208         val = Fcopy_sequence (val);
10209       ASET (attrs, coding_attr_ccl_decoder, val);
10210
10211       val = args[coding_arg_ccl_encoder];
10212       CHECK_CCL_PROGRAM (val);
10213       if (VECTORP (val))
10214         val = Fcopy_sequence (val);
10215       ASET (attrs, coding_attr_ccl_encoder, val);
10216
10217       val = args[coding_arg_ccl_valids];
10218       valids = Fmake_string (make_number (256), make_number (0));
10219       for (tail = val; CONSP (tail); tail = XCDR (tail))
10220         {
10221           int from, to;
10222
10223           val = XCAR (tail);
10224           if (INTEGERP (val))
10225             {
10226               if (! (0 <= XINT (val) && XINT (val) <= 255))
10227                 args_out_of_range_3 (val, make_number (0), make_number (255));
10228               from = to = XINT (val);
10229             }
10230           else
10231             {
10232               CHECK_CONS (val);
10233               CHECK_NATNUM_CAR (val);
10234               CHECK_NUMBER_CDR (val);
10235               if (XINT (XCAR (val)) > 255)
10236                 args_out_of_range_3 (XCAR (val),
10237                                      make_number (0), make_number (255));
10238               from = XINT (XCAR (val));
10239               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10240                 args_out_of_range_3 (XCDR (val),
10241                                      XCAR (val), make_number (255));
10242               to = XINT (XCDR (val));
10243             }
10244           for (i = from; i <= to; i++)
10245             SSET (valids, i, 1);
10246         }
10247       ASET (attrs, coding_attr_ccl_valids, valids);
10248
10249       category = coding_category_ccl;
10250     }
10251   else if (EQ (coding_type, Qutf_16))
10252     {
10253       Lisp_Object bom, endian;
10254
10255       ASET (attrs, coding_attr_ascii_compat, Qnil);
10256
10257       if (nargs < coding_arg_utf16_max)
10258         goto short_args;
10259
10260       bom = args[coding_arg_utf16_bom];
10261       if (! NILP (bom) && ! EQ (bom, Qt))
10262         {
10263           CHECK_CONS (bom);
10264           val = XCAR (bom);
10265           CHECK_CODING_SYSTEM (val);
10266           val = XCDR (bom);
10267           CHECK_CODING_SYSTEM (val);
10268         }
10269       ASET (attrs, coding_attr_utf_bom, bom);
10270
10271       endian = args[coding_arg_utf16_endian];
10272       CHECK_SYMBOL (endian);
10273       if (NILP (endian))
10274         endian = Qbig;
10275       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10276         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10277       ASET (attrs, coding_attr_utf_16_endian, endian);
10278
10279       category = (CONSP (bom)
10280                   ? coding_category_utf_16_auto
10281                   : NILP (bom)
10282                   ? (EQ (endian, Qbig)
10283                      ? coding_category_utf_16_be_nosig
10284                      : coding_category_utf_16_le_nosig)
10285                   : (EQ (endian, Qbig)
10286                      ? coding_category_utf_16_be
10287                      : coding_category_utf_16_le));
10288     }
10289   else if (EQ (coding_type, Qiso_2022))
10290     {
10291       Lisp_Object initial, reg_usage, request, flags;
10292
10293       if (nargs < coding_arg_iso2022_max)
10294         goto short_args;
10295
10296       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10297       CHECK_VECTOR (initial);
10298       for (i = 0; i < 4; i++)
10299         {
10300           val = AREF (initial, i);
10301           if (! NILP (val))
10302             {
10303               struct charset *charset;
10304
10305               CHECK_CHARSET_GET_CHARSET (val, charset);
10306               ASET (initial, i, make_number (CHARSET_ID (charset)));
10307               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10308                 ASET (attrs, coding_attr_ascii_compat, Qt);
10309             }
10310           else
10311             ASET (initial, i, make_number (-1));
10312         }
10313
10314       reg_usage = args[coding_arg_iso2022_reg_usage];
10315       CHECK_CONS (reg_usage);
10316       CHECK_NUMBER_CAR (reg_usage);
10317       CHECK_NUMBER_CDR (reg_usage);
10318
10319       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10320       for (tail = request; CONSP (tail); tail = XCDR (tail))
10321         {
10322           int id;
10323           Lisp_Object tmp1;
10324
10325           val = XCAR (tail);
10326           CHECK_CONS (val);
10327           tmp1 = XCAR (val);
10328           CHECK_CHARSET_GET_ID (tmp1, id);
10329           CHECK_NATNUM_CDR (val);
10330           if (XINT (XCDR (val)) >= 4)
10331             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10332           XSETCAR (val, make_number (id));
10333         }
10334
10335       flags = args[coding_arg_iso2022_flags];
10336       CHECK_NATNUM (flags);
10337       i = XINT (flags) & INT_MAX;
10338       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10339         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10340       flags = make_number (i);
10341
10342       ASET (attrs, coding_attr_iso_initial, initial);
10343       ASET (attrs, coding_attr_iso_usage, reg_usage);
10344       ASET (attrs, coding_attr_iso_request, request);
10345       ASET (attrs, coding_attr_iso_flags, flags);
10346       setup_iso_safe_charsets (attrs);
10347
10348       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10349         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10350                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10351                     ? coding_category_iso_7_else
10352                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10353                     ? coding_category_iso_7
10354                     : coding_category_iso_7_tight);
10355       else
10356         {
10357           int id = XINT (AREF (initial, 1));
10358
10359           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10360                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10361                        || id < 0)
10362                       ? coding_category_iso_8_else
10363                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10364                       ? coding_category_iso_8_1
10365                       : coding_category_iso_8_2);
10366         }
10367       if (category != coding_category_iso_8_1
10368           && category != coding_category_iso_8_2)
10369         ASET (attrs, coding_attr_ascii_compat, Qnil);
10370     }
10371   else if (EQ (coding_type, Qemacs_mule))
10372     {
10373       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10374         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10375       ASET (attrs, coding_attr_ascii_compat, Qt);
10376       category = coding_category_emacs_mule;
10377     }
10378   else if (EQ (coding_type, Qshift_jis))
10379     {
10380
10381       struct charset *charset;
10382
10383       if (XINT (Flength (charset_list)) != 3
10384           && XINT (Flength (charset_list)) != 4)
10385         error ("There should be three or four charsets");
10386
10387       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10388       if (CHARSET_DIMENSION (charset) != 1)
10389         error ("Dimension of charset %s is not one",
10390                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10391       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10392         ASET (attrs, coding_attr_ascii_compat, Qt);
10393
10394       charset_list = XCDR (charset_list);
10395       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10396       if (CHARSET_DIMENSION (charset) != 1)
10397         error ("Dimension of charset %s is not one",
10398                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10399
10400       charset_list = XCDR (charset_list);
10401       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10402       if (CHARSET_DIMENSION (charset) != 2)
10403         error ("Dimension of charset %s is not two",
10404                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10405
10406       charset_list = XCDR (charset_list);
10407       if (! NILP (charset_list))
10408         {
10409           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10410           if (CHARSET_DIMENSION (charset) != 2)
10411             error ("Dimension of charset %s is not two",
10412                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10413         }
10414
10415       category = coding_category_sjis;
10416       Vsjis_coding_system = name;
10417     }
10418   else if (EQ (coding_type, Qbig5))
10419     {
10420       struct charset *charset;
10421
10422       if (XINT (Flength (charset_list)) != 2)
10423         error ("There should be just two charsets");
10424
10425       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10426       if (CHARSET_DIMENSION (charset) != 1)
10427         error ("Dimension of charset %s is not one",
10428                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10429       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10430         ASET (attrs, coding_attr_ascii_compat, Qt);
10431
10432       charset_list = XCDR (charset_list);
10433       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10434       if (CHARSET_DIMENSION (charset) != 2)
10435         error ("Dimension of charset %s is not two",
10436                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10437
10438       category = coding_category_big5;
10439       Vbig5_coding_system = name;
10440     }
10441   else if (EQ (coding_type, Qraw_text))
10442     {
10443       category = coding_category_raw_text;
10444       ASET (attrs, coding_attr_ascii_compat, Qt);
10445     }
10446   else if (EQ (coding_type, Qutf_8))
10447     {
10448       Lisp_Object bom;
10449
10450       if (nargs < coding_arg_utf8_max)
10451         goto short_args;
10452
10453       bom = args[coding_arg_utf8_bom];
10454       if (! NILP (bom) && ! EQ (bom, Qt))
10455         {
10456           CHECK_CONS (bom);
10457           val = XCAR (bom);
10458           CHECK_CODING_SYSTEM (val);
10459           val = XCDR (bom);
10460           CHECK_CODING_SYSTEM (val);
10461         }
10462       ASET (attrs, coding_attr_utf_bom, bom);
10463       if (NILP (bom))
10464         ASET (attrs, coding_attr_ascii_compat, Qt);
10465
10466       category = (CONSP (bom) ? coding_category_utf_8_auto
10467                   : NILP (bom) ? coding_category_utf_8_nosig
10468                   : coding_category_utf_8_sig);
10469     }
10470   else if (EQ (coding_type, Qundecided))
10471     {
10472       if (nargs < coding_arg_undecided_max)
10473         goto short_args;
10474       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10475             args[coding_arg_undecided_inhibit_null_byte_detection]);
10476       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10477             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10478       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10479             args[coding_arg_undecided_prefer_utf_8]);
10480       category = coding_category_undecided;
10481     }
10482   else
10483     error ("Invalid coding system type: %s",
10484            SDATA (SYMBOL_NAME (coding_type)));
10485
10486   ASET (attrs, coding_attr_category, make_number (category));
10487   ASET (attrs, coding_attr_plist,
10488         Fcons (QCcategory,
10489                Fcons (AREF (Vcoding_category_table, category),
10490                       CODING_ATTR_PLIST (attrs))));
10491   ASET (attrs, coding_attr_plist,
10492         Fcons (QCascii_compatible_p,
10493                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10494                       CODING_ATTR_PLIST (attrs))));
10495
10496   eol_type = args[coding_arg_eol_type];
10497   if (! NILP (eol_type)
10498       && ! EQ (eol_type, Qunix)
10499       && ! EQ (eol_type, Qdos)
10500       && ! EQ (eol_type, Qmac))
10501     error ("Invalid eol-type");
10502
10503   aliases = list1 (name);
10504
10505   if (NILP (eol_type))
10506     {
10507       eol_type = make_subsidiaries (name);
10508       for (i = 0; i < 3; i++)
10509         {
10510           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10511
10512           this_name = AREF (eol_type, i);
10513           this_aliases = list1 (this_name);
10514           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10515           this_spec = make_uninit_vector (3);
10516           ASET (this_spec, 0, attrs);
10517           ASET (this_spec, 1, this_aliases);
10518           ASET (this_spec, 2, this_eol_type);
10519           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10520           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10521           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10522           if (NILP (val))
10523             Vcoding_system_alist
10524               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10525                        Vcoding_system_alist);
10526         }
10527     }
10528
10529   spec_vec = make_uninit_vector (3);
10530   ASET (spec_vec, 0, attrs);
10531   ASET (spec_vec, 1, aliases);
10532   ASET (spec_vec, 2, eol_type);
10533
10534   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10535   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10536   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10537   if (NILP (val))
10538     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10539                                   Vcoding_system_alist);
10540
10541   {
10542     int id = coding_categories[category].id;
10543
10544     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10545       setup_coding_system (name, &coding_categories[category]);
10546   }
10547
10548   return Qnil;
10549
10550  short_args:
10551   return Fsignal (Qwrong_number_of_arguments,
10552                   Fcons (intern ("define-coding-system-internal"),
10553                          make_number (nargs)));
10554 }
10555
10556
10557 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10558        3, 3, 0,
10559        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10560   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10561 {
10562   Lisp_Object spec, attrs;
10563   /* Slot 0 of SPEC is the attribute vector that the branches update.  */
10564   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10565   attrs = AREF (spec, 0);
10566   if (EQ (prop, QCmnemonic))
10567     {
10568       if (! STRINGP (val))      /* Must be a string or a character.  */
10569         CHECK_CHARACTER (val);
10570       ASET (attrs, coding_attr_mnemonic, val);
10571     }
10572   else if (EQ (prop, QCdefault_char))
10573     {
10574       if (NILP (val))           /* nil is replaced by a space.  */
10575         val = make_number (' ');
10576       else
10577         CHECK_CHARACTER (val);
10578       ASET (attrs, coding_attr_default_char, val);
10579     }
10580   else if (EQ (prop, QCdecode_translation_table))
10581     {
10582       if (! CHAR_TABLE_P (val) && ! CONSP (val))  /* Else a symbol.  */
10583         CHECK_SYMBOL (val);
10584       ASET (attrs, coding_attr_decode_tbl, val);
10585     }
10586   else if (EQ (prop, QCencode_translation_table))
10587     {
10588       if (! CHAR_TABLE_P (val) && ! CONSP (val))  /* Else a symbol.  */
10589         CHECK_SYMBOL (val);
10590       ASET (attrs, coding_attr_encode_tbl, val);
10591     }
10592   else if (EQ (prop, QCpost_read_conversion))
10593     {
10594       CHECK_SYMBOL (val);
10595       ASET (attrs, coding_attr_post_read, val);
10596     }
10597   else if (EQ (prop, QCpre_write_conversion))
10598     {
10599       CHECK_SYMBOL (val);
10600       ASET (attrs, coding_attr_pre_write, val);
10601     }
10602   else if (EQ (prop, QCascii_compatible_p))
10603     {
10604       ASET (attrs, coding_attr_ascii_compat, val);  /* Stored unchecked.  */
10605     }
10606   /* Every PROP, recognized above or not, is also stored in the plist.  */
10607   ASET (attrs, coding_attr_plist,
10608         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10609   return val;
10610 }
10611
10612
10613 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10614        Sdefine_coding_system_alias, 2, 2, 0,
10615        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10616   (Lisp_Object alias, Lisp_Object coding_system)
10617 {
10618   Lisp_Object spec, aliases, eol_type, val;
10619   /* ALIAS gets no spec of its own; it is registered under SPEC below.  */
10620   CHECK_SYMBOL (alias);
10621   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10622   aliases = AREF (spec, 1);
10623   /* ALIASES should be a list of length more than zero, and the first
10624      element is a base coding system.  Append ALIAS at the tail of the
10625      list.  */
10626   while (!NILP (XCDR (aliases)))
10627     aliases = XCDR (aliases);
10628   XSETCDR (aliases, list1 (alias));   /* Modifies the stored list.  */
10629   /* With a vector eol-type, also alias ALIAS's three subsidiaries.  */
10630   eol_type = AREF (spec, 2);
10631   if (VECTORP (eol_type))
10632     {
10633       Lisp_Object subsidiaries;
10634       int i;
10635       /* Subsidiary I of ALIAS becomes an alias of element I.  */
10636       subsidiaries = make_subsidiaries (alias);
10637       for (i = 0; i < 3; i++)
10638         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10639                                      AREF (eol_type, i));
10640     }
10641   /* Register ALIAS in the hash table, the list and the name alist.  */
10642   Fputhash (alias, spec, Vcoding_system_hash_table);
10643   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10644   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10645   if (NILP (val))                     /* Name not in the alist yet.  */
10646     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10647                                   Vcoding_system_alist);
10648
10649   return Qnil;
10650 }
10651
10652 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10653        1, 1, 0,
10654        doc: /* Return the base of CODING-SYSTEM.
10655 Any alias or subsidiary coding system is not a base coding system.  */)
10656   (Lisp_Object coding_system)
10657 {
10658   Lisp_Object spec;
10659
10660   /* nil is answered with `no-conversion' directly, without a lookup.  */
10661   if (NILP (coding_system))
10662     return Qno_conversion;
10663   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10664   return CODING_ATTR_BASE_NAME (AREF (spec, 0));
10665 }
10666
10667 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10668        1, 1, 0,
10669        doc: /* Return the property list of CODING-SYSTEM.  */)
10670   (Lisp_Object coding_system)
10671 {
10672   Lisp_Object spec;
10673   /* A nil argument is looked up as `no-conversion'.  */
10674   Lisp_Object target = NILP (coding_system) ? Qno_conversion : coding_system;
10675
10676   CHECK_CODING_SYSTEM_GET_SPEC (target, spec);
10677   /* Element 0 of the spec vector is the attribute vector.  */
10678   return CODING_ATTR_PLIST (AREF (spec, 0));
10679 }
10680
10681
10682 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10683        1, 1, 0,
10684        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10685   (Lisp_Object coding_system)
10686 {
10687   Lisp_Object spec;
10688   Lisp_Object target = NILP (coding_system) ? Qno_conversion : coding_system;
10689
10690   /* TARGET is `no-conversion' for nil; slot 1 of its spec is the list.  */
10691   CHECK_CODING_SYSTEM_GET_SPEC (target, spec);
10692   return AREF (spec, 1);
10693 }
10694
10695 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10696        Scoding_system_eol_type, 1, 1, 0,
10697        doc: /* Return eol-type of CODING-SYSTEM.
10698 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10699
10700 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10701 and CR respectively.
10702
10703 A vector value indicates that a format of end-of-line should be
10704 detected automatically.  Nth element of the vector is the subsidiary
10705 coding system whose eol-type is N.  */)
10706   (Lisp_Object coding_system)
10707 {
10708   Lisp_Object target, eol_type;
10709
10710   /* A nil argument is looked up as `no-conversion'.  */
10711   target = NILP (coding_system) ? Qno_conversion : coding_system;
10712   if (! CODING_SYSTEM_P (target))
10713     return Qnil;
10714   eol_type = AREF (CODING_SYSTEM_SPEC (target), 2);
10715   /* A vector is returned as a copy, not as the stored object.  */
10716   if (VECTORP (eol_type))
10717     return Fcopy_sequence (eol_type);
10718   if (EQ (eol_type, Qunix))
10719     return make_number (0);
10720   return make_number (EQ (eol_type, Qdos) ? 1 : 2);
10721 }
10722
10723 #endif /* emacs */
10724
10725 \f
10726 /*** 12. Postamble ***/
10727
10728 void
10729 init_coding_once (void)
10730 {
10731   int i;
10732   /* Start every category with id -1 and the identity priority order.  */
10733   for (i = 0; i < coding_category_max; i++)
10734     {
10735       coding_categories[i].id = -1;
10736       coding_priorities[i] = i;
10737     }
10738
10739   /* ISO2022 specific initialization: give every byte value a class.  */
10740   for (i = 0; i < 0x20; i++)            /* 0x00..0x1F */
10741     iso_code_class[i] = ISO_control_0;
10742   for (i = 0x21; i < 0x7F; i++)         /* 0x21..0x7E */
10743     iso_code_class[i] = ISO_graphic_plane_0;
10744   for (i = 0x80; i < 0xA0; i++)         /* 0x80..0x9F */
10745     iso_code_class[i] = ISO_control_1;
10746   for (i = 0xA1; i < 0xFF; i++)         /* 0xA1..0xFE */
10747     iso_code_class[i] = ISO_graphic_plane_1;
10748   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10749   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10750   iso_code_class[ISO_CODE_SO] = ISO_shift_out;  /* Set last: these win.  */
10751   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10752   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10753   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10754   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10755   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10756   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10757   /* emacs_mule_bytes: 1 everywhere except the private leading codes.  */
10758   for (i = 0; i < 256; i++)
10759     {
10760       emacs_mule_bytes[i] = 1;
10761     }
10762   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10763   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10764   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10765   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10766 }
10767
10768 #ifdef emacs
10769
10770 void
10771 syms_of_coding (void)
10772 {
10773   staticpro (&Vcoding_system_hash_table);
10774   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10775
10776   staticpro (&Vsjis_coding_system);
10777   Vsjis_coding_system = Qnil;
10778
10779   staticpro (&Vbig5_coding_system);
10780   Vbig5_coding_system = Qnil;
10781
10782   staticpro (&Vcode_conversion_reused_workbuf);
10783   Vcode_conversion_reused_workbuf = Qnil;
10784
10785   staticpro (&Vcode_conversion_workbuf_name);
10786   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10787
10788   reused_workbuf_in_use = 0;
10789
10790   DEFSYM (Qcharset, "charset");
10791   DEFSYM (Qtarget_idx, "target-idx");
10792   DEFSYM (Qcoding_system_history, "coding-system-history");
10793   Fset (Qcoding_system_history, Qnil);
10794
10795   /* Target FILENAME is the first argument.  */
10796   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10797   /* Target FILENAME is the third argument.  */
10798   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10799
10800   DEFSYM (Qcall_process, "call-process");
10801   /* Target PROGRAM is the first argument.  */
10802   Fput (Qcall_process, Qtarget_idx, make_number (0));
10803
10804   DEFSYM (Qcall_process_region, "call-process-region");
10805   /* Target PROGRAM is the third argument.  */
10806   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10807
10808   DEFSYM (Qstart_process, "start-process");
10809   /* Target PROGRAM is the third argument.  */
10810   Fput (Qstart_process, Qtarget_idx, make_number (2));
10811
10812   DEFSYM (Qopen_network_stream, "open-network-stream");
10813   /* Target SERVICE is the fourth argument.  */
10814   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10815
10816   DEFSYM (Qunix, "unix");
10817   DEFSYM (Qdos, "dos");
10818   DEFSYM (Qmac, "mac");
10819
10820   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10821   DEFSYM (Qundecided, "undecided");
10822   DEFSYM (Qno_conversion, "no-conversion");
10823   DEFSYM (Qraw_text, "raw-text");
10824
10825   DEFSYM (Qiso_2022, "iso-2022");
10826
10827   DEFSYM (Qutf_8, "utf-8");
10828   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10829
10830 #if defined (WINDOWSNT) || defined (CYGWIN)
10831   /* No, not utf-16-le: that one has a BOM.  */
10832   DEFSYM (Qutf_16le, "utf-16le");
10833 #endif
10834
10835   DEFSYM (Qutf_16, "utf-16");
10836   DEFSYM (Qbig, "big");
10837   DEFSYM (Qlittle, "little");
10838
10839   DEFSYM (Qshift_jis, "shift-jis");
10840   DEFSYM (Qbig5, "big5");
10841
10842   DEFSYM (Qcoding_system_p, "coding-system-p");
10843
10844   /* Error signaled when there's a problem with detecting a coding system.  */
10845   DEFSYM (Qcoding_system_error, "coding-system-error");
10846   Fput (Qcoding_system_error, Qerror_conditions,
10847         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10848   Fput (Qcoding_system_error, Qerror_message,
10849         build_pure_c_string ("Invalid coding system"));
10850
10851   DEFSYM (Qtranslation_table, "translation-table");
10852   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10853   DEFSYM (Qtranslation_table_id, "translation-table-id");
10854
10855   /* Coding system emacs-mule and raw-text are for converting only
10856      end-of-line format.  */
10857   DEFSYM (Qemacs_mule, "emacs-mule");
10858
10859   DEFSYM (QCcategory, ":category");
10860   DEFSYM (QCmnemonic, ":mnemonic");
10861   DEFSYM (QCdefault_char, ":default-char");
10862   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10863   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10864   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10865   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10866   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10867
10868   Vcoding_category_table
10869     = Fmake_vector (make_number (coding_category_max), Qnil);
10870   staticpro (&Vcoding_category_table);
10871   /* The following are targets of code detection.  */
10872   ASET (Vcoding_category_table, coding_category_iso_7,
10873         intern_c_string ("coding-category-iso-7"));
10874   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10875         intern_c_string ("coding-category-iso-7-tight"));
10876   ASET (Vcoding_category_table, coding_category_iso_8_1,
10877         intern_c_string ("coding-category-iso-8-1"));
10878   ASET (Vcoding_category_table, coding_category_iso_8_2,
10879         intern_c_string ("coding-category-iso-8-2"));
10880   ASET (Vcoding_category_table, coding_category_iso_7_else,
10881         intern_c_string ("coding-category-iso-7-else"));
10882   ASET (Vcoding_category_table, coding_category_iso_8_else,
10883         intern_c_string ("coding-category-iso-8-else"));
10884   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10885         intern_c_string ("coding-category-utf-8-auto"));
10886   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10887         intern_c_string ("coding-category-utf-8"));
10888   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10889         intern_c_string ("coding-category-utf-8-sig"));
10890   ASET (Vcoding_category_table, coding_category_utf_16_be,
10891         intern_c_string ("coding-category-utf-16-be"));
10892   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10893         intern_c_string ("coding-category-utf-16-auto"));
10894   ASET (Vcoding_category_table, coding_category_utf_16_le,
10895         intern_c_string ("coding-category-utf-16-le"));
10896   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10897         intern_c_string ("coding-category-utf-16-be-nosig"));
10898   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10899         intern_c_string ("coding-category-utf-16-le-nosig"));
10900   ASET (Vcoding_category_table, coding_category_charset,
10901         intern_c_string ("coding-category-charset"));
10902   ASET (Vcoding_category_table, coding_category_sjis,
10903         intern_c_string ("coding-category-sjis"));
10904   ASET (Vcoding_category_table, coding_category_big5,
10905         intern_c_string ("coding-category-big5"));
10906   ASET (Vcoding_category_table, coding_category_ccl,
10907         intern_c_string ("coding-category-ccl"));
10908   ASET (Vcoding_category_table, coding_category_emacs_mule,
10909         intern_c_string ("coding-category-emacs-mule"));
10910   /* The following are NOT targets of code detection.  */
10911   ASET (Vcoding_category_table, coding_category_raw_text,
10912         intern_c_string ("coding-category-raw-text"));
10913   ASET (Vcoding_category_table, coding_category_undecided,
10914         intern_c_string ("coding-category-undecided"));
10915
10916   DEFSYM (Qinsufficient_source, "insufficient-source");
10917   DEFSYM (Qinvalid_source, "invalid-source");
10918   DEFSYM (Qinterrupted, "interrupted");
10919
10920   /* If a symbol has this property, evaluate the value to define the
10921      symbol as a coding system.  */
10922   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10923
10924   defsubr (&Scoding_system_p);
10925   defsubr (&Sread_coding_system);
10926   defsubr (&Sread_non_nil_coding_system);
10927   defsubr (&Scheck_coding_system);
10928   defsubr (&Sdetect_coding_region);
10929   defsubr (&Sdetect_coding_string);
10930   defsubr (&Sfind_coding_systems_region_internal);
10931   defsubr (&Sunencodable_char_position);
10932   defsubr (&Scheck_coding_systems_region);
10933   defsubr (&Sdecode_coding_region);
10934   defsubr (&Sencode_coding_region);
10935   defsubr (&Sdecode_coding_string);
10936   defsubr (&Sencode_coding_string);
10937   defsubr (&Sdecode_sjis_char);
10938   defsubr (&Sencode_sjis_char);
10939   defsubr (&Sdecode_big5_char);
10940   defsubr (&Sencode_big5_char);
10941   defsubr (&Sset_terminal_coding_system_internal);
10942   defsubr (&Sset_safe_terminal_coding_system_internal);
10943   defsubr (&Sterminal_coding_system);
10944   defsubr (&Sset_keyboard_coding_system_internal);
10945   defsubr (&Skeyboard_coding_system);
10946   defsubr (&Sfind_operation_coding_system);
10947   defsubr (&Sset_coding_system_priority);
10948   defsubr (&Sdefine_coding_system_internal);
10949   defsubr (&Sdefine_coding_system_alias);
10950   defsubr (&Scoding_system_put);
10951   defsubr (&Scoding_system_base);
10952   defsubr (&Scoding_system_plist);
10953   defsubr (&Scoding_system_aliases);
10954   defsubr (&Scoding_system_eol_type);
10955   defsubr (&Scoding_system_priority_list);
10956
10957   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10958                doc: /* List of coding systems.
10959
10960 Do not alter the value of this variable manually.  This variable should be
10961 updated by the functions `define-coding-system' and
10962 `define-coding-system-alias'.  */);
10963   Vcoding_system_list = Qnil;
10964
10965   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10966                doc: /* Alist of coding system names.
10967 Each element is a one-element list holding a coding system name.
10968 This variable is given to `completing-read' as COLLECTION argument.
10969
10970 Do not alter the value of this variable manually.  This variable should be
10971 updated by the functions `define-coding-system' and
10972 `define-coding-system-alias'.  */);
10973   Vcoding_system_alist = Qnil;
10974
10975   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10976                doc: /* List of coding-categories (symbols) ordered by priority.
10977
10978 On detecting a coding system, Emacs tries code detection algorithms
10979 associated with each coding-category one by one in this order.  When
10980 one algorithm agrees with a byte sequence of source text, the coding
10981 system bound to the corresponding coding-category is selected.
10982
10983 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10984   {
10985     int i;
10986
10987     Vcoding_category_list = Qnil;
10988     for (i = coding_category_max - 1; i >= 0; i--)
10989       Vcoding_category_list
10990         = Fcons (AREF (Vcoding_category_table, i),
10991                  Vcoding_category_list);
10992   }
10993
10994   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10995                doc: /* Specify the coding system for read operations.
10996 It is useful to bind this variable with `let', but do not set it globally.
10997 If the value is a coding system, it is used for decoding on read operation.
10998 If not, an appropriate element is used from one of the coding system alists.
10999 There are three such tables: `file-coding-system-alist',
11000 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11001   Vcoding_system_for_read = Qnil;
11002
11003   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11004                doc: /* Specify the coding system for write operations.
11005 Programs bind this variable with `let', but you should not set it globally.
11006 If the value is a coding system, it is used for encoding of output,
11007 when writing it to a file and when sending it to a file or subprocess.
11008
11009 If this does not specify a coding system, an appropriate element
11010 is used from one of the coding system alists.
11011 There are three such tables: `file-coding-system-alist',
11012 `process-coding-system-alist', and `network-coding-system-alist'.
11013 For output to files, if the above procedure does not specify a coding system,
11014 the value of `buffer-file-coding-system' is used.  */);
11015   Vcoding_system_for_write = Qnil;
11016
11017   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11018                doc: /*
11019 Coding system used in the latest file or process I/O.  */);
11020   Vlast_coding_system_used = Qnil;
11021
11022   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11023                doc: /*
11024 Error status of the last code conversion.
11025
11026 When an error was detected in the last code conversion, this variable
11027 is set to one of the following symbols.
11028   `insufficient-source'
11029   `inconsistent-eol'
11030   `invalid-source'
11031   `interrupted'
11032   `insufficient-memory'
11033 When no error was detected, the value doesn't change.  So, to check
11034 the error status of a code conversion by this variable, you must
11035 explicitly set this variable to nil before performing code
11036 conversion.  */);
11037   Vlast_code_conversion_error = Qnil;
11038
11039   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11040                doc: /*
11041 Non-nil means always inhibit code conversion of end-of-line format.
11042 See info node `Coding Systems' and info node `Text and Binary' concerning
11043 such conversion.  */);
11044   inhibit_eol_conversion = 0;
11045
11046   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11047                doc: /*
11048 Non-nil means process buffer inherits coding system of process output.
11049 Bind it to t if the process output is to be treated as if it were a file
11050 read from some filesystem.  */);
11051   inherit_process_coding_system = 0;
11052
11053   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11054                doc: /*
11055 Alist to decide a coding system to use for a file I/O operation.
11056 The format is ((PATTERN . VAL) ...),
11057 where PATTERN is a regular expression matching a file name,
11058 VAL is a coding system, a cons of coding systems, or a function symbol.
11059 If VAL is a coding system, it is used for both decoding and encoding
11060 the file contents.
11061 If VAL is a cons of coding systems, the car part is used for decoding,
11062 and the cdr part is used for encoding.
11063 If VAL is a function symbol, the function must return a coding system
11064 or a cons of coding systems which are used as above.  The function is
11065 called with an argument that is a list of the arguments with which
11066 `find-operation-coding-system' was called.  If the function can't decide
11067 a coding system, it can return `undecided' so that the normal
11068 code-detection is performed.
11069
11070 See also the function `find-operation-coding-system'
11071 and the variable `auto-coding-alist'.  */);
11072   Vfile_coding_system_alist = Qnil;
11073
11074   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11075                doc: /*
11076 Alist to decide a coding system to use for a process I/O operation.
11077 The format is ((PATTERN . VAL) ...),
11078 where PATTERN is a regular expression matching a program name,
11079 VAL is a coding system, a cons of coding systems, or a function symbol.
11080 If VAL is a coding system, it is used for both decoding what received
11081 from the program and encoding what sent to the program.
11082 If VAL is a cons of coding systems, the car part is used for decoding,
11083 and the cdr part is used for encoding.
11084 If VAL is a function symbol, the function must return a coding system
11085 or a cons of coding systems which are used as above.
11086
11087 See also the function `find-operation-coding-system'.  */);
11088   Vprocess_coding_system_alist = Qnil;
11089
11090   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11091                doc: /*
11092 Alist to decide a coding system to use for a network I/O operation.
11093 The format is ((PATTERN . VAL) ...),
11094 where PATTERN is a regular expression matching a network service name
11095 or is a port number to connect to,
11096 VAL is a coding system, a cons of coding systems, or a function symbol.
11097 If VAL is a coding system, it is used for both decoding what received
11098 from the network stream and encoding what sent to the network stream.
11099 If VAL is a cons of coding systems, the car part is used for decoding,
11100 and the cdr part is used for encoding.
11101 If VAL is a function symbol, the function must return a coding system
11102 or a cons of coding systems which are used as above.
11103
11104 See also the function `find-operation-coding-system'.  */);
11105   Vnetwork_coding_system_alist = Qnil;
11106
11107   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11108                doc: /* Coding system to use with system messages.
11109 Also used for decoding keyboard input on X Window system, and for
11110 encoding standard output and error streams.  */);
11111   Vlocale_coding_system = Qnil;
11112
11113   /* The eol mnemonics are reset in startup.el system-dependently.  */
11114   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11115                doc: /*
11116 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11117   eol_mnemonic_unix = build_pure_c_string (":");
11118
11119   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11120                doc: /*
11121 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11122   eol_mnemonic_dos = build_pure_c_string ("\\");
11123
11124   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11125                doc: /*
11126 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11127   eol_mnemonic_mac = build_pure_c_string ("/");
11128
11129   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11130                doc: /*
11131 String displayed in mode line when end-of-line format is not yet determined.  */);
11132   eol_mnemonic_undecided = build_pure_c_string (":");
11133
11134   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11135                doc: /*
11136 Non-nil enables character translation while encoding and decoding.  */);
11137   Venable_character_translation = Qt;
11138
11139   DEFVAR_LISP ("standard-translation-table-for-decode",
11140                Vstandard_translation_table_for_decode,
11141                doc: /* Table for translating characters while decoding.  */);
11142   Vstandard_translation_table_for_decode = Qnil;
11143
11144   DEFVAR_LISP ("standard-translation-table-for-encode",
11145                Vstandard_translation_table_for_encode,
11146                doc: /* Table for translating characters while encoding.  */);
11147   Vstandard_translation_table_for_encode = Qnil;
11148
11149   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11150                doc: /* Alist of charsets vs revision numbers.
11151 While encoding, if a charset (car part of an element) is found,
11152 designate it with the escape sequence identifying revision (cdr part
11153 of the element).  */);
11154   Vcharset_revision_table = Qnil;
11155
11156   DEFVAR_LISP ("default-process-coding-system",
11157                Vdefault_process_coding_system,
11158                doc: /* Cons of coding systems used for process I/O by default.
11159 The car part is used for decoding a process output,
11160 the cdr part is used for encoding a text to be sent to a process.  */);
11161   Vdefault_process_coding_system = Qnil;
11162
11163   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11164                doc: /*
11165 Table of extra Latin codes in the range 128..159 (inclusive).
11166 This is a vector of length 256.
11167 If Nth element is non-nil, the existence of code N in a file
11168 \(or output of subprocess) doesn't prevent it to be detected as
11169 a coding system of ISO 2022 variant which has a flag
11170 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11171 or reading output of a subprocess.
11172 Only 128th through 159th elements have a meaning.  */);
11173   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11174
11175   DEFVAR_LISP ("select-safe-coding-system-function",
11176                Vselect_safe_coding_system_function,
11177                doc: /*
11178 Function to call to select safe coding system for encoding a text.
11179
11180 If set, this function is called to force a user to select a proper
11181 coding system which can encode the text in the case that a default
11182 coding system used in each operation can't encode the text.  The
11183 function should take care that the buffer is not modified while
11184 the coding system is being selected.
11185
11186 The default value is `select-safe-coding-system' (which see).  */);
11187   Vselect_safe_coding_system_function = Qnil;
11188
11189   DEFVAR_BOOL ("coding-system-require-warning",
11190                coding_system_require_warning,
11191                doc: /* Internal use only.
11192 If non-nil, on writing a file, `select-safe-coding-system-function' is
11193 called even if `coding-system-for-write' is non-nil.  The command
11194 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11195   coding_system_require_warning = 0;
11196
11197
11198   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11199                inhibit_iso_escape_detection,
11200                doc: /*
11201 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11202
11203 When Emacs reads text, it tries to detect how the text is encoded.
11204 This code detection is sensitive to escape sequences.  If Emacs sees
11205 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11206 of the ISO2022 encodings, and decodes text by the corresponding coding
11207 system (e.g. `iso-2022-7bit').
11208
11209 However, there may be a case that you want to read escape sequences in
11210 a file as is.  In such a case, you can set this variable to non-nil.
11211 Then the code detection will ignore any escape sequences, and no text is
11212 detected as encoded in some ISO-2022 encoding.  The result is that all
11213 escape sequences become visible in a buffer.
11214
11215 The default value is nil, and it is strongly recommended not to change
11216 it.  That is because many Emacs Lisp source files that contain
11217 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11218 in Emacs's distribution, and they won't be decoded correctly on
11219 reading if you suppress escape sequence detection.
11220
11221 The other way to read escape sequences in a file without decoding is
11222 to explicitly specify some coding system that doesn't use ISO-2022
11223 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11224   inhibit_iso_escape_detection = 0;
11225
11226   DEFVAR_BOOL ("inhibit-null-byte-detection",
11227                inhibit_null_byte_detection,
11228                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11229 By default, Emacs treats text that contains null bytes as binary data,
11230 and does not attempt to decode it.  The effect is as if you specified
11231 `no-conversion' for reading that text.
11232
11233 Set this to non-nil when a regular text happens to include null bytes.
11234 Examples are Index nodes of Info files and null-byte delimited output
11235 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11236 decode text as usual.  */);
11237   inhibit_null_byte_detection = 0;
11238
11239   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11240                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11241 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11242   disable_ascii_optimization = 0;
11243
11244   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11245                doc: /* Char table for translating self-inserting characters.
11246 This is applied to the result of input methods, not their input.
11247 See also `keyboard-translate-table'.
11248
11249 Use of this variable for character code unification was rendered
11250 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11251 internal character representation.  */);
11252   Vtranslation_table_for_input = Qnil;
11253
11254   Lisp_Object args[coding_arg_undecided_max];
11255   memclear (args, sizeof args);
11256
11257   Lisp_Object plist[] =
11258     {
11259       QCname,
11260       args[coding_arg_name] = Qno_conversion,
11261       QCmnemonic,
11262       args[coding_arg_mnemonic] = make_number ('='),
11263       intern_c_string (":coding-type"),
11264       args[coding_arg_coding_type] = Qraw_text,
11265       QCascii_compatible_p,
11266       args[coding_arg_ascii_compatible_p] = Qt,
11267       QCdefault_char,
11268       args[coding_arg_default_char] = make_number (0),
11269       intern_c_string (":for-unibyte"),
11270       args[coding_arg_for_unibyte] = Qt,
11271       intern_c_string (":docstring"),
11272       (build_pure_c_string
11273        ("Do no conversion.\n"
11274         "\n"
11275         "When you visit a file with this coding, the file is read into a\n"
11276         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11277         "character.")),
11278       intern_c_string (":eol-type"),
11279       args[coding_arg_eol_type] = Qunix,
11280     };
11281   args[coding_arg_plist] = CALLMANY (Flist, plist);
11282   Fdefine_coding_system_internal (coding_arg_max, args);
11283
11284   plist[1] = args[coding_arg_name] = Qundecided;
11285   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11286   plist[5] = args[coding_arg_coding_type] = Qundecided;
11287   /* This is already set.
11288      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11289   plist[8] = intern_c_string (":charset-list");
11290   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11291   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11292   plist[13] = build_pure_c_string ("No conversion on encoding, "
11293                                    "automatic conversion on decoding.");
11294   plist[15] = args[coding_arg_eol_type] = Qnil;
11295   args[coding_arg_plist] = CALLMANY (Flist, plist);
11296   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11297   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11298   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11299
11300   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11301
11302   for (int i = 0; i < coding_category_max; i++)
11303     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11304
11305 #if defined (DOS_NT)
11306   system_eol_type = Qdos;
11307 #else
11308   system_eol_type = Qunix;
11309 #endif
11310   staticpro (&system_eol_type);
11311 }
11312
11313 char *
11314 emacs_strerror (int error_number)
11315 {
11316   char *str;
11317
11318   synchronize_system_messages_locale ();
11319   str = strerror (error_number);
11320
11321   if (! NILP (Vlocale_coding_system))
11322     {
11323       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11324                                                       Vlocale_coding_system,
11325                                                       0);
11326       str = SSDATA (dec);
11327     }
11328
11329   return str;
11330 }
11331
11332 #endif /* emacs */