code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
   60   stored in the hash table Vcoding_system_hash_table.  The conversion
   61   from coding system symbol to attributes vector is done by looking up
   62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
  255   overlapped, which means that we can produce an encoded text until it
  256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 /* Format of end-of-line decided by system.  This is Qunix on
 307    Unix and Mac, Qdos on DOS/Windows.
 308    This has an effect only for external encoding (i.e. for output to
 309    file and process), not for in-buffer or Lisp string encoding.  */
 310 static Lisp_Object system_eol_type;
 311
 312 #ifdef emacs
 313
 314 /* Coding-systems are handed between Emacs Lisp programs and C internal
 315    routines by the following three variables.  */
 316 /* Coding system to be used to encode text for terminal display when
 317    terminal coding system is nil.  */
 318 struct coding_system safe_terminal_coding;
 319
 320 #endif /* emacs */
 321
 322 /* Two special coding systems.  */
 323 static Lisp_Object Vsjis_coding_system;
 324 static Lisp_Object Vbig5_coding_system;
 325
 326 /* ISO2022 section */
 327
 328 #define CODING_ISO_INITIAL(coding, reg)                 \
 329   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 330                      coding_attr_iso_initial),          \
 331                reg)))
 332
 333
 334 #define CODING_ISO_REQUEST(coding, charset_id)          \
 335   (((charset_id) <= (coding)->max_charset_id            \
 336     ? ((coding)->safe_charsets[charset_id] != 255       \
 337        ? (coding)->safe_charsets[charset_id]            \
 338        : -1)                                            \
 339     : -1))
 340
 341
 342 #define CODING_ISO_FLAGS(coding)        \
 343   ((coding)->spec.iso_2022.flags)
 344 #define CODING_ISO_DESIGNATION(coding, reg)     \
 345   ((coding)->spec.iso_2022.current_designation[reg])
 346 #define CODING_ISO_INVOCATION(coding, plane)    \
 347   ((coding)->spec.iso_2022.current_invocation[plane])
 348 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 349   ((coding)->spec.iso_2022.single_shifting)
 350 #define CODING_ISO_BOL(coding)  \
 351   ((coding)->spec.iso_2022.bol)
 352 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 353   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 354    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 355 #define CODING_ISO_CMP_STATUS(coding)   \
 356   (&(coding)->spec.iso_2022.cmp_status)
 357 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 358   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 359 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 360   ((coding)->spec.iso_2022.embedded_utf_8)
 361
 362 /* Control characters of ISO2022.  */
 363                         /* code */      /* function */
 364 #define ISO_CODE_SO     0x0E            /* shift-out */
 365 #define ISO_CODE_SI     0x0F            /* shift-in */
 366 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 367 #define ISO_CODE_ESC    0x1B            /* escape */
 368 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 369 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 370 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 371
  372 /* Every (1-byte) code of ISO2022 is classified into one of the
  373    following classes.  */
  374 enum iso_code_class_type
  375   {
  376     ISO_control_0,              /* Control codes in the range
  377                                    0x00..0x1F and 0x7F, except for the
  378                                    following 4 codes.  */
  379     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  380     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  381     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  382     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  383     ISO_control_1,              /* Control codes in the range
  384                                    0x80..0x9F, except for the
  385                                    following 3 codes.  */
  386     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  387     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  388     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  389     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  390     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  391     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  392     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  393   };
 394
  395 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  396     `iso-flags' attribute of an iso2022 coding system.  */
 397
 398 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 399    instead of the correct short-form sequence (e.g. ESC $ A).  */
 400 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 401
 402 /* If set, reset graphic planes and registers at end-of-line to the
 403    initial state.  */
 404 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 405
 406 /* If set, reset graphic planes and registers before any control
 407    characters to the initial state.  */
 408 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 409
 410 /* If set, encode by 7-bit environment.  */
 411 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 412
 413 /* If set, use locking-shift function.  */
 414 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 415
 416 /* If set, use single-shift function.  Overwrite
 417    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 418 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 419
 420 /* If set, use designation escape sequence.  */
 421 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 422
 423 /* If set, produce revision number sequence.  */
 424 #define CODING_ISO_FLAG_REVISION        0x0080
 425
 426 /* If set, produce ISO6429's direction specifying sequence.  */
 427 #define CODING_ISO_FLAG_DIRECTION       0x0100
 428
 429 /* If set, assume designation states are reset at beginning of line on
 430    output.  */
 431 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 432
 433 /* If set, designation sequence should be placed at beginning of line
 434    on output.  */
 435 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 436
 437 /* If set, do not encode unsafe characters on output.  */
 438 #define CODING_ISO_FLAG_SAFE            0x0800
 439
 440 /* If set, extra latin codes (128..159) are accepted as a valid code
 441    on input.  */
 442 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 443
 444 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 445
 446 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 447
 448 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 449
 450 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 451
 452 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 453
 454 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 455
 456 /* A character to be produced on output if encoding of the original
 457    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 458 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 459
 460 /* UTF-8 section */
 461 #define CODING_UTF_8_BOM(coding)        \
 462   ((coding)->spec.utf_8_bom)
 463
 464 /* UTF-16 section */
 465 #define CODING_UTF_16_BOM(coding)       \
 466   ((coding)->spec.utf_16.bom)
 467
 468 #define CODING_UTF_16_ENDIAN(coding)    \
 469   ((coding)->spec.utf_16.endian)
 470
 471 #define CODING_UTF_16_SURROGATE(coding) \
 472   ((coding)->spec.utf_16.surrogate)
 473
 474
 475 /* CCL section */
 476 #define CODING_CCL_DECODER(coding)      \
 477   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 478 #define CODING_CCL_ENCODER(coding)      \
 479   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 480 #define CODING_CCL_VALIDS(coding)                                          \
 481   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 482
 483 /* Index for each coding category in `coding_categories' */
 484
 485 enum coding_category
 486   {
 487     coding_category_iso_7,
 488     coding_category_iso_7_tight,
 489     coding_category_iso_8_1,
 490     coding_category_iso_8_2,
 491     coding_category_iso_7_else,
 492     coding_category_iso_8_else,
 493     coding_category_utf_8_auto,
 494     coding_category_utf_8_nosig,
 495     coding_category_utf_8_sig,
 496     coding_category_utf_16_auto,
 497     coding_category_utf_16_be,
 498     coding_category_utf_16_le,
 499     coding_category_utf_16_be_nosig,
 500     coding_category_utf_16_le_nosig,
 501     coding_category_charset,
 502     coding_category_sjis,
 503     coding_category_big5,
 504     coding_category_ccl,
 505     coding_category_emacs_mule,
 506     /* All above are targets of code detection.  */
 507     coding_category_raw_text,
 508     coding_category_undecided,
 509     coding_category_max
 510   };
 511
 512 /* Definitions of flag bits used in detect_coding_XXXX.  */
 513 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 514 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 515 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 516 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 517 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 518 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 519 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 520 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 521 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 522 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 523 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 524 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 525 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 526 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 527 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 528 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 529 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 530 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 531 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 532 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 533
  534 /* This value is returned if detect_coding_mask () finds nothing other
  535    than ASCII characters.  */
 536 #define CATEGORY_MASK_ANY               \
 537   (CATEGORY_MASK_ISO_7                  \
 538    | CATEGORY_MASK_ISO_7_TIGHT          \
 539    | CATEGORY_MASK_ISO_8_1              \
 540    | CATEGORY_MASK_ISO_8_2              \
 541    | CATEGORY_MASK_ISO_7_ELSE           \
 542    | CATEGORY_MASK_ISO_8_ELSE           \
 543    | CATEGORY_MASK_UTF_8_AUTO           \
 544    | CATEGORY_MASK_UTF_8_NOSIG          \
 545    | CATEGORY_MASK_UTF_8_SIG            \
 546    | CATEGORY_MASK_UTF_16_AUTO          \
 547    | CATEGORY_MASK_UTF_16_BE            \
 548    | CATEGORY_MASK_UTF_16_LE            \
 549    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 550    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 551    | CATEGORY_MASK_CHARSET              \
 552    | CATEGORY_MASK_SJIS                 \
 553    | CATEGORY_MASK_BIG5                 \
 554    | CATEGORY_MASK_CCL                  \
 555    | CATEGORY_MASK_EMACS_MULE)
 556
 557
 558 #define CATEGORY_MASK_ISO_7BIT \
 559   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 560
 561 #define CATEGORY_MASK_ISO_8BIT \
 562   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 563
 564 #define CATEGORY_MASK_ISO_ELSE \
 565   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 566
 567 #define CATEGORY_MASK_ISO_ESCAPE        \
 568   (CATEGORY_MASK_ISO_7                  \
 569    | CATEGORY_MASK_ISO_7_TIGHT          \
 570    | CATEGORY_MASK_ISO_7_ELSE           \
 571    | CATEGORY_MASK_ISO_8_ELSE)
 572
 573 #define CATEGORY_MASK_ISO       \
 574   (  CATEGORY_MASK_ISO_7BIT     \
 575      | CATEGORY_MASK_ISO_8BIT   \
 576      | CATEGORY_MASK_ISO_ELSE)
 577
 578 #define CATEGORY_MASK_UTF_16            \
 579   (CATEGORY_MASK_UTF_16_AUTO            \
 580    | CATEGORY_MASK_UTF_16_BE            \
 581    | CATEGORY_MASK_UTF_16_LE            \
 582    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 583    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 584
 585 #define CATEGORY_MASK_UTF_8     \
 586   (CATEGORY_MASK_UTF_8_AUTO     \
 587    | CATEGORY_MASK_UTF_8_NOSIG  \
 588    | CATEGORY_MASK_UTF_8_SIG)
 589
 590 /* Table of coding categories (Lisp symbols).  This variable is for
 591    internal use only.  */
 592 static Lisp_Object Vcoding_category_table;
 593
 594 /* Table of coding-categories ordered by priority.  */
 595 static enum coding_category coding_priorities[coding_category_max];
 596
 597 /* Nth element is a coding context for the coding system bound to the
 598    Nth coding category.  */
 599 static struct coding_system coding_categories[coding_category_max];
 600
 601 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 602
 603 static int
 604 encode_inhibit_flag (Lisp_Object flag)
 605 {
 606   return NILP (flag) ? -1 : EQ (flag, Qt);
 607 }
 608
 609 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 610    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 611
 612 static bool
 613 inhibit_flag (int encoded_flag, bool var)
 614 {
 615   return 0 < encoded_flag + var;
 616 }
 617
 618 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 619   do {                                                  \
 620     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 621     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 622   } while (0)
 623
 624 static void
 625 CHECK_NATNUM_CAR (Lisp_Object x)
 626 {
 627   Lisp_Object tmp = XCAR (x);
 628   CHECK_NATNUM (tmp);
 629   XSETCAR (x, tmp);
 630 }
 631
 632 static void
 633 CHECK_NATNUM_CDR (Lisp_Object x)
 634 {
 635   Lisp_Object tmp = XCDR (x);
 636   CHECK_NATNUM (tmp);
 637   XSETCDR (x, tmp);
 638 }
 639
 640 /* True if CODING's destination can be grown.  */
 641
 642 static bool
 643 growable_destination (struct coding_system *coding)
 644 {
 645   return STRINGP (coding->dst_object) || BUFFERP (coding->dst_object);
 646 }
 647
 648
  649 /* Safely get one byte from the source text pointed by SRC which ends
  650    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
  651    SRC has reached SRC_END, jump to 'no_more_source'; before jumping,
  652    record CODING_RESULT_INSUFFICIENT_SRC in CODING when SRC_BASE < SRC.
  653    If MULTIBYTEP and the byte has its high bit set: for a lead byte of
  654    0xC0 or 0xC1, set C to ((C & 1) << 6) | the next byte; otherwise set
  655    C to the negative value of the character code found at SRC and
         record CODING_RESULT_INVALID_SRC in CODING.  The caller should
         declare and set these variables appropriately in advance:
              coding, src, src_end, src_base, multibytep, consumed_chars */
  656
  657 #define ONE_MORE_BYTE(c)                                \
  658   do {                                                  \
  659     if (src == src_end)                                 \
  660       {                                                 \
  661         if (src_base < src)                             \
  662           record_conversion_result                      \
  663             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  664         goto no_more_source;                            \
  665       }                                                 \
  666     c = *src++;                                         \
  667     if (multibytep && (c & 0x80))                       \
  668       {                                                 \
  669         if ((c & 0xFE) == 0xC0)                         \
  670           c = ((c & 1) << 6) | *src++;                  \
  671         else                                            \
  672           {                                             \
  673             src--;                                      \
  674             c = - string_char (src, &src, NULL);        \
  675             record_conversion_result                    \
  676               (coding, CODING_RESULT_INVALID_SRC);      \
  677           }                                             \
  678       }                                                 \
  679     consumed_chars++;                                   \
  680   } while (0)
 681
  682 /* Safely get two bytes from the source text pointed by SRC which ends
  683    at SRC_END, and set C1 and C2 to those bytes while skipping the
  684    heading multibyte characters.  If there are not enough bytes in the
  685    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
  686    a multibyte character (other than one led by 0xC0 or 0xC1) is found
  687    for C2, set C2 to -1; its trailing bytes are not consumed.  The
  688    caller should declare and set these variables in advance:
  689         src, src_end, multibytep
  690    It is intended that this macro is used in detect_coding_utf_16.  */
  691
  692 #define TWO_MORE_BYTES(c1, c2)                          \
  693   do {                                                  \
  694     do {                                                \
  695       if (src == src_end)                               \
  696         goto no_more_source;                            \
  697       c1 = *src++;                                      \
  698       if (multibytep && (c1 & 0x80))                    \
  699         {                                               \
  700           if ((c1 & 0xFE) == 0xC0)                      \
  701             c1 = ((c1 & 1) << 6) | *src++;              \
  702           else                                          \
  703             {                                           \
  704               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
  705               c1 = -1;                                  \
  706             }                                           \
  707         }                                               \
  708     } while (c1 < 0);                                   \
  709     if (src == src_end)                                 \
  710       goto no_more_source;                              \
  711     c2 = *src++;                                        \
  712     if (multibytep && (c2 & 0x80))                      \
  713       {                                                 \
  714         if ((c2 & 0xFE) == 0xC0)                        \
  715           c2 = ((c2 & 1) << 6) | *src++;                \
  716         else                                            \
  717           c2 = -1;                                      \
  718       }                                                 \
  719   } while (0)
 720
 721
 722 /* Store a byte C in the place pointed by DST and increment DST to the
 723    next free point, and increment PRODUCED_CHARS.  The caller should
 724    assure that C is 0..127, and declare and set the variable `dst'
 725    appropriately in advance.
 726 */
 727
 728
 729 #define EMIT_ONE_ASCII_BYTE(c)  \
 730   do {                          \
 731     produced_chars++;           \
 732     *dst++ = (c);               \
 733   } while (0)
 734
 735
 736 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 737
 738 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 739   do {                                  \
 740     produced_chars += 2;                \
 741     *dst++ = (c1), *dst++ = (c2);       \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed by DST and increment DST to the
 746    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 747    store in an appropriate multibyte form.  The caller should
 748    declare and set the variables `dst' and `multibytep' appropriately
 749    in advance.  */
 750
 751 #define EMIT_ONE_BYTE(c)                \
 752   do {                                  \
 753     produced_chars++;                   \
 754     if (multibytep)                     \
 755       {                                 \
 756         unsigned ch = (c);              \
 757         if (ch >= 0x80)                 \
 758           ch = BYTE8_TO_CHAR (ch);      \
 759         CHAR_STRING_ADVANCE (ch, dst);  \
 760       }                                 \
 761     else                                \
 762       *dst++ = (c);                     \
 763   } while (0)
 764
 765
 766 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 767
 768 #define EMIT_TWO_BYTES(c1, c2)          \
 769   do {                                  \
 770     produced_chars += 2;                \
 771     if (multibytep)                     \
 772       {                                 \
 773         unsigned ch;                    \
 774                                         \
 775         ch = (c1);                      \
 776         if (ch >= 0x80)                 \
 777           ch = BYTE8_TO_CHAR (ch);      \
 778         CHAR_STRING_ADVANCE (ch, dst);  \
 779         ch = (c2);                      \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       {                                 \
 786         *dst++ = (c1);                  \
 787         *dst++ = (c2);                  \
 788       }                                 \
 789   } while (0)
 790
 791
/* Emit the three bytes C1, C2 and C3 in that order: C1 through
   EMIT_ONE_BYTE, then C2 and C3 through EMIT_TWO_BYTES.  The caller
   must provide whatever variables those two macros require.  */
#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 797
 798
/* Emit the four bytes C1, C2, C3 and C4 in that order, as two
   consecutive EMIT_TWO_BYTES.  The caller must provide whatever
   variables EMIT_TWO_BYTES requires.  */
#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 804
 805
 806 static void
 807 record_conversion_result (struct coding_system *coding,
 808                           enum coding_result_code result)
 809 {
 810   coding->result = result;
 811   switch (result)
 812     {
 813     case CODING_RESULT_INSUFFICIENT_SRC:
 814       Vlast_code_conversion_error = Qinsufficient_source;
 815       break;
 816     case CODING_RESULT_INVALID_SRC:
 817       Vlast_code_conversion_error = Qinvalid_source;
 818       break;
 819     case CODING_RESULT_INTERRUPT:
 820       Vlast_code_conversion_error = Qinterrupted;
 821       break;
 822     case CODING_RESULT_INSUFFICIENT_DST:
 823       /* Don't record this error in Vlast_code_conversion_error
 824          because it happens just temporarily and is resolved when the
 825          whole conversion is finished.  */
 826       break;
 827     case CODING_RESULT_SUCCESS:
 828       break;
 829     default:
 830       Vlast_code_conversion_error = intern ("Unknown error");
 831     }
 832 }
 833
 834 /* These wrapper macros are used to preserve validity of pointers into
 835    buffer text across calls to decode_char, encode_char, etc, which
 836    could cause relocation of buffers if it loads a charset map,
 837    because loading a charset map allocates large structures.  */
 838
 839 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 840   do {                                                                       \
 841     ptrdiff_t offset;                                                        \
 842                                                                              \
 843     charset_map_loaded = 0;                                                  \
 844     c = DECODE_CHAR (charset, code);                                         \
 845     if (charset_map_loaded                                                   \
 846         && (offset = coding_change_source (coding)))                         \
 847       {                                                                      \
 848         src += offset;                                                       \
 849         src_base += offset;                                                  \
 850         src_end += offset;                                                   \
 851       }                                                                      \
 852   } while (0)
 853
 854 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 855   do {                                                                  \
 856     ptrdiff_t offset;                                                   \
 857                                                                         \
 858     charset_map_loaded = 0;                                             \
 859     code = ENCODE_CHAR (charset, c);                                    \
 860     if (charset_map_loaded                                              \
 861         && (offset = coding_change_destination (coding)))               \
 862       {                                                                 \
 863         dst += offset;                                                  \
 864         dst_end += offset;                                              \
 865       }                                                                 \
 866   } while (0)
 867
 868 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 869   do {                                                                  \
 870     ptrdiff_t offset;                                                   \
 871                                                                         \
 872     charset_map_loaded = 0;                                             \
 873     charset = char_charset (c, charset_list, code_return);              \
 874     if (charset_map_loaded                                              \
 875         && (offset = coding_change_destination (coding)))               \
 876       {                                                                 \
 877         dst += offset;                                                  \
 878         dst_end += offset;                                              \
 879       }                                                                 \
 880   } while (0)
 881
 882 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 883   do {                                                                  \
 884     ptrdiff_t offset;                                                   \
 885                                                                         \
 886     charset_map_loaded = 0;                                             \
 887     result = CHAR_CHARSET_P (c, charset);                               \
 888     if (charset_map_loaded                                              \
 889         && (offset = coding_change_destination (coding)))               \
 890       {                                                                 \
 891         dst += offset;                                                  \
 892         dst_end += offset;                                              \
 893       }                                                                 \
 894   } while (0)
 895
 896
 897 /* If there are at least BYTES length of room at dst, allocate memory
 898    for coding->destination and update dst and dst_end.  We don't have
 899    to take care of coding->source which will be relocated.  It is
 900    handled by calling coding_set_source in encode_coding.  */
 901
 902 #define ASSURE_DESTINATION(bytes)                               \
 903   do {                                                          \
 904     if (dst + (bytes) >= dst_end)                               \
 905       {                                                         \
 906         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 907                                                                 \
 908         dst = alloc_destination (coding, more_bytes, dst);      \
 909         dst_end = coding->destination + coding->dst_bytes;      \
 910       }                                                         \
 911   } while (0)
 912
 913
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  Hence this macro now
   simply expands to CHAR_STRING_ADVANCE with the same arguments.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 920
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
   nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  Hence
   this macro now simply expands to STRING_CHAR_ADVANCE with the same
   argument.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 927
 928 /* Set coding->source from coding->src_object.  */
 929
 930 static void
 931 coding_set_source (struct coding_system *coding)
 932 {
 933   if (BUFFERP (coding->src_object))
 934     {
 935       struct buffer *buf = XBUFFER (coding->src_object);
 936
 937       if (coding->src_pos < 0)
 938         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 939       else
 940         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 941     }
 942   else if (STRINGP (coding->src_object))
 943     {
 944       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 945     }
 946   else
 947     {
 948       /* Otherwise, the source is C string and is never relocated
 949          automatically.  Thus we don't have to update anything.  */
 950     }
 951 }
 952
 953
 954 /* Set coding->source from coding->src_object, and return how many
 955    bytes coding->source was changed.  */
 956
 957 static ptrdiff_t
 958 coding_change_source (struct coding_system *coding)
 959 {
 960   const unsigned char *orig = coding->source;
 961   coding_set_source (coding);
 962   return coding->source - orig;
 963 }
 964
 965
 966 /* Set coding->destination from coding->dst_object.  */
 967
 968 static void
 969 coding_set_destination (struct coding_system *coding)
 970 {
 971   if (BUFFERP (coding->dst_object))
 972     {
 973       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 974         {
 975           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 976           coding->dst_bytes = (GAP_END_ADDR
 977                                - (coding->src_bytes - coding->consumed)
 978                                - coding->destination);
 979         }
 980       else
 981         {
 982           /* We are sure that coding->dst_pos_byte is before the gap
 983              of the buffer. */
 984           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 985                                  + coding->dst_pos_byte - BEG_BYTE);
 986           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 987                                - coding->destination);
 988         }
 989     }
 990   else
 991     {
 992       /* Otherwise, the destination is C string and is never relocated
 993          automatically.  Thus we don't have to update anything.  */
 994     }
 995 }
 996
 997
 998 /* Set coding->destination from coding->dst_object, and return how
 999    many bytes coding->destination was changed.  */
1000
1001 static ptrdiff_t
1002 coding_change_destination (struct coding_system *coding)
1003 {
1004   const unsigned char *orig = coding->destination;
1005   coding_set_destination (coding);
1006   return coding->destination - orig;
1007 }
1008
1009
1010 static void
1011 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1012 {
1013   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1014     string_overflow ();
1015   coding->destination = xrealloc (coding->destination,
1016                                   coding->dst_bytes + bytes);
1017   coding->dst_bytes += bytes;
1018 }
1019
/* Make room for BYTES more bytes in the gap of the buffer that is the
   destination of CODING.  GAP_HEAD_USED is the number of bytes at the
   head of the gap that already hold produced data; alloc_destination
   passes DST - BUF_GPT_ADDR (buf) for it.  It is only used when source
   and destination are the same object.  */
static void
coding_alloc_by_making_gap (struct coding_system *coding,
                            ptrdiff_t gap_head_used, ptrdiff_t bytes)
{
  if (EQ (coding->src_object, coding->dst_object))
    {
      /* The gap may contain the produced data at the head and not-yet
         consumed data at the tail.  To preserve those data, we at
         first make the gap size to zero, then increase the gap
         size.  */
      ptrdiff_t add = GAP_SIZE;

      /* Step the gap position past the produced data, then declare
         the gap empty and count its former ADD bytes as text.  */
      GPT += gap_head_used, GPT_BYTE += gap_head_used;
      GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      /* Undo the temporary bookkeeping in reverse order; the gap is
         now its former size plus whatever make_gap added.  */
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
      GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
    }
  else
    make_gap_1 (XBUFFER (coding->dst_object), bytes);
}
1041
1042
1043 static unsigned char *
1044 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1045                    unsigned char *dst)
1046 {
1047   ptrdiff_t offset = dst - coding->destination;
1048
1049   if (BUFFERP (coding->dst_object))
1050     {
1051       struct buffer *buf = XBUFFER (coding->dst_object);
1052
1053       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1054     }
1055   else
1056     coding_alloc_by_realloc (coding, nbytes);
1057   coding_set_destination (coding);
1058   dst = coding->destination + offset;
1059   return dst;
1060 }
1061
1062 /** Macros for annotations.  */
1063
1064 /* An annotation data is stored in the array coding->charbuf in this
1065    format:
1066      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1067    LENGTH is the number of elements in the annotation.
1068    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1069    NCHARS is the number of characters in the text annotated.
1070
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1075      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1076
1077    NBYTES is the number of bytes specified in the header part of
1078    old-style emacs-mule encoding, or 0 for the other kind of
1079    composition.
1080
1081    METHOD is one of enum composition_method.
1082
1083    Optional COMPOSITION-COMPONENTS are characters and composition
1084    rules.
1085
1086    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1087    follows.
1088
   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1092
/* Maximum length of the header of annotation data, counted in charbuf
   elements.  For example, ADD_COMPOSITION_DATA stores a header of
   this many elements: -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and
   METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1095
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated.  The caller
   must have the variable `coding' in scope.

   The expansion does not end in a semicolon; the invocation supplies
   it.  That way the macro behaves like a single statement and can be
   used, for instance, as the unbraced body of an `if' that has an
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1103
/* Store at BUF the five-element header of a composition annotation:
   the generic header written by ADD_ANNOTATION_DATA (with length 5
   and CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and
   METHOD.  BUF is advanced past the stored elements.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1110
1111
/* Store at BUF the four-element charset annotation: the generic
   header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK) followed by the charset ID.  BUF is
   advanced past the stored elements.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1117
1118
/* Bit flags for coding->eol_seen.  Each flag stands for one
   end-of-line sequence that was met in the text (LF, CR, or CR
   followed by LF); they are combined with bitwise OR.
   EOL_SEEN_NONE is the value with no flag set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1125
1126 \f
1127 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1128
1129
1130
1131 \f
1132 /*** 3. UTF-8 ***/
1133
1134 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1135    Return true if a text is encoded in UTF-8.  */
1136
/* Classification of an octet C of UTF-8 text by its high bits:
     0xxxxxxx  a character of one octet
     10xxxxxx  an extra (trailing) octet
     110xxxxx  leading octet of a 2-octet sequence
     1110xxxx  leading octet of a 3-octet sequence
     11110xxx  leading octet of a 4-octet sequence
     111110xx  leading octet of a 5-octet sequence
   Only UTF_8_1_OCTET_P compares the whole value; the others look at
   nothing but the bits selected by their mask.  */

/* True if the bits of C selected by MASK are equal to PATTERN.  */
#define UTF_8_OCTET_BITS_P(c, mask, pattern) (((c) & (mask)) == (pattern))

#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_BITS_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xFC, 0xF8)
1143
/* The three octets of the UTF-8 signature (BOM), in the order in
   which they are looked for at the head of a text.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1147
1148 /* Unlike the other detect_coding_XXX, this function counts the number
1149    of characters and checks the EOL format.  */
1150
1151 static bool
1152 detect_coding_utf_8 (struct coding_system *coding,
1153                      struct coding_detection_info *detect_info)
1154 {
1155   const unsigned char *src = coding->source, *src_base;
1156   const unsigned char *src_end = coding->source + coding->src_bytes;
1157   bool multibytep = coding->src_multibyte;
1158   ptrdiff_t consumed_chars = 0;
1159   bool bom_found = 0;
1160   ptrdiff_t nchars = coding->head_ascii;
1161   int eol_seen = coding->eol_seen;
1162
1163   detect_info->checked |= CATEGORY_MASK_UTF_8;
1164   /* A coding system of this category is always ASCII compatible.  */
1165   src += nchars;
1166
1167   if (src == coding->source     /* BOM should be at the head.  */
1168       && src + 3 < src_end      /* BOM is 3-byte long.  */
1169       && src[0] == UTF_8_BOM_1
1170       && src[1] == UTF_8_BOM_2
1171       && src[2] == UTF_8_BOM_3)
1172     {
1173       bom_found = 1;
1174       src += 3;
1175       nchars++;
1176     }
1177
1178   while (1)
1179     {
1180       int c, c1, c2, c3, c4;
1181
1182       src_base = src;
1183       ONE_MORE_BYTE (c);
1184       if (c < 0 || UTF_8_1_OCTET_P (c))
1185         {
1186           nchars++;
1187           if (c == '\r')
1188             {
1189               if (src < src_end && *src == '\n')
1190                 {
1191                   eol_seen |= EOL_SEEN_CRLF;
1192                   src++;
1193                   nchars++;
1194                 }
1195               else
1196                 eol_seen |= EOL_SEEN_CR;
1197             }
1198           else if (c == '\n')
1199             eol_seen |= EOL_SEEN_LF;
1200           continue;
1201         }
1202       ONE_MORE_BYTE (c1);
1203       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1204         break;
1205       if (UTF_8_2_OCTET_LEADING_P (c))
1206         {
1207           nchars++;
1208           continue;
1209         }
1210       ONE_MORE_BYTE (c2);
1211       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1212         break;
1213       if (UTF_8_3_OCTET_LEADING_P (c))
1214         {
1215           nchars++;
1216           continue;
1217         }
1218       ONE_MORE_BYTE (c3);
1219       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1220         break;
1221       if (UTF_8_4_OCTET_LEADING_P (c))
1222         {
1223           nchars++;
1224           continue;
1225         }
1226       ONE_MORE_BYTE (c4);
1227       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1228         break;
1229       if (UTF_8_5_OCTET_LEADING_P (c))
1230         {
1231           nchars++;
1232           continue;
1233         }
1234       break;
1235     }
1236   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1237   return 0;
1238
1239  no_more_source:
1240   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1241     {
1242       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1243       return 0;
1244     }
1245   if (bom_found)
1246     {
1247       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1248       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1249     }
1250   else
1251     {
1252       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1253       if (nchars < src_end - coding->source)
1254         /* The found characters are less than source bytes, which
1255            means that we found a valid non-ASCII characters.  */
1256         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1257     }
1258   coding->detected_utf8_bytes = src_base - coding->source;
1259   coding->detected_utf8_chars = nchars;
1260   return 1;
1261 }
1262
1263
/* Decode the UTF-8 source text of CODING into coding->charbuf.

   Decoding starts at coding->source + coding->consumed and stores one
   int per decoded character, starting at coding->charbuf +
   coding->charbuf_used.  It goes on until the source is exhausted or
   charbuf is full.  On exit coding->consumed, coding->consumed_char
   and coding->charbuf_used describe what was completely processed; a
   sequence that is cut off by the end of the source is left
   unconsumed.

   A byte sequence that is rejected (malformed, overlong, a surrogate,
   or beyond MAX_CHAR) is not dropped: its first byte is stored on its
   own, as is if ASCII_CHAR_P holds and through BYTE8_TO_CHAR
   otherwise, and decoding resumes with the byte after it.

   The variables `src', `src_base', `src_end', `multibytep' and
   `consumed_chars' and the label `no_more_source' are here for
   ONE_MORE_BYTE, which is defined outside this function and is relied
   on to jump to `no_more_source' when the source is exhausted.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte that was read ahead after a CR when EOL_DOS, or -1 if
     there is none pending.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to be without BOM, step over a
     leading BOM.  Whenever the first three bytes turn out not to be
     one, SRC is put back to where it was.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    /* Redundant with the unconditional assignment
                       right after this block.  */
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, don't look for a BOM again on later calls.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before this point is completely processed; these
         two values are what gets reported on exit.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been stored yet;
             report it as not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* In the simple case, rapidly handle ordinary characters: copy
         bytes without the 0x80 bit straight into charbuf, four per
         round, for as long as more than six bytes of source and six
         elements of charbuf remain.  */
      if (multibytep && ! eol_dos
          && charbuf < charbuf_end - 6 && src < src_end - 6)
        {
          while (charbuf < charbuf_end - 6 && src < src_end - 6)
            {
              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;

              c1 = *src;
              if (c1 & 0x80)
                break;
              src++;
              consumed_chars++;
              *charbuf++ = c1;
            }
          /* If we handled at least one character, restart the main loop.  */
          if (src != src_base)
            continue;
        }

      /* Take the byte that was read ahead after a CR, if any, before
         reading a new one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored with its sign flipped.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* With DOS end-of-lines, read the byte that follows a CR
             right away; it is handed out by the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          /* A multi-octet sequence.  Each further octet must be an
             extra octet, and the assembled code must not be
             representable by a shorter sequence.  */
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, also reject codes
                             above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the rejected sequence and store its
         first byte alone.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
    }

 no_more_source:
  /* Report progress up to the last completely processed sequence.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1451
1452
1453 static bool
1454 encode_coding_utf_8 (struct coding_system *coding)
1455 {
1456   bool multibytep = coding->dst_multibyte;
1457   int *charbuf = coding->charbuf;
1458   int *charbuf_end = charbuf + coding->charbuf_used;
1459   unsigned char *dst = coding->destination + coding->produced;
1460   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1461   ptrdiff_t produced_chars = 0;
1462   int c;
1463
1464   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1465     {
1466       ASSURE_DESTINATION (3);
1467       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1468       CODING_UTF_8_BOM (coding) = utf_without_bom;
1469     }
1470
1471   if (multibytep)
1472     {
1473       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1474
1475       while (charbuf < charbuf_end)
1476         {
1477           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1478
1479           ASSURE_DESTINATION (safe_room);
1480           c = *charbuf++;
1481           if (CHAR_BYTE8_P (c))
1482             {
1483               c = CHAR_TO_BYTE8 (c);
1484               EMIT_ONE_BYTE (c);
1485             }
1486           else
1487             {
1488               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1489               for (p = str; p < pend; p++)
1490                 EMIT_ONE_BYTE (*p);
1491             }
1492         }
1493     }
1494   else
1495     {
1496       int safe_room = MAX_MULTIBYTE_LENGTH;
1497
1498       while (charbuf < charbuf_end)
1499         {
1500           ASSURE_DESTINATION (safe_room);
1501           c = *charbuf++;
1502           if (CHAR_BYTE8_P (c))
1503             *dst++ = CHAR_TO_BYTE8 (c);
1504           else
1505             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1506         }
1507       produced_chars = dst - (coding->destination + coding->produced);
1508     }
1509   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1510   coding->produced_char += produced_chars;
1511   coding->produced = dst - coding->destination;
1512   return 0;
1513 }
1514
1515
1516 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1517    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1518
/* True if the bits of VAL selected by the mask 0xFC00 are those of a
   high surrogate, i.e. the low 16 bits of VAL are in the range
   0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Likewise for a low surrogate, i.e. the low 16 bits of VAL are in
   the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1524
1525
1526 static bool
1527 detect_coding_utf_16 (struct coding_system *coding,
1528                       struct coding_detection_info *detect_info)
1529 {
1530   const unsigned char *src = coding->source;
1531   const unsigned char *src_end = coding->source + coding->src_bytes;
1532   bool multibytep = coding->src_multibyte;
1533   int c1, c2;
1534
1535   detect_info->checked |= CATEGORY_MASK_UTF_16;
1536   if (coding->mode & CODING_MODE_LAST_BLOCK
1537       && (coding->src_chars & 1))
1538     {
1539       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1540       return 0;
1541     }
1542
1543   TWO_MORE_BYTES (c1, c2);
1544   if ((c1 == 0xFF) && (c2 == 0xFE))
1545     {
1546       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1547                              | CATEGORY_MASK_UTF_16_AUTO);
1548       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1549                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1550                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1551     }
1552   else if ((c1 == 0xFE) && (c2 == 0xFF))
1553     {
1554       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1555                              | CATEGORY_MASK_UTF_16_AUTO);
1556       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1557                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1558                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1559     }
1560   else if (c2 < 0)
1561     {
1562       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1563       return 0;
1564     }
1565   else
1566     {
1567       /* We check the dispersion of Eth and Oth bytes where E is even and
1568          O is odd.  If both are high, we assume binary data.*/
1569       unsigned char e[256], o[256];
1570       unsigned e_num = 1, o_num = 1;
1571
1572       memset (e, 0, 256);
1573       memset (o, 0, 256);
1574       e[c1] = 1;
1575       o[c2] = 1;
1576
1577       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1578                                 |CATEGORY_MASK_UTF_16_BE
1579                                 | CATEGORY_MASK_UTF_16_LE);
1580
1581       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1582              != CATEGORY_MASK_UTF_16)
1583         {
1584           TWO_MORE_BYTES (c1, c2);
1585           if (c2 < 0)
1586             break;
1587           if (! e[c1])
1588             {
1589               e[c1] = 1;
1590               e_num++;
1591               if (e_num >= 128)
1592                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1593             }
1594           if (! o[c2])
1595             {
1596               o[c2] = 1;
1597               o_num++;
1598               if (o_num >= 128)
1599                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1600             }
1601         }
1602       return 0;
1603     }
1604
1605  no_more_source:
1606   return 1;
1607 }
1608
1609 static void
1610 decode_coding_utf_16 (struct coding_system *coding)
1611 {
1612   const unsigned char *src = coding->source + coding->consumed;
1613   const unsigned char *src_end = coding->source + coding->src_bytes;
1614   const unsigned char *src_base;
1615   int *charbuf = coding->charbuf + coding->charbuf_used;
1616   /* We may produces at most 3 chars in one loop.  */
1617   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1618   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1619   bool multibytep = coding->src_multibyte;
1620   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1621   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1622   int surrogate = CODING_UTF_16_SURROGATE (coding);
1623   bool eol_dos
1624     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1625   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1626
1627   if (bom == utf_with_bom)
1628     {
1629       int c, c1, c2;
1630
1631       src_base = src;
1632       ONE_MORE_BYTE (c1);
1633       ONE_MORE_BYTE (c2);
1634       c = (c1 << 8) | c2;
1635
1636       if (endian == utf_16_big_endian
1637           ? c != 0xFEFF : c != 0xFFFE)
1638         {
1639           /* The first two bytes are not BOM.  Treat them as bytes
1640              for a normal character.  */
1641           src = src_base;
1642         }
1643       CODING_UTF_16_BOM (coding) = utf_without_bom;
1644     }
1645   else if (bom == utf_detect_bom)
1646     {
1647       /* We have already tried to detect BOM and failed in
1648          detect_coding.  */
1649       CODING_UTF_16_BOM (coding) = utf_without_bom;
1650     }
1651
1652   while (1)
1653     {
1654       int c, c1, c2;
1655
1656       src_base = src;
1657       consumed_chars_base = consumed_chars;
1658
1659       if (charbuf >= charbuf_end)
1660         {
1661           if (byte_after_cr1 >= 0)
1662             src_base -= 2;
1663           break;
1664         }
1665
1666       if (byte_after_cr1 >= 0)
1667         c1 = byte_after_cr1, byte_after_cr1 = -1;
1668       else
1669         ONE_MORE_BYTE (c1);
1670       if (c1 < 0)
1671         {
1672           *charbuf++ = -c1;
1673           continue;
1674         }
1675       if (byte_after_cr2 >= 0)
1676         c2 = byte_after_cr2, byte_after_cr2 = -1;
1677       else
1678         ONE_MORE_BYTE (c2);
1679       if (c2 < 0)
1680         {
1681           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1682           *charbuf++ = -c2;
1683           continue;
1684         }
1685       c = (endian == utf_16_big_endian
1686            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1687
1688       if (surrogate)
1689         {
1690           if (! UTF_16_LOW_SURROGATE_P (c))
1691             {
1692               if (endian == utf_16_big_endian)
1693                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1694               else
1695                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1696               *charbuf++ = c1;
1697               *charbuf++ = c2;
1698               if (UTF_16_HIGH_SURROGATE_P (c))
1699                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1700               else
1701                 *charbuf++ = c;
1702             }
1703           else
1704             {
1705               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1706               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1707               *charbuf++ = 0x10000 + c;
1708             }
1709         }
1710       else
1711         {
1712           if (UTF_16_HIGH_SURROGATE_P (c))
1713             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1714           else
1715             {
1716               if (eol_dos && c == '\r')
1717                 {
1718                   ONE_MORE_BYTE (byte_after_cr1);
1719                   ONE_MORE_BYTE (byte_after_cr2);
1720                 }
1721               *charbuf++ = c;
1722             }
1723         }
1724     }
1725
1726  no_more_source:
1727   coding->consumed_char += consumed_chars_base;
1728   coding->consumed = src_base - coding->source;
1729   coding->charbuf_used = charbuf - coding->charbuf;
1730 }
1731
1732 static bool
1733 encode_coding_utf_16 (struct coding_system *coding)
1734 {
1735   bool multibytep = coding->dst_multibyte;
1736   int *charbuf = coding->charbuf;
1737   int *charbuf_end = charbuf + coding->charbuf_used;
1738   unsigned char *dst = coding->destination + coding->produced;
1739   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1740   int safe_room = 8;
1741   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1742   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1743   ptrdiff_t produced_chars = 0;
1744   int c;
1745
1746   if (bom != utf_without_bom)
1747     {
1748       ASSURE_DESTINATION (safe_room);
1749       if (big_endian)
1750         EMIT_TWO_BYTES (0xFE, 0xFF);
1751       else
1752         EMIT_TWO_BYTES (0xFF, 0xFE);
1753       CODING_UTF_16_BOM (coding) = utf_without_bom;
1754     }
1755
1756   while (charbuf < charbuf_end)
1757     {
1758       ASSURE_DESTINATION (safe_room);
1759       c = *charbuf++;
1760       if (c > MAX_UNICODE_CHAR)
1761         c = coding->default_char;
1762
1763       if (c < 0x10000)
1764         {
1765           if (big_endian)
1766             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1767           else
1768             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1769         }
1770       else
1771         {
1772           int c1, c2;
1773
1774           c -= 0x10000;
1775           c1 = (c >> 10) + 0xD800;
1776           c2 = (c & 0x3FF) + 0xDC00;
1777           if (big_endian)
1778             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1779           else
1780             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1781         }
1782     }
1783   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1784   coding->produced = dst - coding->destination;
1785   coding->produced_char += produced_chars;
1786   return 0;
1787 }
1788
1789 \f
1790 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1791
1792 /* Emacs' internal format for representation of multiple character
1793    sets is a kind of multi-byte encoding, i.e. characters are
1794    represented by variable-length sequences of one-byte codes.
1795
1796    ASCII characters and control characters (e.g. `tab', `newline') are
1797    represented by one-byte sequences which are their ASCII codes, in
1798    the range 0x00 through 0x7F.
1799
1800    8-bit characters of the range 0x80..0x9F are represented by
1801    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1802    code + 0x20).
1803
1804    8-bit characters of the range 0xA0..0xFF are represented by
1805    one-byte sequences which are their 8-bit code.
1806
1807    The other characters are represented by a sequence of `base
1808    leading-code', optional `extended leading-code', and one or two
1809    `position-code's.  The length of the sequence is determined by the
1810    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1811    whereas extended leading-code and position-code take the range 0xA0
1812    through 0xFF.  See `charset.h' for more details about leading-code
1813    and position-code.
1814
1815    --- CODE RANGE of Emacs' internal format ---
1816    character set        range
1817    -------------        -----
1818    ascii                0x00..0x7F
1819    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1820    eight-bit-graphic    0xA0..0xFF
1821    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1822    ---------------------------------------------
1823
1824    As this is the internal character representation, the format is
1825    usually not used externally (i.e. in a file or in data sent to a
1826    process).  It is possible, however, to have text in this format
1827    externally (e.g. by encoding it with the coding system `emacs-mule').
1828
1829    In that case, a sequence of one-byte codes has a slightly different
1830    form.
1831
1832    At first, all characters in eight-bit-control are represented by
1833    one-byte sequences which are their 8-bit code.
1834
1835    Next, character composition data are represented by the byte
1836    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1837    where,
1838         METHOD is 0xF2 plus one of composition method (enum
1839         composition_method),
1840
1841         BYTES is 0xA0 plus a byte length of this composition data,
1842
1843         CHARS is 0xA0 plus a number of characters composed by this
1844         data,
1845
1846         COMPONENTs are characters of multibyte form or composition
1847         rules encoded by two-byte of ASCII codes.
1848
1849    In addition, for backward compatibility, the following formats are
1850    also recognized as composition data on decoding.
1851
1852    0x80 MSEQ ...
1853    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1854
1855    Here,
1856         MSEQ is a multibyte form, but in one of these special formats:
1857           ASCII: 0xA0 ASCII_CODE+0x80,
1858           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1859         RULE is a one byte code of the range 0xA0..0xF0 that
1860         represents a composition rule.
1861   */
1862
/* Table indexed by a byte value.  The code in this file reads the
   entry for the first byte of an emacs-mule sequence as the total
   number of bytes in that sequence (the decoder handles the values 1
   through 4 and aborts on anything else).
   NOTE(review): the table is filled in elsewhere; confirm the exact
   contents against the initialization code.  */
1863 char emacs_mule_bytes[256];
1864
1865
1866 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1867    Return true if a text is encoded in 'emacs-mule'.  */
1868
1869 static bool
1870 detect_coding_emacs_mule (struct coding_system *coding,
1871                           struct coding_detection_info *detect_info)
1872 {
1873   const unsigned char *src = coding->source, *src_base;
1874   const unsigned char *src_end = coding->source + coding->src_bytes;
1875   bool multibytep = coding->src_multibyte;
1876   ptrdiff_t consumed_chars = 0;
1877   int c;
1878   int found = 0;
1879
1880   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1881   /* A coding system of this category is always ASCII compatible.  */
1882   src += coding->head_ascii;
1883
1884   while (1)
1885     {
1886       src_base = src;
1887       ONE_MORE_BYTE (c);
1888       if (c < 0)
1889         continue;
1890       if (c == 0x80)
1891         {
1892           /* Perhaps the start of composite character.  We simply skip
1893              it because analyzing it is too heavy for detecting.  But,
1894              at least, we check that the composite character
1895              constitutes of more than 4 bytes.  */
1896           const unsigned char *src_start;
1897
1898         repeat:
1899           src_start = src;
1900           do
1901             {
1902               ONE_MORE_BYTE (c);
1903             }
1904           while (c >= 0xA0);
1905
1906           if (src - src_start <= 4)
1907             break;
1908           found = CATEGORY_MASK_EMACS_MULE;
1909           if (c == 0x80)
1910             goto repeat;
1911         }
1912
1913       if (c < 0x80)
1914         {
1915           if (c < 0x20
1916               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1917             break;
1918         }
1919       else
1920         {
1921           int more_bytes = emacs_mule_bytes[c] - 1;
1922
1923           while (more_bytes > 0)
1924             {
1925               ONE_MORE_BYTE (c);
1926               if (c < 0xA0)
1927                 {
1928                   src--;        /* Unread the last byte.  */
1929                   break;
1930                 }
1931               more_bytes--;
1932             }
1933           if (more_bytes != 0)
1934             break;
1935           found = CATEGORY_MASK_EMACS_MULE;
1936         }
1937     }
1938   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1939   return 0;
1940
1941  no_more_source:
1942   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1943     {
1944       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1945       return 0;
1946     }
1947   detect_info->found |= found;
1948   return 1;
1949 }
1950
1951
1952 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1953    character.  If CMP_STATUS indicates that we must expect MSEQ or
1954    RULE described above, decode it and return the negative value of
1955    the decoded character or rule.  If an invalid byte is found, return
1956    -1.  If SRC is too short, return -2.  */
1957
1958 static int
1959 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1960                  int *nbytes, int *nchars, int *id,
1961                  struct composition_status *cmp_status)
1962 {
1963   const unsigned char *src_end = coding->source + coding->src_bytes;
1964   const unsigned char *src_base = src;
1965   bool multibytep = coding->src_multibyte;
1966   int charset_ID;
1967   unsigned code;
1968   int c;
1969   ptrdiff_t consumed_chars = 0;
1970   bool mseq_found = 0;
1971
1972   ONE_MORE_BYTE (c);
1973   if (c < 0)
1974     {
1975       c = -c;
1976       charset_ID = emacs_mule_charset[0];
1977     }
1978   else
1979     {
1980       if (c >= 0xA0)
1981         {
1982           if (cmp_status->state != COMPOSING_NO
1983               && cmp_status->old_form)
1984             {
1985               if (cmp_status->state == COMPOSING_CHAR)
1986                 {
1987                   if (c == 0xA0)
1988                     {
1989                       ONE_MORE_BYTE (c);
1990                       c -= 0x80;
1991                       if (c < 0)
1992                         goto invalid_code;
1993                     }
1994                   else
1995                     c -= 0x20;
1996                   mseq_found = 1;
1997                 }
1998               else
1999                 {
2000                   *nbytes = src - src_base;
2001                   *nchars = consumed_chars;
2002                   return -c;
2003                 }
2004             }
2005           else
2006             goto invalid_code;
2007         }
2008
2009       switch (emacs_mule_bytes[c])
2010         {
2011         case 2:
2012           if ((charset_ID = emacs_mule_charset[c]) < 0)
2013             goto invalid_code;
2014           ONE_MORE_BYTE (c);
2015           if (c < 0xA0)
2016             goto invalid_code;
2017           code = c & 0x7F;
2018           break;
2019
2020         case 3:
2021           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2022               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2023             {
2024               ONE_MORE_BYTE (c);
2025               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2026                 goto invalid_code;
2027               ONE_MORE_BYTE (c);
2028               if (c < 0xA0)
2029                 goto invalid_code;
2030               code = c & 0x7F;
2031             }
2032           else
2033             {
2034               if ((charset_ID = emacs_mule_charset[c]) < 0)
2035                 goto invalid_code;
2036               ONE_MORE_BYTE (c);
2037               if (c < 0xA0)
2038                 goto invalid_code;
2039               code = (c & 0x7F) << 8;
2040               ONE_MORE_BYTE (c);
2041               if (c < 0xA0)
2042                 goto invalid_code;
2043               code |= c & 0x7F;
2044             }
2045           break;
2046
2047         case 4:
2048           ONE_MORE_BYTE (c);
2049           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2050             goto invalid_code;
2051           ONE_MORE_BYTE (c);
2052           if (c < 0xA0)
2053             goto invalid_code;
2054           code = (c & 0x7F) << 8;
2055           ONE_MORE_BYTE (c);
2056           if (c < 0xA0)
2057             goto invalid_code;
2058           code |= c & 0x7F;
2059           break;
2060
2061         case 1:
2062           code = c;
2063           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2064           break;
2065
2066         default:
2067           emacs_abort ();
2068         }
2069       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2070                           CHARSET_FROM_ID (charset_ID), code, c);
2071       if (c < 0)
2072         goto invalid_code;
2073     }
2074   *nbytes = src - src_base;
2075   *nchars = consumed_chars;
2076   if (id)
2077     *id = charset_ID;
2078   return (mseq_found ? -c : c);
2079
2080  no_more_source:
2081   return -2;
2082
2083  invalid_code:
2084   return -1;
2085 }
2086
2087
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2089
2090 /* Handle these composition sequences ('|': the end of header elements,
2091    BYTES and CHARS >= 0xA0):
2092
2093    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2094    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2095    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2096
2097    and these old forms:
2098
2099    (4) relative composition: 0x80 | MSEQ ... MSEQ
2100    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2101
2102    When the starter 0x80 and the following header elements are found,
2103    this annotation header is produced.
2104
2105         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2106
2107    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2108    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2109
2110    Then, upon reading the following elements, these codes are produced
2111    until the composition end is found:
2112
2113    (1) CHAR ... CHAR
2114    (2) ALT ... ALT CHAR ... CHAR
2115    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2116    (4) CHAR ... CHAR
2117    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2118
2119    When the composition end is found, LENGTH and NCHARS in the
2120    annotation header are updated as below:
2121
2122    (1) LENGTH: unchanged, NCHARS: unchanged
2123    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2124    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2125    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2126    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2127
2128    If an error is found while composing, the annotation header is
2129    changed to the original composition header (plus filler -1s) as
2130    below:
2131
2132    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2133    (5)          [ 0x80 0xFF -1 -1 -1 ]
2134
2135    and the sequence [ -2 DECODED-RULE ] is changed to the original
2136    byte sequence as below:
2137         o the original byte sequence is B: [ B -1 ]
2138         o the original byte sequence is B1 B2: [ B1 B2 ]
2139
2140    Most of the routines are implemented by macros because many
2141    variables and labels in the caller decode_coding_emacs_mule must be
2142    accessible, and they are usually called just once (thus doesn't
2143    increase the size of compiled object).  */
2144
/* Decode the one-byte composition rule C of an Emacs 20 style
   composition sequence and store the result in RULE.

   C is rebased in place by subtracting 0xA0 and must then lie in the
   range 0..80; otherwise control jumps to `invalid_code'.  The
   rebased value is split into two base-9 digits, a digit equal to 4
   is replaced by 10, and the pair is packed with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (0 <= c && c < 81))                           \
      goto invalid_code;                                \
    gref = (c / 9 == 4) ? 10 : c / 9;                   \
    nref = (c % 9 == 4) ? 10 : c % 9;                   \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2161
2162
/* Decode the two-byte composition rule of an Emacs 21 style
   composition sequence and store the result in RULE.

   C holds the first rule byte on entry; the second one is fetched
   with ONE_MORE_BYTE, which leaves it in C.  Each byte minus 0x20
   must lie in the range 0..80; otherwise control jumps to
   `invalid_code'.  The two rebased values are packed with
   COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (0 <= gref && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (0 <= nref && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2180
2181
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 + METHOD).  The next two bytes at SRC are (0xA0 + BYTES) and
   (0xA0 + CHARS), where BYTES is the byte length of this composition
   information and CHARS is the number of characters composed by this
   composition.

   BYTES must be at least 3, and exactly 4 for a relative composition;
   CHARS must be positive and below MAX_COMPOSITION_COMPONENTS.  Any
   violation jumps to `invalid_code'.  On success CMP_STATUS is set up
   for the new composition and the annotation header is appended to
   CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (0 < nchars && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2213
2214
/* Start of Emacs 20 style format for relative composition: reset
   CMP_STATUS for an old-form relative composition with no characters
   or components counted yet, and append to CHARBUF an annotation
   header whose NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2226
2227
/* Start of Emacs 20 style format for rule-base composition: reset
   CMP_STATUS for an old-form composition with rules with no
   characters or components counted yet, and append to CHARBUF an
   annotation header whose NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2239
2240
/* Handle the byte that follows the composition starter 0x80.  The
   byte is read into C and selects the composition format:

     0xF2 + a method from COMPOSITION_RELATIVE through
       COMPOSITION_WITH_RULE_ALTCHARS   Emacs 21 style header
     0xFF                               Emacs 20 rule-base composition
     0xA0..0xBF                         Emacs 20 relative composition;
                                        SRC is rewound so that the byte
                                        is read again as a component

   A negative value or any other byte jumps to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2264
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For the old
   form, its third element (NCHARS) is set to the number of characters
   read; otherwise, for any method above COMPOSITION_RELATIVE, its
   first element is set to NCHARS minus the length.  The state is then
   reset to COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int idx = - cmp_status->length;                             \
    int *header = charbuf + idx;                                \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2275
2276
2277 static int
2278 emacs_mule_finish_composition (int *charbuf,
2279                                struct composition_status *cmp_status)
2280 {
2281   int idx = - cmp_status->length;
2282   int new_chars;
2283
2284   if (cmp_status->old_form && cmp_status->nchars > 0)
2285     {
2286       charbuf[idx + 2] = cmp_status->nchars;
2287       new_chars = 0;
2288       if (cmp_status->method == COMPOSITION_WITH_RULE
2289           && cmp_status->state == COMPOSING_CHAR)
2290         {
2291           /* The last rule was invalid.  */
2292           int rule = charbuf[-1] + 0xA0;
2293
2294           charbuf[-2] = BYTE8_TO_CHAR (rule);
2295           charbuf[-1] = -1;
2296           new_chars = 1;
2297         }
2298     }
2299   else
2300     {
2301       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2302
2303       if (cmp_status->method == COMPOSITION_WITH_RULE)
2304         {
2305           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2306           charbuf[idx++] = -3;
2307           charbuf[idx++] = 0;
2308           new_chars = 1;
2309         }
2310       else
2311         {
2312           int nchars = charbuf[idx + 1] + 0xA0;
2313           int nbytes = charbuf[idx + 2] + 0xA0;
2314
2315           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2316           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2317           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2318           charbuf[idx++] = -1;
2319           new_chars = 4;
2320         }
2321     }
2322   cmp_status->state = COMPOSING_NO;
2323   return new_chars;
2324 }
2325
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  Do nothing otherwise.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2331
2332
2333 static void
2334 decode_coding_emacs_mule (struct coding_system *coding)
2335 {
2336   const unsigned char *src = coding->source + coding->consumed;
2337   const unsigned char *src_end = coding->source + coding->src_bytes;
2338   const unsigned char *src_base;
2339   int *charbuf = coding->charbuf + coding->charbuf_used;
2340   /* We may produce two annotations (charset and composition) in one
2341      loop and one more charset annotation at the end.  */
2342   int *charbuf_end
2343     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2344       /* We can produce up to 2 characters in a loop.  */
2345       - 1;
2346   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2347   bool multibytep = coding->src_multibyte;
2348   ptrdiff_t char_offset = coding->produced_char;
2349   ptrdiff_t last_offset = char_offset;
2350   int last_id = charset_ascii;
2351   bool eol_dos
2352     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2353   int byte_after_cr = -1;
2354   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2355
2356   if (cmp_status->state != COMPOSING_NO)
2357     {
2358       int i;
2359
2360       if (charbuf_end - charbuf < cmp_status->length)
2361         emacs_abort ();
2362       for (i = 0; i < cmp_status->length; i++)
2363         *charbuf++ = cmp_status->carryover[i];
2364       coding->annotated = 1;
2365     }
2366
2367   while (1)
2368     {
2369       int c, id IF_LINT (= 0);
2370
2371       src_base = src;
2372       consumed_chars_base = consumed_chars;
2373
2374       if (charbuf >= charbuf_end)
2375         {
2376           if (byte_after_cr >= 0)
2377             src_base--;
2378           break;
2379         }
2380
2381       if (byte_after_cr >= 0)
2382         c = byte_after_cr, byte_after_cr = -1;
2383       else
2384         ONE_MORE_BYTE (c);
2385
2386       if (c < 0 || c == 0x80)
2387         {
2388           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2389           if (c < 0)
2390             {
2391               *charbuf++ = -c;
2392               char_offset++;
2393             }
2394           else
2395             DECODE_EMACS_MULE_COMPOSITION_START ();
2396           continue;
2397         }
2398
2399       if (c < 0x80)
2400         {
2401           if (eol_dos && c == '\r')
2402             ONE_MORE_BYTE (byte_after_cr);
2403           id = charset_ascii;
2404           if (cmp_status->state != COMPOSING_NO)
2405             {
2406               if (cmp_status->old_form)
2407                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2409                 cmp_status->ncomps--;
2410             }
2411         }
2412       else
2413         {
2414           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2415           /* emacs_mule_char can load a charset map from a file, which
2416              allocates a large structure and might cause buffer text
2417              to be relocated as result.  Thus, we need to remember the
2418              original pointer to buffer text, and fix up all related
2419              pointers after the call.  */
2420           const unsigned char *orig = coding->source;
2421           ptrdiff_t offset;
2422
2423           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2424                                cmp_status);
2425           offset = coding->source - orig;
2426           if (offset)
2427             {
2428               src += offset;
2429               src_base += offset;
2430               src_end += offset;
2431             }
2432           if (c < 0)
2433             {
2434               if (c == -1)
2435                 goto invalid_code;
2436               if (c == -2)
2437                 break;
2438             }
2439           src = src_base + nbytes;
2440           consumed_chars = consumed_chars_base + nchars;
2441           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2442             cmp_status->ncomps -= nchars;
2443         }
2444
2445       /* Now if C >= 0, we found a normally encoded character, if C <
2446          0, we found an old-style composition component character or
2447          rule.  */
2448
2449       if (cmp_status->state == COMPOSING_NO)
2450         {
2451           if (last_id != id)
2452             {
2453               if (last_id != charset_ascii)
2454                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2455                                   last_id);
2456               last_id = id;
2457               last_offset = char_offset;
2458             }
2459           *charbuf++ = c;
2460           char_offset++;
2461         }
2462       else if (cmp_status->state == COMPOSING_CHAR)
2463         {
2464           if (cmp_status->old_form)
2465             {
2466               if (c >= 0)
2467                 {
2468                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2469                   *charbuf++ = c;
2470                   char_offset++;
2471                 }
2472               else
2473                 {
2474                   *charbuf++ = -c;
2475                   cmp_status->nchars++;
2476                   cmp_status->length++;
2477                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2478                     EMACS_MULE_COMPOSITION_END ();
2479                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2480                     cmp_status->state = COMPOSING_RULE;
2481                 }
2482             }
2483           else
2484             {
2485               *charbuf++ = c;
2486               cmp_status->length++;
2487               cmp_status->nchars--;
2488               if (cmp_status->nchars == 0)
2489                 EMACS_MULE_COMPOSITION_END ();
2490             }
2491         }
2492       else if (cmp_status->state == COMPOSING_RULE)
2493         {
2494           int rule;
2495
2496           if (c >= 0)
2497             {
2498               EMACS_MULE_COMPOSITION_END ();
2499               *charbuf++ = c;
2500               char_offset++;
2501             }
2502           else
2503             {
2504               c = -c;
2505               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2506               if (rule < 0)
2507                 goto invalid_code;
2508               *charbuf++ = -2;
2509               *charbuf++ = rule;
2510               cmp_status->length += 2;
2511               cmp_status->state = COMPOSING_CHAR;
2512             }
2513         }
2514       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2515         {
2516           *charbuf++ = c;
2517           cmp_status->length++;
2518           if (cmp_status->ncomps == 0)
2519             cmp_status->state = COMPOSING_CHAR;
2520           else if (cmp_status->ncomps > 0)
2521             {
2522               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2523                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2524             }
2525           else
2526             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2527         }
2528       else                      /* COMPOSING_COMPONENT_RULE */
2529         {
2530           int rule;
2531
2532           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2533           if (rule < 0)
2534             goto invalid_code;
2535           *charbuf++ = -2;
2536           *charbuf++ = rule;
2537           cmp_status->length += 2;
2538           cmp_status->ncomps--;
2539           if (cmp_status->ncomps > 0)
2540             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2541           else
2542             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2543         }
2544       continue;
2545
2546     invalid_code:
2547       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2548       src = src_base;
2549       consumed_chars = consumed_chars_base;
2550       ONE_MORE_BYTE (c);
2551       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2552       char_offset++;
2553     }
2554
2555  no_more_source:
2556   if (cmp_status->state != COMPOSING_NO)
2557     {
2558       if (coding->mode & CODING_MODE_LAST_BLOCK)
2559         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560       else
2561         {
2562           int i;
2563
2564           charbuf -= cmp_status->length;
2565           for (i = 0; i < cmp_status->length; i++)
2566             cmp_status->carryover[i] = charbuf[i];
2567         }
2568     }
2569   if (last_id != charset_ascii)
2570     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2571   coding->consumed_char += consumed_chars_base;
2572   coding->consumed = src_base - coding->source;
2573   coding->charbuf_used = charbuf - coding->charbuf;
2574 }
2575
2576
/* Store in CODES[0] and CODES[1] the emacs-mule leading bytes for the
   charset whose emacs-mule ID is ID:

     ID <  0xA0          ->  ID,   0   (CODES[1] == 0: no second byte)
     0xA0 <= ID < 0xE0   ->  0x9A, ID
     0xE0 <= ID < 0xF0   ->  0x9B, ID
     0xF0 <= ID < 0xF5   ->  0x9C, ID
     0xF5 <= ID          ->  0x9D, ID

   ID is evaluated more than once, so it must not have side effects.
   The macro expands to a single statement without a trailing
   semicolon, so it can be used safely as the body of an `if' that has
   an `else'; the caller supplies the semicolon.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2590
2591
2592 static bool
2593 encode_coding_emacs_mule (struct coding_system *coding)
2594 {
2595   bool multibytep = coding->dst_multibyte;
2596   int *charbuf = coding->charbuf;
2597   int *charbuf_end = charbuf + coding->charbuf_used;
2598   unsigned char *dst = coding->destination + coding->produced;
2599   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2600   int safe_room = 8;
2601   ptrdiff_t produced_chars = 0;
2602   Lisp_Object attrs, charset_list;
2603   int c;
2604   int preferred_charset_id = -1;
2605
2606   CODING_GET_INFO (coding, attrs, charset_list);
2607   if (! EQ (charset_list, Vemacs_mule_charset_list))
2608     {
2609       charset_list = Vemacs_mule_charset_list;
2610       ASET (attrs, coding_attr_charset_list, charset_list);
2611     }
2612
2613   while (charbuf < charbuf_end)
2614     {
2615       ASSURE_DESTINATION (safe_room);
2616       c = *charbuf++;
2617
2618       if (c < 0)
2619         {
2620           /* Handle an annotation.  */
2621           switch (*charbuf)
2622             {
2623             case CODING_ANNOTATE_COMPOSITION_MASK:
2624               /* Not yet implemented.  */
2625               break;
2626             case CODING_ANNOTATE_CHARSET_MASK:
2627               preferred_charset_id = charbuf[3];
2628               if (preferred_charset_id >= 0
2629                   && NILP (Fmemq (make_number (preferred_charset_id),
2630                                   charset_list)))
2631                 preferred_charset_id = -1;
2632               break;
2633             default:
2634               emacs_abort ();
2635             }
2636           charbuf += -c - 1;
2637           continue;
2638         }
2639
2640       if (ASCII_CHAR_P (c))
2641         EMIT_ONE_ASCII_BYTE (c);
2642       else if (CHAR_BYTE8_P (c))
2643         {
2644           c = CHAR_TO_BYTE8 (c);
2645           EMIT_ONE_BYTE (c);
2646         }
2647       else
2648         {
2649           struct charset *charset;
2650           unsigned code;
2651           int dimension;
2652           int emacs_mule_id;
2653           unsigned char leading_codes[2];
2654
2655           if (preferred_charset_id >= 0)
2656             {
2657               bool result;
2658
2659               charset = CHARSET_FROM_ID (preferred_charset_id);
2660               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2661               if (result)
2662                 code = ENCODE_CHAR (charset, c);
2663               else
2664                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2665                                      &code, charset);
2666             }
2667           else
2668             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669                                  &code, charset);
2670           if (! charset)
2671             {
2672               c = coding->default_char;
2673               if (ASCII_CHAR_P (c))
2674                 {
2675                   EMIT_ONE_ASCII_BYTE (c);
2676                   continue;
2677                 }
2678               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2679                                    &code, charset);
2680             }
2681           dimension = CHARSET_DIMENSION (charset);
2682           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2683           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2684           EMIT_ONE_BYTE (leading_codes[0]);
2685           if (leading_codes[1])
2686             EMIT_ONE_BYTE (leading_codes[1]);
2687           if (dimension == 1)
2688             EMIT_ONE_BYTE (code | 0x80);
2689           else
2690             {
2691               code |= 0x8080;
2692               EMIT_ONE_BYTE (code >> 8);
2693               EMIT_ONE_BYTE (code & 0xFF);
2694             }
2695         }
2696     }
2697   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2698   coding->produced_char += produced_chars;
2699   coding->produced = dst - coding->destination;
2700   return 0;
2701 }
2702
2703 \f
2704 /*** 7. ISO2022 handlers ***/
2705
2706 /* The following note describes the coding system ISO2022 briefly.
2707    Since the intention of this note is to help understand the
2708    functions in this file, some parts are NOT ACCURATE or are OVERLY
2709    SIMPLIFIED.  For thorough understanding, please refer to the
2710    original document of ISO2022.  This is equivalent to the standard
2711    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2712
2713    ISO2022 provides many mechanisms to encode several character sets
2714    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2715    is encoded using bytes less than 128.  This may make the encoded
2716    text a little bit longer, but the text passes more easily through
2717    several types of gateway, some of which strip off the MSB (Most
2718    Significant Bit).
2719
2720    There are two kinds of character sets: control character sets and
2721    graphic character sets.  The former contain control characters such
2722    as `newline' and `escape' to provide control functions (control
2723    functions are also provided by escape sequences).  The latter
2724    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2725    two control character sets and many graphic character sets.
2726
2727    Graphic character sets are classified into one of the following
2728    four classes, according to the number of bytes (DIMENSION) and
2729    number of characters in one dimension (CHARS) of the set:
2730    - DIMENSION1_CHARS94
2731    - DIMENSION1_CHARS96
2732    - DIMENSION2_CHARS94
2733    - DIMENSION2_CHARS96
2734
2735    In addition, each character set is assigned an identification tag,
2736    unique for each set, called the "final character" (denoted as <F>
2737    hereafter).  The <F> of each character set is decided by ECMA(*)
2738    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2739    (0x30..0x3F are for private use only).
2740
2741    Note (*): ECMA = European Computer Manufacturers Association
2742
2743    Here are examples of graphic character sets [NAME(<F>)]:
2744         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2745         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2746         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2747         o DIMENSION2_CHARS96 -- none for the moment
2748
2749    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2750         C0 [0x00..0x1F] -- control character plane 0
2751         GL [0x20..0x7F] -- graphic character plane 0
2752         C1 [0x80..0x9F] -- control character plane 1
2753         GR [0xA0..0xFF] -- graphic character plane 1
2754
2755    A control character set is directly designated and invoked to C0 or
2756    C1 by an escape sequence.  The most common case is that:
2757    - ISO646's  control character set is designated/invoked to C0, and
2758    - ISO6429's control character set is designated/invoked to C1,
2759    and usually these designations/invocations are omitted in encoded
2760    text.  In a 7-bit environment, only C0 can be used, and a control
2761    character for C1 is encoded by an appropriate escape sequence to
2762    fit into the environment.  All control characters for C1 are
2763    defined to have corresponding escape sequences.
2764
2765    A graphic character set is at first designated to one of four
2766    graphic registers (G0 through G3), then these graphic registers are
2767    invoked to GL or GR.  These designations and invocations can be
2768    done independently.  The most common case is that G0 is invoked to
2769    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2770    these invocations and designations are omitted in encoded text.
2771    In a 7-bit environment, only GL can be used.
2772
2773    When a graphic character set of CHARS94 is invoked to GL, codes
2774    0x20 and 0x7F of the GL area work as control characters SPACE and
2775    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2776    be used.
2777
2778    There are two ways of invocation: locking-shift and single-shift.
2779    With locking-shift, the invocation lasts until the next different
2780    invocation, whereas with single-shift, the invocation affects the
2781    following character only and doesn't affect the locking-shift
2782    state.  Invocations are done by the following control characters or
2783    escape sequences:
2784
2785    ----------------------------------------------------------------------
2786    abbrev  function                  cntrl escape seq   description
2787    ----------------------------------------------------------------------
2788    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2789    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2790    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2791    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2792    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2793    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2794    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2795    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2796    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2797    ----------------------------------------------------------------------
2798    (*) These are not used by any known coding system.
2799
2800    Control characters for these functions are defined by macros
2801    ISO_CODE_XXX in `coding.h'.
2802
2803    Designations are done by the following escape sequences:
2804    ----------------------------------------------------------------------
2805    escape sequence      description
2806    ----------------------------------------------------------------------
2807    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2808    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2809    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2810    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2811    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2812    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2813    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2814    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2815    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2816    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2817    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2818    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2819    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2820    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2821    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2822    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2823    ----------------------------------------------------------------------
2824
2825    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2826    of dimension 1, chars 94, and final character <F>, etc...
2827
2828    Note (*): Although these designations are not allowed in ISO2022,
2829    Emacs accepts them on decoding, and produces them on encoding
2830    CHARS96 character sets in a coding system which is characterized as
2831    7-bit environment, non-locking-shift, and non-single-shift.
2832
2833    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2834    '(' must be omitted.  We refer to this as "short-form" hereafter.
2835
2836    Now you may notice that there are a lot of ways of encoding the
2837    same multilingual text in ISO2022.  Actually, there exist many
2838    coding systems such as Compound Text (used in X11's inter client
2839    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2840    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2841    localized platforms), and all of these are variants of ISO2022.
2842
2843    In addition to the above, Emacs handles two more kinds of escape
2844    sequences: ISO6429's direction specification and Emacs' private
2845    sequence for specifying character composition.
2846
2847    ISO6429's direction specification takes the following form:
2848         o CSI ']'      -- end of the current direction
2849         o CSI '0' ']'  -- end of the current direction
2850         o CSI '1' ']'  -- start of left-to-right text
2851         o CSI '2' ']'  -- start of right-to-left text
2852    The control character CSI (0x9B: control sequence introducer) is
2853    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2854
2855    Character composition specification takes the following form:
2856         o ESC '0' -- start relative composition
2857         o ESC '1' -- end composition
2858         o ESC '2' -- start rule-base composition (*)
2859         o ESC '3' -- start relative composition with alternate chars  (**)
2860         o ESC '4' -- start rule-base composition with alternate chars  (**)
2861   Since these are not standard escape sequences of any ISO standard,
2862   the use of them with these meanings is restricted to Emacs only.
2863
2864   (*) This form is used only in Emacs 20.7 and older versions,
2865   but newer versions can safely decode it.
2866   (**) This form is used only in Emacs 21.1 and newer versions,
2867   and older versions can't decode it.
2868
2869   Here's a list of example usages of these composition escape
2870   sequences (categorized by `enum composition_method').
2871
2872   COMPOSITION_RELATIVE:
2873         ESC 0 CHAR [ CHAR ] ESC 1
2874   COMPOSITION_WITH_RULE:
2875         ESC 2 CHAR [ RULE CHAR ] ESC 1
2876   COMPOSITION_WITH_ALTCHARS:
2877         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2878   COMPOSITION_WITH_RULE_ALTCHARS:
2879         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2880
/* Table with one entry per possible byte value (0..255).
   NOTE(review): presumably gives the ISO-2022 code class of each byte;
   it is filled in and read outside this part of the file -- confirm.  */
2881 static enum iso_code_class_type iso_code_class[256];
2882
/* Return nonzero if the charset whose ID is ID is safe for CODING,
   i.e. ID is a valid index into CODING->safe_charsets (0 <= ID <=
   CODING->max_charset_id) and the byte stored there is not 255, the
   filler value that setup_iso_safe_charsets leaves for charsets to
   which no graphic register was assigned.

   A negative ID (the "no such charset" value produced by the charset
   lookup tables) is never safe; it must be rejected here rather than
   used as an array index.  ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  (0 <= (id)                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2886
2887 static void
2888 setup_iso_safe_charsets (Lisp_Object attrs)
2889 {
2890   Lisp_Object charset_list, safe_charsets;
2891   Lisp_Object request;
2892   Lisp_Object reg_usage;
2893   Lisp_Object tail;
2894   EMACS_INT reg94, reg96;
2895   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2896   int max_charset_id;
2897
2898   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2899   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2900       && ! EQ (charset_list, Viso_2022_charset_list))
2901     {
2902       charset_list = Viso_2022_charset_list;
2903       ASET (attrs, coding_attr_charset_list, charset_list);
2904       ASET (attrs, coding_attr_safe_charsets, Qnil);
2905     }
2906
2907   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2908     return;
2909
2910   max_charset_id = 0;
2911   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2912     {
2913       int id = XINT (XCAR (tail));
2914       if (max_charset_id < id)
2915         max_charset_id = id;
2916     }
2917
2918   safe_charsets = make_uninit_string (max_charset_id + 1);
2919   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2920   request = AREF (attrs, coding_attr_iso_request);
2921   reg_usage = AREF (attrs, coding_attr_iso_usage);
2922   reg94 = XINT (XCAR (reg_usage));
2923   reg96 = XINT (XCDR (reg_usage));
2924
2925   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2926     {
2927       Lisp_Object id;
2928       Lisp_Object reg;
2929       struct charset *charset;
2930
2931       id = XCAR (tail);
2932       charset = CHARSET_FROM_ID (XINT (id));
2933       reg = Fcdr (Fassq (id, request));
2934       if (! NILP (reg))
2935         SSET (safe_charsets, XINT (id), XINT (reg));
2936       else if (charset->iso_chars_96)
2937         {
2938           if (reg96 < 4)
2939             SSET (safe_charsets, XINT (id), reg96);
2940         }
2941       else
2942         {
2943           if (reg94 < 4)
2944             SSET (safe_charsets, XINT (id), reg94);
2945         }
2946     }
2947   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2948 }
2949
2950
2951 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2952    Return true if a text is encoded in one of ISO-2022 based coding
2953    systems.

        The scan over CODING's source keeps two category bit masks:
        `found' (categories for which positive evidence was seen) and
        `rejected' (categories that have been ruled out).

        When the scan leaves through the `no_more_source' label,
        `rejected' and `found & ~rejected' are merged into DETECT_INFO
        and 1 is returned.  When instead every ISO category has been
        rejected (`rejected == CATEGORY_MASK_ISO'), the whole
        CATEGORY_MASK_ISO is marked rejected in DETECT_INFO and 0 is
        returned.

        NOTE(review): ONE_MORE_BYTE is defined outside this function; it
        presumably jumps to `no_more_source' once SRC reaches SRC_END --
        confirm.  */
2954
2955 static bool
2956 detect_coding_iso_2022 (struct coding_system *coding,
2957                         struct coding_detection_info *detect_info)
2958 {
2959   const unsigned char *src = coding->source, *src_base = src;
2960   const unsigned char *src_end = coding->source + coding->src_bytes;
2961   bool multibytep = coding->src_multibyte;
2962   bool single_shifting = 0;
2963   int id;
2964   int c, c1;
2965   ptrdiff_t consumed_chars = 0;
2966   int i;
2967   int rejected = 0;
2968   int found = 0;
       /* -1 while outside a composition sequence; otherwise a running
          count for the composition currently being scanned.  */
2969   int composition_count = -1;
2970
2971   detect_info->checked |= CATEGORY_MASK_ISO;
2972
       /* For every ISO category that has a coding system assigned
          (id >= 0), refresh its cached safe-charsets pointer and
          highest valid charset ID from the coding system attributes.  */
2973   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2974     {
2975       struct coding_system *this = &(coding_categories[i]);
2976       Lisp_Object attrs, val;
2977
2978       if (this->id < 0)
2979         continue;
2980       attrs = CODING_ID_ATTRS (this->id);
2981       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2982           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2983         setup_iso_safe_charsets (attrs);
2984       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2985       this->max_charset_id = SCHARS (val) - 1;
2986       this->safe_charsets = SDATA (val);
2987     }
2988
2989   /* A coding system of this category is always ASCII compatible.  */
2990   src += coding->head_ascii;
2991
2992   while (rejected != CATEGORY_MASK_ISO)
2993     {
2994       src_base = src;
2995       ONE_MORE_BYTE (c);
2996       switch (c)
2997         {
2998         case ISO_CODE_ESC:
2999           if (inhibit_iso_escape_detection)
3000             break;
3001           single_shifting = 0;
3002           ONE_MORE_BYTE (c);
3003           if (c == 'N' || c == 'O')
3004             {
3005               /* ESC <Fe> for SS2 or SS3.  */
3006               single_shifting = 1;
3007               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3008             }
3009           else if (c == '1')
3010             {
3011               /* End of composition.  */
3012               if (composition_count < 0
3013                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3014                 /* Invalid */
3015                 break;
3016               composition_count = -1;
3017               found |= CATEGORY_MASK_ISO;
3018             }
3019           else if (c >= '0' && c <= '4')
3020             {
3021               /* ESC <Fp> for start/end composition.  */
3022               composition_count = 0;
3023             }
3024           else
3025             {
3026               if (c >= '(' && c <= '/')
3027                 {
3028                   /* Designation sequence for a charset of dimension 1.  */
3029                   ONE_MORE_BYTE (c1);
                       /* C is in '('..'/' here, so `c >= ','' is 0 for
                          '(' ')' '*' '+' and 1 for ',' '-' '.' '/',
                          which the designation table in the ISO2022
                          note above lists as the CHARS94 and CHARS96
                          intermediates respectively.  */
3030                   if (c1 < ' ' || c1 >= 0x80
3031                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3032                     {
3033                       /* Invalid designation sequence.  Just ignore.  */
3034                       if (c1 >= 0x80)
3035                         rejected |= (CATEGORY_MASK_ISO_7BIT
3036                                      | CATEGORY_MASK_ISO_7_ELSE);
3037                       break;
3038                     }
3039                 }
3040               else if (c == '$')
3041                 {
3042                   /* Designation sequence for a charset of dimension 2.  */
3043                   ONE_MORE_BYTE (c);
3044                   if (c >= '@' && c <= 'B')
3045                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the two lookups that test
                            `< 0', this ID is passed on to SAFE_CHARSET_P
                            below without a check for a negative value.  */
3046                     id = iso_charset_table[1][0][c];
3047                   else if (c >= '(' && c <= '/')
3048                     {
3049                       ONE_MORE_BYTE (c1);
3050                       if (c1 < ' ' || c1 >= 0x80
3051                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3052                         {
3053                           /* Invalid designation sequence.  Just ignore.  */
3054                           if (c1 >= 0x80)
3055                             rejected |= (CATEGORY_MASK_ISO_7BIT
3056                                          | CATEGORY_MASK_ISO_7_ELSE);
3057                           break;
3058                         }
3059                     }
3060                   else
3061                     {
3062                       /* Invalid designation sequence.  Just ignore it.  */
3063                       if (c >= 0x80)
3064                         rejected |= (CATEGORY_MASK_ISO_7BIT
3065                                      | CATEGORY_MASK_ISO_7_ELSE);
3066                       break;
3067                     }
3068                 }
3069               else
3070                 {
3071                   /* Invalid escape sequence.  Just ignore it.  */
3072                   if (c >= 0x80)
3073                     rejected |= (CATEGORY_MASK_ISO_7BIT
3074                                  | CATEGORY_MASK_ISO_7_ELSE);
3075                   break;
3076                 }
3077
3078               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the designated charset is safe for it, and
                      rejected otherwise.  */
3079               rejected |= CATEGORY_MASK_ISO_8BIT;
3080               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3081                                   id))
3082                 found |= CATEGORY_MASK_ISO_7;
3083               else
3084                 rejected |= CATEGORY_MASK_ISO_7;
3085               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3086                                   id))
3087                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3088               else
3089                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3090               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3091                                   id))
3092                 found |= CATEGORY_MASK_ISO_7_ELSE;
3093               else
3094                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3095               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3096                                   id))
3097                 found |= CATEGORY_MASK_ISO_8_ELSE;
3098               else
3099                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3100             }
3101           break;
3102
3103         case ISO_CODE_SO:
3104         case ISO_CODE_SI:
3105           /* Locking shift out/in.  */
3106           if (inhibit_iso_escape_detection)
3107             break;
3108           single_shifting = 0;
3109           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3110           break;
3111
3112         case ISO_CODE_CSI:
3113           /* Control sequence introducer.  */
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3116           found |= CATEGORY_MASK_ISO_8_ELSE;
3117           goto check_extra_latin;
3118
3119         case ISO_CODE_SS2:
3120         case ISO_CODE_SS3:
3121           /* Single shift.   */
3122           if (inhibit_iso_escape_detection)
3123             break;
3124           single_shifting = 0;
3125           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3126           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3127               & CODING_ISO_FLAG_SINGLE_SHIFT)
3128             {
3129               found |= CATEGORY_MASK_ISO_8_1;
3130               single_shifting = 1;
3131             }
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             {
3135               found |= CATEGORY_MASK_ISO_8_2;
3136               single_shifting = 1;
3137             }
3138           if (single_shifting)
3139             break;
               /* Neither 8-bit category uses single shifts, so treat the
                  byte like any other code and check it against the
                  extra Latin code table.  */
3140           goto check_extra_latin;
3141
3142         default:
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3153           if (c >= 0xA0)
3154             {
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0xFF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   ptrdiff_t len = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
                               /* Push the terminating byte back so the
                                  outer loop sees it again.  */
3170                           src = src_base;
3171                           break;
3172                         }
3173                       len++;
3174                     }
3175
                       /* An odd-length run is held against ISO_8_2 only
                          if it was ended by a byte below 0xA0 rather
                          than by the end of the source.  */
3176                   if (len & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += len;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += len / 2;
3187                     }
3188                 }
3189               break;
3190             }
               /* Reached by fall-through for bytes 0x80..0x9F and by
                  `goto' from the CSI and SS2/SS3 cases above.  A byte
                  with no entry in Vlatin_extra_code_table rejects every
                  ISO category.  */
3191         check_extra_latin:
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (AREF (Vlatin_extra_code_table, c)))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204           break;
3205         }
3206     }
       /* The loop ended because every ISO category was rejected.  */
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
       /* A category that was both found and rejected counts as
          rejected.  */
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, or ISO_CHARSET_TABLE has no charset
   for the triple, or the charset does not pass SAFE_CHARSET_P, mark
   REG as invalidly designated (-2) instead.  In that case, and also
   when ASCII is designated to a register whose previous designation
   was invalid, set CHARS_96 to -1 to tell the caller that the escape
   sequence should be kept.

   charset_jisx0201_roman is replaced by ASCII under
   CODING_ISO_FLAG_USE_ROMAN, and charset_jisx0208_1978 by
   charset_jisx0208 under CODING_ISO_FLAG_USE_OLDJIS.  CHARS_96 must
   be an lvalue; the variable `coding' is taken from the caller.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id;                                                             \
                                                                        \
    if (final >= '0' && final < 128                                     \
        && (id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0         \
        && SAFE_CHARSET_P (coding, id))                                 \
      {                                                                 \
        int prev = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                        \
        if (id == charset_jisx0201_roman)                               \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              id = charset_ascii;                                       \
          }                                                             \
        else if (id == charset_jisx0208_1978)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              id = charset_jisx0208;                                    \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* Designating ASCII to a register whose earlier designation   \
           was invalid is a sequence we should keep as well.  */        \
        if (prev == -2 && id == charset_ascii)                          \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Unusable designation: remember that and keep the bytes.  */  \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
3249
3250
/* Handle these composition sequences (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
   If an error is found while composing, the annotation header is
   changed to:

        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
/* Turn the composition rule byte held in C1 (a variable of the
   caller) into an encoded composition rule and store it in RULE,
   which must be an lvalue.

   When C1 - 32 is below 81 the rule is in the one-byte format used
   before ver.21.  Otherwise one more byte is read from the source
   with ONE_MORE_BYTE and the two bytes form a rule in the format used
   after ver.21; 0x100 is added to such a rule.

   Jump to invalid_code if C1 is below 32, or if the two-byte rule
   does not pass COMPOSITION_ENCODE_RULE_VALID.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int gref = rule - 81, nref, b;                                  \
                                                                        \
        ONE_MORE_BYTE (b);                                              \
        nref = b - 32;                                                  \
        if (! COMPOSITION_ENCODE_RULE_VALID (gref, nref))               \
          goto invalid_code;                                            \
        /* The 0x100 distinguishes it from the old format.  */          \
        rule = COMPOSITION_ENCODE_RULE (gref, nref) + 0x100;            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9 == 4 ? 10 : (rule) / 9;                   \
        int nref = (rule) % 9 == 4 ? 10 : (rule) % 9;                   \
                                                                        \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3321
/* Turn the decoded composition rule RULE back into the byte(s) it was
   decoded from, storing them in charbuf[idx] and charbuf[idx + 1], and
   add the number of bytes recovered to new_chars.

   A rule below 0x100 is in the old format and is recovered as one
   byte followed by -1; any other rule is in the new format and is
   recovered as two bytes.

   RULE is evaluated exactly once, before charbuf is written, so it
   may be any expression, including one that reads charbuf itself.
   The variables charbuf, idx and new_chars are taken from the
   caller.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_value = (rule);                                    \
    int gref = (rule_value % 0x100) / 12;                       \
    int nref = (rule_value % 0x100) % 12;                       \
                                                                \
    if (rule_value < 0x100)     /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3341
3342 /* Finish the current composition as invalid.  */
3343
3344 static int
3345 finish_composition (int *charbuf, struct composition_status *cmp_status)
3346 {
3347   int idx = - cmp_status->length;
3348   int new_chars;
3349
3350   /* Recover the original ESC sequence */
3351   charbuf[idx++] = ISO_CODE_ESC;
3352   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3353                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3354                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3355                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3356                     : '4');
3357   charbuf[idx++] = -2;
3358   charbuf[idx++] = 0;
3359   charbuf[idx++] = -1;
3360   new_chars = cmp_status->nchars;
3361   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3362     for (; idx < 0; idx++)
3363       {
3364         int elt = charbuf[idx];
3365
3366         if (elt == -2)
3367           {
3368             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3369             idx++;
3370           }
3371         else if (elt == -1)
3372           {
3373             charbuf[idx++] = ISO_CODE_ESC;
3374             charbuf[idx] = '0';
3375             new_chars += 2;
3376           }
3377       }
3378   cmp_status->state = COMPOSING_NO;
3379   return new_chars;
3380 }
3381
/* Do nothing unless a composition is in progress.  Otherwise finish
   it with finish_composition and add the value that returns to
   char_offset.  The variables cmp_status, charbuf and char_offset are
   taken from the caller.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += finish_composition (charbuf, cmp_status);            \
  } while (0)
3388
/* Process the composition start sequence ESC C1, where C1 is '0',
   '2', '3', or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   ESC 0 found in the state COMPOSING_COMPONENT_CHAR of an altchar
   composition, or in the state COMPOSING_COMPONENT_RULE of an
   alt&rule composition, does not start anything: [ -1 -1 ] is stored
   and the state becomes COMPOSING_CHAR.

   In all other cases a composition still in progress is finished
   first, and then a new one is started by producing this annotation
   sequence with ADD_COMPOSITION_DATA (LENGTH is recorded as
   MAX_ANNOTATION_LENGTH):

   [ -LENGTH CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 == '0'                                                          \
        && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS               \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)             \
            || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
                && cmp_status->state == COMPOSING_COMPONENT_RULE)))        \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            cmp_status->method = COMPOSITION_RELATIVE;                     \
            break;                                                         \
          case '2':                                                        \
            cmp_status->method = COMPOSITION_WITH_RULE;                    \
            break;                                                         \
          case '3':                                                        \
            cmp_status->method = COMPOSITION_WITH_ALTCHARS;                \
            break;                                                         \
          default:                                                         \
            cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;           \
            break;                                                         \
          }                                                                \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3429
3430
/* Process the composition end sequence ESC 1.

   The sequence is invalid when no char to be composed has been
   stored, when the method is COMPOSITION_WITH_RULE and the state is
   COMPOSING_CHAR, or when the method is any other one and the state
   is not COMPOSING_CHAR.  Then a composition in progress is finished
   by MAYBE_FINISH_COMPOSITION and control goes to invalid_code.

   Otherwise fix up the annotation header that starts
   cmp_status->length elements before charbuf: its first element is
   decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS and by
   ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, and its third element
   is set to nchars.  Finally nchars is added to char_offset and the
   state becomes COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int *cmp_header = charbuf - cmp_status->length;                 \
                                                                        \
        switch (cmp_status->method)                                     \
          {                                                             \
          case COMPOSITION_WITH_ALTCHARS:                               \
            cmp_header[0] -= cmp_status->ncomps + 2;                    \
            break;                                                      \
          case COMPOSITION_WITH_RULE_ALTCHARS:                          \
            cmp_header[0] -= cmp_status->ncomps * 3;                    \
            break;                                                      \
          default:                                                      \
            break;                                                      \
          }                                                             \
        cmp_header[2] = cmp_status->nchars;                             \
        char_offset += cmp_status->nchars;                              \
        cmp_status->state = COMPOSING_NO;                               \
      }                                                                 \
  } while (0)
3450
/* Append the pair [ -2 RULE ] to charbuf, count the two elements in
   cmp_status->length, and step cmp_status->state back by one.  */

#define STORE_COMPOSITION_RULE(rule)                                    \
  do {                                                                  \
    charbuf[0] = -2;                                                    \
    charbuf[1] = (rule);                                                \
    charbuf += 2;                                                       \
    cmp_status->length += 2;                                            \
    cmp_status->state--;                                                \
  } while (0)
3460
/* Append C, a char to be composed or a component char, to charbuf
   and update cmp_status: the length grows by one, and nchars is
   incremented in the state COMPOSING_CHAR, ncomps in any other state.
   The state is then advanced by one when the method is
   COMPOSITION_WITH_RULE, or when it is COMPOSITION_WITH_RULE_ALTCHARS
   and the state is COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length++;                                               \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_RULE:                                       \
        cmp_status->state++;                                            \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        if (cmp_status->state == COMPOSING_COMPONENT_CHAR)              \
          cmp_status->state++;                                          \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
  } while (0)
3477
3478
3479 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3480
3481 static void
3482 decode_coding_iso_2022 (struct coding_system *coding)
3483 {
3484   const unsigned char *src = coding->source + coding->consumed;
3485   const unsigned char *src_end = coding->source + coding->src_bytes;
3486   const unsigned char *src_base;
3487   int *charbuf = coding->charbuf + coding->charbuf_used;
3488   /* We may produce two annotations (charset and composition) in one
3489      loop and one more charset annotation at the end.  */
3490   int *charbuf_end
3491     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3492   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3493   bool multibytep = coding->src_multibyte;
3494   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3495   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3496   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3497   int charset_id_2, charset_id_3;
3498   struct charset *charset;
3499   int c;
3500   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3501   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3502   ptrdiff_t char_offset = coding->produced_char;
3503   ptrdiff_t last_offset = char_offset;
3504   int last_id = charset_ascii;
3505   bool eol_dos
3506     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3507   int byte_after_cr = -1;
3508   int i;
3509
3510   setup_iso_safe_charsets (attrs);
3511   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3512
3513   if (cmp_status->state != COMPOSING_NO)
3514     {
3515       if (charbuf_end - charbuf < cmp_status->length)
3516         emacs_abort ();
3517       for (i = 0; i < cmp_status->length; i++)
3518         *charbuf++ = cmp_status->carryover[i];
3519       coding->annotated = 1;
3520     }
3521
3522   while (1)
3523     {
3524       int c1, c2, c3;
3525
3526       src_base = src;
3527       consumed_chars_base = consumed_chars;
3528
3529       if (charbuf >= charbuf_end)
3530         {
3531           if (byte_after_cr >= 0)
3532             src_base--;
3533           break;
3534         }
3535
3536       if (byte_after_cr >= 0)
3537         c1 = byte_after_cr, byte_after_cr = -1;
3538       else
3539         ONE_MORE_BYTE (c1);
3540       if (c1 < 0)
3541         goto invalid_code;
3542
3543       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3544         {
3545           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3546           char_offset++;
3547           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3548           continue;
3549         }
3550
3551       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3552         {
3553           if (c1 == ISO_CODE_ESC)
3554             {
3555               if (src + 1 >= src_end)
3556                 goto no_more_source;
3557               *charbuf++ = ISO_CODE_ESC;
3558               char_offset++;
3559               if (src[0] == '%' && src[1] == '@')
3560                 {
3561                   src += 2;
3562                   consumed_chars += 2;
3563                   char_offset += 2;
3564                   /* We are sure charbuf can contain two more chars. */
3565                   *charbuf++ = '%';
3566                   *charbuf++ = '@';
3567                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3568                 }
3569             }
3570           else
3571             {
3572               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3573               char_offset++;
3574             }
3575           continue;
3576         }
3577
3578       if ((cmp_status->state == COMPOSING_RULE
3579            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3580           && c1 != ISO_CODE_ESC)
3581         {
3582           int rule;
3583
3584           DECODE_COMPOSITION_RULE (rule);
3585           STORE_COMPOSITION_RULE (rule);
3586           continue;
3587         }
3588
3589       /* We produce at most one character.  */
3590       switch (iso_code_class [c1])
3591         {
3592         case ISO_0x20_or_0x7F:
3593           if (charset_id_0 < 0
3594               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3595             /* This is SPACE or DEL.  */
3596             charset = CHARSET_FROM_ID (charset_ascii);
3597           else
3598             charset = CHARSET_FROM_ID (charset_id_0);
3599           break;
3600
3601         case ISO_graphic_plane_0:
3602           if (charset_id_0 < 0)
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_0xA0_or_0xFF:
3609           if (charset_id_1 < 0
3610               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3611               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3612             goto invalid_code;
3613           /* This is a graphic character, we fall down ... */
3614
3615         case ISO_graphic_plane_1:
3616           if (charset_id_1 < 0)
3617             goto invalid_code;
3618           charset = CHARSET_FROM_ID (charset_id_1);
3619           break;
3620
3621         case ISO_control_0:
3622           if (eol_dos && c1 == '\r')
3623             ONE_MORE_BYTE (byte_after_cr);
3624           MAYBE_FINISH_COMPOSITION ();
3625           charset = CHARSET_FROM_ID (charset_ascii);
3626           break;
3627
3628         case ISO_control_1:
3629           goto invalid_code;
3630
3631         case ISO_shift_out:
3632           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3633               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3634             goto invalid_code;
3635           CODING_ISO_INVOCATION (coding, 0) = 1;
3636           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3637           continue;
3638
3639         case ISO_shift_in:
3640           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 0;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_single_shift_2_7:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3648             goto invalid_code;
3649         case ISO_single_shift_2:
3650           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3651             goto invalid_code;
3652           /* SS2 is handled as an escape sequence of ESC 'N' */
3653           c1 = 'N';
3654           goto label_escape_sequence;
3655
3656         case ISO_single_shift_3:
3657           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658             goto invalid_code;
3659           /* SS2 is handled as an escape sequence of ESC 'O' */
3660           c1 = 'O';
3661           goto label_escape_sequence;
3662
3663         case ISO_control_sequence_introducer:
3664           /* CSI is handled as an escape sequence of ESC '[' ...  */
3665           c1 = '[';
3666           goto label_escape_sequence;
3667
3668         case ISO_escape:
3669           ONE_MORE_BYTE (c1);
3670         label_escape_sequence:
3671           /* Escape sequences handled here are invocation,
3672              designation, direction specification, and character
3673              composition specification.  */
3674           switch (c1)
3675             {
3676             case '&':           /* revision of following character set */
3677               ONE_MORE_BYTE (c1);
3678               if (!(c1 >= '@' && c1 <= '~'))
3679                 goto invalid_code;
3680               ONE_MORE_BYTE (c1);
3681               if (c1 != ISO_CODE_ESC)
3682                 goto invalid_code;
3683               ONE_MORE_BYTE (c1);
3684               goto label_escape_sequence;
3685
3686             case '$':           /* designation of 2-byte character set */
3687               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3688                 goto invalid_code;
3689               {
3690                 int reg, chars96;
3691
3692                 ONE_MORE_BYTE (c1);
3693                 if (c1 >= '@' && c1 <= 'B')
3694                   {     /* designation of JISX0208.1978, GB2312.1980,
3695                            or JISX0208.1980 */
3696                     reg = 0, chars96 = 0;
3697                   }
3698                 else if (c1 >= 0x28 && c1 <= 0x2B)
3699                   { /* designation of DIMENSION2_CHARS94 character set */
3700                     reg = c1 - 0x28, chars96 = 0;
3701                     ONE_MORE_BYTE (c1);
3702                   }
3703                 else if (c1 >= 0x2C && c1 <= 0x2F)
3704                   { /* designation of DIMENSION2_CHARS96 character set */
3705                     reg = c1 - 0x2C, chars96 = 1;
3706                     ONE_MORE_BYTE (c1);
3707                   }
3708                 else
3709                   goto invalid_code;
3710                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3711                 /* We must update these variables now.  */
3712                 if (reg == 0)
3713                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3714                 else if (reg == 1)
3715                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3716                 if (chars96 < 0)
3717                   goto invalid_code;
3718               }
3719               continue;
3720
3721             case 'n':           /* invocation of locking-shift-2 */
3722               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3723                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3724                 goto invalid_code;
3725               CODING_ISO_INVOCATION (coding, 0) = 2;
3726               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3727               continue;
3728
3729             case 'o':           /* invocation of locking-shift-3 */
3730               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3731                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3732                 goto invalid_code;
3733               CODING_ISO_INVOCATION (coding, 0) = 3;
3734               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3735               continue;
3736
3737             case 'N':           /* invocation of single-shift-2 */
3738               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3739                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3740                 goto invalid_code;
3741               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3742               if (charset_id_2 < 0)
3743                 charset = CHARSET_FROM_ID (charset_ascii);
3744               else
3745                 charset = CHARSET_FROM_ID (charset_id_2);
3746               ONE_MORE_BYTE (c1);
3747               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3748                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3749                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3750                           ? c1 >= 0x80 : c1 < 0x80)))
3751                 goto invalid_code;
3752               break;
3753
3754             case 'O':           /* invocation of single-shift-3 */
3755               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3756                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3757                 goto invalid_code;
3758               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3759               if (charset_id_3 < 0)
3760                 charset = CHARSET_FROM_ID (charset_ascii);
3761               else
3762                 charset = CHARSET_FROM_ID (charset_id_3);
3763               ONE_MORE_BYTE (c1);
3764               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3765                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3766                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3767                           ? c1 >= 0x80 : c1 < 0x80)))
3768                 goto invalid_code;
3769               break;
3770
3771             case '0': case '2': case '3': case '4': /* start composition */
3772               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773                 goto invalid_code;
3774               if (last_id != charset_ascii)
3775                 {
3776                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777                   last_id = charset_ascii;
3778                   last_offset = char_offset;
3779                 }
3780               DECODE_COMPOSITION_START (c1);
3781               continue;
3782
3783             case '1':           /* end composition */
3784               if (cmp_status->state == COMPOSING_NO)
3785                 goto invalid_code;
3786               DECODE_COMPOSITION_END ();
3787               continue;
3788
3789             case '[':           /* specification of direction */
3790               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3791                 goto invalid_code;
3792               /* For the moment, nested direction is not supported.
3793                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3794                  left-to-right, and nonzero means right-to-left.  */
3795               ONE_MORE_BYTE (c1);
3796               switch (c1)
3797                 {
3798                 case ']':       /* end of the current direction */
3799                   coding->mode &= ~CODING_MODE_DIRECTION;
3800
3801                 case '0':       /* end of the current direction */
3802                 case '1':       /* start of left-to-right direction */
3803                   ONE_MORE_BYTE (c1);
3804                   if (c1 == ']')
3805                     coding->mode &= ~CODING_MODE_DIRECTION;
3806                   else
3807                     goto invalid_code;
3808                   break;
3809
3810                 case '2':       /* start of right-to-left direction */
3811                   ONE_MORE_BYTE (c1);
3812                   if (c1 == ']')
3813                     coding->mode |= CODING_MODE_DIRECTION;
3814                   else
3815                     goto invalid_code;
3816                   break;
3817
3818                 default:
3819                   goto invalid_code;
3820                 }
3821               continue;
3822
3823             case '%':
3824               ONE_MORE_BYTE (c1);
3825               if (c1 == '/')
3826                 {
3827                   /* CTEXT extended segment:
3828                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829                      We keep these bytes as is for the moment.
3830                      They may be decoded by post-read-conversion.  */
3831                   int dim, M, L;
3832                   int size;
3833
3834                   ONE_MORE_BYTE (dim);
3835                   if (dim < '0' || dim > '4')
3836                     goto invalid_code;
3837                   ONE_MORE_BYTE (M);
3838                   if (M < 128)
3839                     goto invalid_code;
3840                   ONE_MORE_BYTE (L);
3841                   if (L < 128)
3842                     goto invalid_code;
3843                   size = ((M - 128) * 128) + (L - 128);
3844                   if (charbuf + 6 > charbuf_end)
3845                     goto break_loop;
3846                   *charbuf++ = ISO_CODE_ESC;
3847                   *charbuf++ = '%';
3848                   *charbuf++ = '/';
3849                   *charbuf++ = dim;
3850                   *charbuf++ = BYTE8_TO_CHAR (M);
3851                   *charbuf++ = BYTE8_TO_CHAR (L);
3852                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3853                 }
3854               else if (c1 == 'G')
3855                 {
3856                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3857                      ESC % G --UTF-8-BYTES-- ESC % @
3858                      We keep these bytes as is for the moment.
3859                      They may be decoded by post-read-conversion.  */
3860                   if (charbuf + 3 > charbuf_end)
3861                     goto break_loop;
3862                   *charbuf++ = ISO_CODE_ESC;
3863                   *charbuf++ = '%';
3864                   *charbuf++ = 'G';
3865                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3866                 }
3867               else
3868                 goto invalid_code;
3869               continue;
3870               break;
3871
3872             default:
3873               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874                 goto invalid_code;
3875               {
3876                 int reg, chars96;
3877
3878                 if (c1 >= 0x28 && c1 <= 0x2B)
3879                   { /* designation of DIMENSION1_CHARS94 character set */
3880                     reg = c1 - 0x28, chars96 = 0;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else if (c1 >= 0x2C && c1 <= 0x2F)
3884                   { /* designation of DIMENSION1_CHARS96 character set */
3885                     reg = c1 - 0x2C, chars96 = 1;
3886                     ONE_MORE_BYTE (c1);
3887                   }
3888                 else
3889                   goto invalid_code;
3890                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891                 /* We must update these variables now.  */
3892                 if (reg == 0)
3893                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894                 else if (reg == 1)
3895                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896                 if (chars96 < 0)
3897                   goto invalid_code;
3898               }
3899               continue;
3900             }
3901           break;
3902
3903         default:
3904           emacs_abort ();
3905         }
3906
3907       if (cmp_status->state == COMPOSING_NO
3908           && charset->id != charset_ascii
3909           && last_id != charset->id)
3910         {
3911           if (last_id != charset_ascii)
3912             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3913           last_id = charset->id;
3914           last_offset = char_offset;
3915         }
3916
3917       /* Now we know CHARSET and 1st position code C1 of a character.
3918          Produce a decoded character while getting 2nd and 3rd
3919          position codes C2, C3 if necessary.  */
3920       if (CHARSET_DIMENSION (charset) > 1)
3921         {
3922           ONE_MORE_BYTE (c2);
3923           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3924               || ((c1 & 0x80) != (c2 & 0x80)))
3925             /* C2 is not in a valid range.  */
3926             goto invalid_code;
3927           if (CHARSET_DIMENSION (charset) == 2)
3928             c1 = (c1 << 8) | c2;
3929           else
3930             {
3931               ONE_MORE_BYTE (c3);
3932               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3933                   || ((c1 & 0x80) != (c3 & 0x80)))
3934                 /* C3 is not in a valid range.  */
3935                 goto invalid_code;
3936               c1 = (c1 << 16) | (c2 << 8) | c2;
3937             }
3938         }
3939       c1 &= 0x7F7F7F;
3940       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3941       if (c < 0)
3942         {
3943           MAYBE_FINISH_COMPOSITION ();
3944           for (; src_base < src; src_base++, char_offset++)
3945             {
3946               if (ASCII_CHAR_P (*src_base))
3947                 *charbuf++ = *src_base;
3948               else
3949                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3950             }
3951         }
3952       else if (cmp_status->state == COMPOSING_NO)
3953         {
3954           *charbuf++ = c;
3955           char_offset++;
3956         }
3957       else if ((cmp_status->state == COMPOSING_CHAR
3958                 ? cmp_status->nchars
3959                 : cmp_status->ncomps)
3960                >= MAX_COMPOSITION_COMPONENTS)
3961         {
3962           /* Too long composition.  */
3963           MAYBE_FINISH_COMPOSITION ();
3964           *charbuf++ = c;
3965           char_offset++;
3966         }
3967       else
3968         STORE_COMPOSITION_CHAR (c);
3969       continue;
3970
3971     invalid_code:
3972       MAYBE_FINISH_COMPOSITION ();
3973       src = src_base;
3974       consumed_chars = consumed_chars_base;
3975       ONE_MORE_BYTE (c);
3976       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
3977       char_offset++;
3978       /* Reset the invocation and designation status to the safest
3979          one; i.e. designate ASCII to the graphic register 0, and
3980          invoke that register to the graphic plane 0.  This typically
3981          helps the case that an designation sequence for ASCII "ESC (
3982          B" is somehow broken (e.g. broken by a newline).  */
3983       CODING_ISO_INVOCATION (coding, 0) = 0;
3984       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3985       charset_id_0 = charset_ascii;
3986       continue;
3987
3988     break_loop:
3989       break;
3990     }
3991
3992  no_more_source:
3993   if (cmp_status->state != COMPOSING_NO)
3994     {
3995       if (coding->mode & CODING_MODE_LAST_BLOCK)
3996         MAYBE_FINISH_COMPOSITION ();
3997       else
3998         {
3999           charbuf -= cmp_status->length;
4000           for (i = 0; i < cmp_status->length; i++)
4001             cmp_status->carryover[i] = charbuf[i];
4002         }
4003     }
4004   else if (last_id != charset_ascii)
4005     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4006   coding->consumed_char += consumed_chars_base;
4007   coding->consumed = src_base - coding->source;
4008   coding->charbuf_used = charbuf - coding->charbuf;
4009 }
4010
4011
4012 /* ISO2022 encoding stuff.  */
4013
4014 /*
4015    For encoding, it is not enough to say just "ISO2022"; we have to
4016    specify more details.  In Emacs, each coding system of the ISO2022
4017    variant has the following specifications:
4018         1. Initial designation to G0 thru G3.
4019         2. Allows short-form designation?
4020         3. ASCII should be designated to G0 before control characters?
4021         4. ASCII should be designated to G0 at end of line?
4022         5. 7-bit environment or 8-bit environment?
4023         6. Use locking-shift?
4024         7. Use Single-shift?
4025    And the following two are only for Japanese:
4026         8. Use ASCII in place of JIS0201-1976-Roman?
4027         9. Use JISX0208-1983 in place of JISX0208-1978?
4028    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4029    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4030    details.
4031 */
4032
/* Emit at DST (advancing it) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.  If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has
   a non-negative ISO revision, the sequence is preceded by
   "ESC & <'@' + revision>".  For a multi-dimensional 94-charset
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   the intermediate byte is omitted (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate bytes selecting registers 0..3; one table for      \
       94-charsets and another for 96-charsets.  */                     \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* The intermediate byte may be dropped only in the short-form     \
       case described above.  */                                        \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || (reg) != 0                                                   \
        || desig_final < '@' || desig_final > 'B')                      \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4080
4081
/* Produce the code for an ISO2022 single-shift function and mark
   CODING as being in single-shifting state.  If CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the two-byte escape sequence
   "ESC ESC_FINAL" is produced; otherwise the single control byte
   C1_CODE is produced.  */

#define ENCODE_SINGLE_SHIFT_N(esc_final, c1_code)                       \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (c1_code);                                          \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, esc_final);                   \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)

/* Single-shift-2: "ESC N" or ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2 ENCODE_SINGLE_SHIFT_N ('N', ISO_CODE_SS2)

/* Single-shift-3: "ESC O" or ISO_CODE_SS3.  */
#define ENCODE_SINGLE_SHIFT_3 ENCODE_SINGLE_SHIFT_N ('O', ISO_CODE_SS3)
4104
4105
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING the graphic register that is now invoked to
   graphic plane 0.  */

/* Shift-in: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2 ("ESC n"): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3 ("ESC o"): invoke graphic register 3.  The final
   byte must differ from locking-shift-2's 'n'; otherwise the bytes
   written would invoke register 2 while CODING records register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4136
4137
/* Emit the position-code C1 of a DIMENSION1 character belonging to
   CHARSET.  If CHARSET is not currently invoked to graphic plane 0
   or 1 (and no single-shift is pending), the necessary designation
   and invocation sequences are produced first.  CHARSET must be an
   lvalue: it is replaced when CODING_ISO_FLAG_USE_ROMAN makes
   charset_jisx0201_roman stand in for charset_ascii.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A single-shift was just produced; this character        \
               consumes it.  */                                         \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_ONE_BYTE ((c1) | 0x80);                              \
            else                                                        \
              EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                        \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))            \
          {                                                             \
            EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                          \
            break;                                                      \
          }                                                             \
        if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))            \
          {                                                             \
            EMIT_ONE_BYTE ((c1) | 0x80);                                \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet.  Invoke    \
           it (designating it to a graphic register first if needed)   \
           and go around again to emit the character itself.  */        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4180
4181
/* Emit the position-codes C1 and C2 of a DIMENSION2 character
   belonging to CHARSET.  If CHARSET is not currently invoked to
   graphic plane 0 or 1 (and no single-shift is pending), the
   necessary designation and invocation codes are produced first.
   CHARSET must be an lvalue: it is replaced when
   CODING_ISO_FLAG_USE_OLDJIS makes charset_jisx0208_1978 stand in for
   charset_jisx0208.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A single-shift was just produced; this character        \
               consumes it.  */                                         \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            else                                                        \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))            \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))            \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet.  Invoke    \
           it (designating it to a graphic register first if needed)   \
           and go around again to emit the character itself.  */        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4224
4225
/* Encode character C of CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then emit it as one position-code if CHARSET
   has dimension 1, or otherwise as two position-codes taken from the
   code point shifted right by 8 bits and from its low 8 bits.
   CHARSET must be an lvalue (see the DIMENSION1/DIMENSION2 macros).  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned iso_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4236
4237
4238 /* Produce designation and invocation codes at a place pointed by DST
4239    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4240    Return new DST.  */
4241
4242 static unsigned char *
4243 encode_invocation_designation (struct charset *charset,
4244                                struct coding_system *coding,
4245                                unsigned char *dst, ptrdiff_t *p_nchars)
4246 {
4247   bool multibytep = coding->dst_multibyte;
4248   ptrdiff_t produced_chars = *p_nchars;
4249   int reg;                      /* graphic register number */
4250   int id = CHARSET_ID (charset);
4251
4252   /* At first, check designations.  */
4253   for (reg = 0; reg < 4; reg++)
4254     if (id == CODING_ISO_DESIGNATION (coding, reg))
4255       break;
4256
4257   if (reg >= 4)
4258     {
4259       /* CHARSET is not yet designated to any graphic registers.  */
4260       /* At first check the requested designation.  */
4261       reg = CODING_ISO_REQUEST (coding, id);
4262       if (reg < 0)
4263         /* Since CHARSET requests no special designation, designate it
4264            to graphic register 0.  */
4265         reg = 0;
4266
4267       ENCODE_DESIGNATION (charset, reg, coding);
4268     }
4269
4270   if (CODING_ISO_INVOCATION (coding, 0) != reg
4271       && CODING_ISO_INVOCATION (coding, 1) != reg)
4272     {
4273       /* Since the graphic register REG is not invoked to any graphic
4274          planes, invoke it to graphic plane 0.  */
4275       switch (reg)
4276         {
4277         case 0:                 /* graphic register 0 */
4278           ENCODE_SHIFT_IN;
4279           break;
4280
4281         case 1:                 /* graphic register 1 */
4282           ENCODE_SHIFT_OUT;
4283           break;
4284
4285         case 2:                 /* graphic register 2 */
4286           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4287             ENCODE_SINGLE_SHIFT_2;
4288           else
4289             ENCODE_LOCKING_SHIFT_2;
4290           break;
4291
4292         case 3:                 /* graphic register 3 */
4293           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4294             ENCODE_SINGLE_SHIFT_3;
4295           else
4296             ENCODE_LOCKING_SHIFT_3;
4297           break;
4298
4299         default:
4300           break;
4301         }
4302     }
4303
4304   *p_nchars = produced_chars;
4305   return dst;
4306 }
4307
4308
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for every register whose
   initial charset is specified (non-negative) and differs from the
   charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reset_reg);        \
        struct charset *initial_charset;                                \
                                                                        \
        if (initial_id < 0                                              \
            || initial_id == CODING_ISO_DESIGNATION (coding, reset_reg)) \
          continue;                                                     \
        initial_charset = CHARSET_FROM_ID (initial_id);                 \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);        \
      }                                                                 \
  } while (0)
4327
4328
4329 /* Produce designation sequences of charsets in the line started from
4330    CHARBUF to a place pointed by DST, and return the number of
4331    produced bytes.  DST should not directly point a buffer text area
4332    which may be relocated by char_charset call.
4333
4334    If the current block ends before any end-of-line, we may fail to
4335    find all the necessary designations.  */
4336
4337 static ptrdiff_t
4338 encode_designation_at_bol (struct coding_system *coding,
4339                            int *charbuf, int *charbuf_end,
4340                            unsigned char *dst)
4341 {
4342   unsigned char *orig = dst;
4343   struct charset *charset;
4344   /* Table of charsets to be designated to each graphic register.  */
4345   int r[4];
4346   int c, found = 0, reg;
4347   ptrdiff_t produced_chars = 0;
4348   bool multibytep = coding->dst_multibyte;
4349   Lisp_Object attrs;
4350   Lisp_Object charset_list;
4351
4352   attrs = CODING_ID_ATTRS (coding->id);
4353   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4354   if (EQ (charset_list, Qiso_2022))
4355     charset_list = Viso_2022_charset_list;
4356
4357   for (reg = 0; reg < 4; reg++)
4358     r[reg] = -1;
4359
4360   while (charbuf < charbuf_end && found < 4)
4361     {
4362       int id;
4363
4364       c = *charbuf++;
4365       if (c == '\n')
4366         break;
4367       charset = char_charset (c, charset_list, NULL);
4368       id = CHARSET_ID (charset);
4369       reg = CODING_ISO_REQUEST (coding, id);
4370       if (reg >= 0 && r[reg] < 0)
4371         {
4372           found++;
4373           r[reg] = id;
4374         }
4375     }
4376
4377   if (found)
4378     {
4379       for (reg = 0; reg < 4; reg++)
4380         if (r[reg] >= 0
4381             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4382           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4383     }
4384
4385   return dst - orig;
4386 }
4387
4388 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4389
4390 static bool
4391 encode_coding_iso_2022 (struct coding_system *coding)
4392 {
4393   bool multibytep = coding->dst_multibyte;
4394   int *charbuf = coding->charbuf;
4395   int *charbuf_end = charbuf + coding->charbuf_used;
4396   unsigned char *dst = coding->destination + coding->produced;
4397   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4398   int safe_room = 16;
4399   bool bol_designation
4400     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4401        && CODING_ISO_BOL (coding));
4402   ptrdiff_t produced_chars = 0;
4403   Lisp_Object attrs, eol_type, charset_list;
4404   bool ascii_compatible;
4405   int c;
4406   int preferred_charset_id = -1;
4407
4408   CODING_GET_INFO (coding, attrs, charset_list);
4409   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4410   if (VECTORP (eol_type))
4411     eol_type = Qunix;
4412
4413   setup_iso_safe_charsets (attrs);
4414   /* Charset list may have been changed.  */
4415   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4416   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4417
4418   ascii_compatible
4419     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4420        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4421                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4422
4423   while (charbuf < charbuf_end)
4424     {
4425       ASSURE_DESTINATION (safe_room);
4426
4427       if (bol_designation)
4428         {
4429           /* We have to produce designation sequences if any now.  */
4430           unsigned char desig_buf[16];
4431           ptrdiff_t nbytes;
4432           ptrdiff_t offset;
4433
4434           charset_map_loaded = 0;
4435           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4436                                               desig_buf);
4437           if (charset_map_loaded
4438               && (offset = coding_change_destination (coding)))
4439             {
4440               dst += offset;
4441               dst_end += offset;
4442             }
4443           memcpy (dst, desig_buf, nbytes);
4444           dst += nbytes;
4445           /* We are sure that designation sequences are all ASCII bytes.  */
4446           produced_chars += nbytes;
4447           bol_designation = 0;
4448           ASSURE_DESTINATION (safe_room);
4449         }
4450
4451       c = *charbuf++;
4452
4453       if (c < 0)
4454         {
4455           /* Handle an annotation.  */
4456           switch (*charbuf)
4457             {
4458             case CODING_ANNOTATE_COMPOSITION_MASK:
4459               /* Not yet implemented.  */
4460               break;
4461             case CODING_ANNOTATE_CHARSET_MASK:
4462               preferred_charset_id = charbuf[2];
4463               if (preferred_charset_id >= 0
4464                   && NILP (Fmemq (make_number (preferred_charset_id),
4465                                   charset_list)))
4466                 preferred_charset_id = -1;
4467               break;
4468             default:
4469               emacs_abort ();
4470             }
4471           charbuf += -c - 1;
4472           continue;
4473         }
4474
4475       /* Now encode the character C.  */
4476       if (c < 0x20 || c == 0x7F)
4477         {
4478           if (c == '\n'
4479               || (c == '\r' && EQ (eol_type, Qmac)))
4480             {
4481               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4482                 ENCODE_RESET_PLANE_AND_REGISTER ();
4483               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4484                 {
4485                   int i;
4486
4487                   for (i = 0; i < 4; i++)
4488                     CODING_ISO_DESIGNATION (coding, i)
4489                       = CODING_ISO_INITIAL (coding, i);
4490                 }
4491               bol_designation = ((CODING_ISO_FLAGS (coding)
4492                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4493                                  != 0);
4494             }
4495           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4496             ENCODE_RESET_PLANE_AND_REGISTER ();
4497           EMIT_ONE_ASCII_BYTE (c);
4498         }
4499       else if (ASCII_CHAR_P (c))
4500         {
4501           if (ascii_compatible)
4502             EMIT_ONE_ASCII_BYTE (c);
4503           else
4504             {
4505               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4506               ENCODE_ISO_CHARACTER (charset, c);
4507             }
4508         }
4509       else if (CHAR_BYTE8_P (c))
4510         {
4511           c = CHAR_TO_BYTE8 (c);
4512           EMIT_ONE_BYTE (c);
4513         }
4514       else
4515         {
4516           struct charset *charset;
4517
4518           if (preferred_charset_id >= 0)
4519             {
4520               bool result;
4521
4522               charset = CHARSET_FROM_ID (preferred_charset_id);
4523               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4524               if (! result)
4525                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4526                                      NULL, charset);
4527             }
4528           else
4529             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4530                                  NULL, charset);
4531           if (!charset)
4532             {
4533               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4534                 {
4535                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4536                   charset = CHARSET_FROM_ID (charset_ascii);
4537                 }
4538               else
4539                 {
4540                   c = coding->default_char;
4541                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4542                                        charset_list, NULL, charset);
4543                 }
4544             }
4545           ENCODE_ISO_CHARACTER (charset, c);
4546         }
4547     }
4548
4549   if (coding->mode & CODING_MODE_LAST_BLOCK
4550       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4551     {
4552       ASSURE_DESTINATION (safe_room);
4553       ENCODE_RESET_PLANE_AND_REGISTER ();
4554     }
4555   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4556   CODING_ISO_BOL (coding) = bol_designation;
4557   coding->produced_char += produced_chars;
4558   coding->produced = dst - coding->destination;
4559   return 0;
4560 }
4561
4562 \f
4563 /*** 8. SJIS and BIG5 handlers ***/
4564
4565 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4566    quite widely.  So, for the moment, Emacs supports them in the bare
4567    C code.  But, in the future, they may be supported only by CCL.  */
4568
4569 /* SJIS is a coding system encoding three character sets: ASCII, right
4570    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4571    as is.  A character of charset katakana-jisx0201 is encoded by
4572    "position-code + 0x80".  A character of charset japanese-jisx0208
4573    is encoded in two bytes, but the two position-codes are divided and
4574    shifted so that they fit in the ranges below.
4575
4576    --- CODE RANGE of SJIS ---
4577    (character set)      (range)
4578    ASCII                0x00 .. 0x7F
4579    KATAKANA-JISX0201    0xA0 .. 0xDF
4580    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4581             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4582    -------------------------------
4583
4584 */
4585
4586 /* BIG5 is a coding system encoding two character sets: ASCII and
4587    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4588    character set and is encoded in two bytes.
4589
4590    --- CODE RANGE of BIG5 ---
4591    (character set)      (range)
4592    ASCII                0x00 .. 0x7F
4593    Big5 (1st byte)      0xA1 .. 0xFE
4594         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4595    --------------------------
4596
4597   */
4598
4599 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4600    Return true if a text is encoded in SJIS.  */
4601
4602 static bool
4603 detect_coding_sjis (struct coding_system *coding,
4604                     struct coding_detection_info *detect_info)
4605 {
4606   const unsigned char *src = coding->source, *src_base;
4607   const unsigned char *src_end = coding->source + coding->src_bytes;
4608   bool multibytep = coding->src_multibyte;
4609   ptrdiff_t consumed_chars = 0;
4610   int found = 0;
4611   int c;
4612   Lisp_Object attrs, charset_list;
4613   int max_first_byte_of_2_byte_code;
4614
4615   CODING_GET_INFO (coding, attrs, charset_list);
4616   max_first_byte_of_2_byte_code
4617     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4618
4619   detect_info->checked |= CATEGORY_MASK_SJIS;
4620   /* A coding system of this category is always ASCII compatible.  */
4621   src += coding->head_ascii;
4622
4623   while (1)
4624     {
4625       src_base = src;
4626       ONE_MORE_BYTE (c);
4627       if (c < 0x80)
4628         continue;
4629       if ((c >= 0x81 && c <= 0x9F)
4630           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4631         {
4632           ONE_MORE_BYTE (c);
4633           if (c < 0x40 || c == 0x7F || c > 0xFC)
4634             break;
4635           found = CATEGORY_MASK_SJIS;
4636         }
4637       else if (c >= 0xA0 && c < 0xE0)
4638         found = CATEGORY_MASK_SJIS;
4639       else
4640         break;
4641     }
4642   detect_info->rejected |= CATEGORY_MASK_SJIS;
4643   return 0;
4644
4645  no_more_source:
4646   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4647     {
4648       detect_info->rejected |= CATEGORY_MASK_SJIS;
4649       return 0;
4650     }
4651   detect_info->found |= found;
4652   return 1;
4653 }
4654
4655 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4656    Return true if a text is encoded in BIG5.  */
4657
4658 static bool
4659 detect_coding_big5 (struct coding_system *coding,
4660                     struct coding_detection_info *detect_info)
4661 {
4662   const unsigned char *src = coding->source, *src_base;
4663   const unsigned char *src_end = coding->source + coding->src_bytes;
4664   bool multibytep = coding->src_multibyte;
4665   ptrdiff_t consumed_chars = 0;
4666   int found = 0;
4667   int c;
4668
4669   detect_info->checked |= CATEGORY_MASK_BIG5;
4670   /* A coding system of this category is always ASCII compatible.  */
4671   src += coding->head_ascii;
4672
4673   while (1)
4674     {
4675       src_base = src;
4676       ONE_MORE_BYTE (c);
4677       if (c < 0x80)
4678         continue;
4679       if (c >= 0xA1)
4680         {
4681           ONE_MORE_BYTE (c);
4682           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4683             return 0;
4684           found = CATEGORY_MASK_BIG5;
4685         }
4686       else
4687         break;
4688     }
4689   detect_info->rejected |= CATEGORY_MASK_BIG5;
4690   return 0;
4691
4692  no_more_source:
4693   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4694     {
4695       detect_info->rejected |= CATEGORY_MASK_BIG5;
4696       return 0;
4697     }
4698   detect_info->found |= found;
4699   return 1;
4700 }
4701
4702 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4703
4704 static void
4705 decode_coding_sjis (struct coding_system *coding)
4706 {
4707   const unsigned char *src = coding->source + coding->consumed;
4708   const unsigned char *src_end = coding->source + coding->src_bytes;
4709   const unsigned char *src_base;
4710   int *charbuf = coding->charbuf + coding->charbuf_used;
4711   /* We may produce one charset annotation in one loop and one more at
4712      the end.  */
4713   int *charbuf_end
4714     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4715   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4716   bool multibytep = coding->src_multibyte;
4717   struct charset *charset_roman, *charset_kanji, *charset_kana;
4718   struct charset *charset_kanji2;
4719   Lisp_Object attrs, charset_list, val;
4720   ptrdiff_t char_offset = coding->produced_char;
4721   ptrdiff_t last_offset = char_offset;
4722   int last_id = charset_ascii;
4723   bool eol_dos
4724     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4725   int byte_after_cr = -1;
4726
4727   CODING_GET_INFO (coding, attrs, charset_list);
4728
4729   val = charset_list;
4730   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4731   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4732   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4733   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4734
4735   while (1)
4736     {
4737       int c, c1;
4738       struct charset *charset;
4739
4740       src_base = src;
4741       consumed_chars_base = consumed_chars;
4742
4743       if (charbuf >= charbuf_end)
4744         {
4745           if (byte_after_cr >= 0)
4746             src_base--;
4747           break;
4748         }
4749
4750       if (byte_after_cr >= 0)
4751         c = byte_after_cr, byte_after_cr = -1;
4752       else
4753         ONE_MORE_BYTE (c);
4754       if (c < 0)
4755         goto invalid_code;
4756       if (c < 0x80)
4757         {
4758           if (eol_dos && c == '\r')
4759             ONE_MORE_BYTE (byte_after_cr);
4760           charset = charset_roman;
4761         }
4762       else if (c == 0x80 || c == 0xA0)
4763         goto invalid_code;
4764       else if (c >= 0xA1 && c <= 0xDF)
4765         {
4766           /* SJIS -> JISX0201-Kana */
4767           c &= 0x7F;
4768           charset = charset_kana;
4769         }
4770       else if (c <= 0xEF)
4771         {
4772           /* SJIS -> JISX0208 */
4773           ONE_MORE_BYTE (c1);
4774           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4775             goto invalid_code;
4776           c = (c << 8) | c1;
4777           SJIS_TO_JIS (c);
4778           charset = charset_kanji;
4779         }
4780       else if (c <= 0xFC && charset_kanji2)
4781         {
4782           /* SJIS -> JISX0213-2 */
4783           ONE_MORE_BYTE (c1);
4784           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4785             goto invalid_code;
4786           c = (c << 8) | c1;
4787           SJIS_TO_JIS2 (c);
4788           charset = charset_kanji2;
4789         }
4790       else
4791         goto invalid_code;
4792       if (charset->id != charset_ascii
4793           && last_id != charset->id)
4794         {
4795           if (last_id != charset_ascii)
4796             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4797           last_id = charset->id;
4798           last_offset = char_offset;
4799         }
4800       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4801       *charbuf++ = c;
4802       char_offset++;
4803       continue;
4804
4805     invalid_code:
4806       src = src_base;
4807       consumed_chars = consumed_chars_base;
4808       ONE_MORE_BYTE (c);
4809       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4810       char_offset++;
4811     }
4812
4813  no_more_source:
4814   if (last_id != charset_ascii)
4815     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4816   coding->consumed_char += consumed_chars_base;
4817   coding->consumed = src_base - coding->source;
4818   coding->charbuf_used = charbuf - coding->charbuf;
4819 }
4820
4821 static void
4822 decode_coding_big5 (struct coding_system *coding)
4823 {
4824   const unsigned char *src = coding->source + coding->consumed;
4825   const unsigned char *src_end = coding->source + coding->src_bytes;
4826   const unsigned char *src_base;
4827   int *charbuf = coding->charbuf + coding->charbuf_used;
4828   /* We may produce one charset annotation in one loop and one more at
4829      the end.  */
4830   int *charbuf_end
4831     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4832   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4833   bool multibytep = coding->src_multibyte;
4834   struct charset *charset_roman, *charset_big5;
4835   Lisp_Object attrs, charset_list, val;
4836   ptrdiff_t char_offset = coding->produced_char;
4837   ptrdiff_t last_offset = char_offset;
4838   int last_id = charset_ascii;
4839   bool eol_dos
4840     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4841   int byte_after_cr = -1;
4842
4843   CODING_GET_INFO (coding, attrs, charset_list);
4844   val = charset_list;
4845   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4846   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4847
4848   while (1)
4849     {
4850       int c, c1;
4851       struct charset *charset;
4852
4853       src_base = src;
4854       consumed_chars_base = consumed_chars;
4855
4856       if (charbuf >= charbuf_end)
4857         {
4858           if (byte_after_cr >= 0)
4859             src_base--;
4860           break;
4861         }
4862
4863       if (byte_after_cr >= 0)
4864         c = byte_after_cr, byte_after_cr = -1;
4865       else
4866         ONE_MORE_BYTE (c);
4867
4868       if (c < 0)
4869         goto invalid_code;
4870       if (c < 0x80)
4871         {
4872           if (eol_dos && c == '\r')
4873             ONE_MORE_BYTE (byte_after_cr);
4874           charset = charset_roman;
4875         }
4876       else
4877         {
4878           /* BIG5 -> Big5 */
4879           if (c < 0xA1 || c > 0xFE)
4880             goto invalid_code;
4881           ONE_MORE_BYTE (c1);
4882           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4883             goto invalid_code;
4884           c = c << 8 | c1;
4885           charset = charset_big5;
4886         }
4887       if (charset->id != charset_ascii
4888           && last_id != charset->id)
4889         {
4890           if (last_id != charset_ascii)
4891             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4892           last_id = charset->id;
4893           last_offset = char_offset;
4894         }
4895       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4896       *charbuf++ = c;
4897       char_offset++;
4898       continue;
4899
4900     invalid_code:
4901       src = src_base;
4902       consumed_chars = consumed_chars_base;
4903       ONE_MORE_BYTE (c);
4904       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4905       char_offset++;
4906     }
4907
4908  no_more_source:
4909   if (last_id != charset_ascii)
4910     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4911   coding->consumed_char += consumed_chars_base;
4912   coding->consumed = src_base - coding->source;
4913   coding->charbuf_used = charbuf - coding->charbuf;
4914 }
4915
4916 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4917    This function can encode charsets `ascii', `katakana-jisx0201',
4918    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4919    are sure that all these charsets are registered as official charset
4920    (i.e. do not have extended leading-codes).  Characters of other
4921    charsets are produced without any encoding.  */
4922
4923 static bool
4924 encode_coding_sjis (struct coding_system *coding)
4925 {
4926   bool multibytep = coding->dst_multibyte;
4927   int *charbuf = coding->charbuf;
4928   int *charbuf_end = charbuf + coding->charbuf_used;
4929   unsigned char *dst = coding->destination + coding->produced;
4930   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4931   int safe_room = 4;
4932   ptrdiff_t produced_chars = 0;
4933   Lisp_Object attrs, charset_list, val;
4934   bool ascii_compatible;
4935   struct charset *charset_kanji, *charset_kana;
4936   struct charset *charset_kanji2;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4942   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4943   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4944
4945   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4946
4947   while (charbuf < charbuf_end)
4948     {
4949       ASSURE_DESTINATION (safe_room);
4950       c = *charbuf++;
4951       /* Now encode the character C.  */
4952       if (ASCII_CHAR_P (c) && ascii_compatible)
4953         EMIT_ONE_ASCII_BYTE (c);
4954       else if (CHAR_BYTE8_P (c))
4955         {
4956           c = CHAR_TO_BYTE8 (c);
4957           EMIT_ONE_BYTE (c);
4958         }
4959       else
4960         {
4961           unsigned code;
4962           struct charset *charset;
4963           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4964                                &code, charset);
4965
4966           if (!charset)
4967             {
4968               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4969                 {
4970                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4971                   charset = CHARSET_FROM_ID (charset_ascii);
4972                 }
4973               else
4974                 {
4975                   c = coding->default_char;
4976                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4977                                        charset_list, &code, charset);
4978                 }
4979             }
4980           if (code == CHARSET_INVALID_CODE (charset))
4981             emacs_abort ();
4982           if (charset == charset_kanji)
4983             {
4984               int c1, c2;
4985               JIS_TO_SJIS (code);
4986               c1 = code >> 8, c2 = code & 0xFF;
4987               EMIT_TWO_BYTES (c1, c2);
4988             }
4989           else if (charset == charset_kana)
4990             EMIT_ONE_BYTE (code | 0x80);
4991           else if (charset_kanji2 && charset == charset_kanji2)
4992             {
4993               int c1, c2;
4994
4995               c1 = code >> 8;
4996               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4997                   || c1 == 0x28
4998                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4999                 {
5000                   JIS_TO_SJIS2 (code);
5001                   c1 = code >> 8, c2 = code & 0xFF;
5002                   EMIT_TWO_BYTES (c1, c2);
5003                 }
5004               else
5005                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5006             }
5007           else
5008             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5009         }
5010     }
5011   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5012   coding->produced_char += produced_chars;
5013   coding->produced = dst - coding->destination;
5014   return 0;
5015 }
5016
5017 static bool
5018 encode_coding_big5 (struct coding_system *coding)
5019 {
5020   bool multibytep = coding->dst_multibyte;
5021   int *charbuf = coding->charbuf;
5022   int *charbuf_end = charbuf + coding->charbuf_used;
5023   unsigned char *dst = coding->destination + coding->produced;
5024   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5025   int safe_room = 4;
5026   ptrdiff_t produced_chars = 0;
5027   Lisp_Object attrs, charset_list, val;
5028   bool ascii_compatible;
5029   struct charset *charset_big5;
5030   int c;
5031
5032   CODING_GET_INFO (coding, attrs, charset_list);
5033   val = XCDR (charset_list);
5034   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5035   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5036
5037   while (charbuf < charbuf_end)
5038     {
5039       ASSURE_DESTINATION (safe_room);
5040       c = *charbuf++;
5041       /* Now encode the character C.  */
5042       if (ASCII_CHAR_P (c) && ascii_compatible)
5043         EMIT_ONE_ASCII_BYTE (c);
5044       else if (CHAR_BYTE8_P (c))
5045         {
5046           c = CHAR_TO_BYTE8 (c);
5047           EMIT_ONE_BYTE (c);
5048         }
5049       else
5050         {
5051           unsigned code;
5052           struct charset *charset;
5053           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5054                                &code, charset);
5055
5056           if (! charset)
5057             {
5058               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5059                 {
5060                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5061                   charset = CHARSET_FROM_ID (charset_ascii);
5062                 }
5063               else
5064                 {
5065                   c = coding->default_char;
5066                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5067                                        charset_list, &code, charset);
5068                 }
5069             }
5070           if (code == CHARSET_INVALID_CODE (charset))
5071             emacs_abort ();
5072           if (charset == charset_big5)
5073             {
5074               int c1, c2;
5075
5076               c1 = code >> 8, c2 = code & 0xFF;
5077               EMIT_TWO_BYTES (c1, c2);
5078             }
5079           else
5080             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5081         }
5082     }
5083   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5084   coding->produced_char += produced_chars;
5085   coding->produced = dst - coding->destination;
5086   return 0;
5087 }
5088
5089 \f
5090 /*** 9. CCL handlers ***/
5091
5092 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5093    Return true if a text is encoded in a coding system of which
5094    encoder/decoder are written in CCL program.  */
5095
5096 static bool
5097 detect_coding_ccl (struct coding_system *coding,
5098                    struct coding_detection_info *detect_info)
5099 {
5100   const unsigned char *src = coding->source, *src_base;
5101   const unsigned char *src_end = coding->source + coding->src_bytes;
5102   bool multibytep = coding->src_multibyte;
5103   ptrdiff_t consumed_chars = 0;
5104   int found = 0;
5105   unsigned char *valids;
5106   ptrdiff_t head_ascii = coding->head_ascii;
5107   Lisp_Object attrs;
5108
5109   detect_info->checked |= CATEGORY_MASK_CCL;
5110
5111   coding = &coding_categories[coding_category_ccl];
5112   valids = CODING_CCL_VALIDS (coding);
5113   attrs = CODING_ID_ATTRS (coding->id);
5114   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5115     src += head_ascii;
5116
5117   while (1)
5118     {
5119       int c;
5120
5121       src_base = src;
5122       ONE_MORE_BYTE (c);
5123       if (c < 0 || ! valids[c])
5124         break;
5125       if ((valids[c] > 1))
5126         found = CATEGORY_MASK_CCL;
5127     }
5128   detect_info->rejected |= CATEGORY_MASK_CCL;
5129   return 0;
5130
5131  no_more_source:
5132   detect_info->found |= found;
5133   return 1;
5134 }
5135
5136 static void
5137 decode_coding_ccl (struct coding_system *coding)
5138 {
5139   const unsigned char *src = coding->source + coding->consumed;
5140   const unsigned char *src_end = coding->source + coding->src_bytes;
5141   int *charbuf = coding->charbuf + coding->charbuf_used;
5142   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5143   ptrdiff_t consumed_chars = 0;
5144   bool multibytep = coding->src_multibyte;
5145   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5146   int source_charbuf[1024];
5147   int source_byteidx[1025];
5148   Lisp_Object attrs, charset_list;
5149
5150   CODING_GET_INFO (coding, attrs, charset_list);
5151
5152   while (1)
5153     {
5154       const unsigned char *p = src;
5155       ptrdiff_t offset;
5156       int i = 0;
5157
5158       if (multibytep)
5159         {
5160           while (i < 1024 && p < src_end)
5161             {
5162               source_byteidx[i] = p - src;
5163               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5164             }
5165           source_byteidx[i] = p - src;
5166         }
5167       else
5168         while (i < 1024 && p < src_end)
5169           source_charbuf[i++] = *p++;
5170
5171       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5172         ccl->last_block = true;
5173       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5174       charset_map_loaded = 0;
5175       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5176                   charset_list);
5177       if (charset_map_loaded
5178           && (offset = coding_change_source (coding)))
5179         {
5180           p += offset;
5181           src += offset;
5182           src_end += offset;
5183         }
5184       charbuf += ccl->produced;
5185       if (multibytep)
5186         src += source_byteidx[ccl->consumed];
5187       else
5188         src += ccl->consumed;
5189       consumed_chars += ccl->consumed;
5190       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5191         break;
5192     }
5193
5194   switch (ccl->status)
5195     {
5196     case CCL_STAT_SUSPEND_BY_SRC:
5197       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5198       break;
5199     case CCL_STAT_SUSPEND_BY_DST:
5200       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5201       break;
5202     case CCL_STAT_QUIT:
5203     case CCL_STAT_INVALID_CMD:
5204       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5205       break;
5206     default:
5207       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5208       break;
5209     }
5210   coding->consumed_char += consumed_chars;
5211   coding->consumed = src - coding->source;
5212   coding->charbuf_used = charbuf - coding->charbuf;
5213 }
5214
5215 static bool
5216 encode_coding_ccl (struct coding_system *coding)
5217 {
5218   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5219   bool multibytep = coding->dst_multibyte;
5220   int *charbuf = coding->charbuf;
5221   int *charbuf_end = charbuf + coding->charbuf_used;
5222   unsigned char *dst = coding->destination + coding->produced;
5223   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5224   int destination_charbuf[1024];
5225   ptrdiff_t produced_chars = 0;
5226   int i;
5227   Lisp_Object attrs, charset_list;
5228
5229   CODING_GET_INFO (coding, attrs, charset_list);
5230   if (coding->consumed_char == coding->src_chars
5231       && coding->mode & CODING_MODE_LAST_BLOCK)
5232     ccl->last_block = true;
5233
5234   do
5235     {
5236       ptrdiff_t offset;
5237
5238       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5239       charset_map_loaded = 0;
5240       ccl_driver (ccl, charbuf, destination_charbuf,
5241                   charbuf_end - charbuf, 1024, charset_list);
5242       if (charset_map_loaded
5243           && (offset = coding_change_destination (coding)))
5244         dst += offset;
5245       if (multibytep)
5246         {
5247           ASSURE_DESTINATION (ccl->produced * 2);
5248           for (i = 0; i < ccl->produced; i++)
5249             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5250         }
5251       else
5252         {
5253           ASSURE_DESTINATION (ccl->produced);
5254           for (i = 0; i < ccl->produced; i++)
5255             *dst++ = destination_charbuf[i] & 0xFF;
5256           produced_chars += ccl->produced;
5257         }
5258       charbuf += ccl->consumed;
5259       if (ccl->status == CCL_STAT_QUIT
5260           || ccl->status == CCL_STAT_INVALID_CMD)
5261         break;
5262     }
5263   while (charbuf < charbuf_end);
5264
5265   switch (ccl->status)
5266     {
5267     case CCL_STAT_SUSPEND_BY_SRC:
5268       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5269       break;
5270     case CCL_STAT_SUSPEND_BY_DST:
5271       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5272       break;
5273     case CCL_STAT_QUIT:
5274     case CCL_STAT_INVALID_CMD:
5275       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5276       break;
5277     default:
5278       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5279       break;
5280     }
5281
5282   coding->produced_char += produced_chars;
5283   coding->produced = dst - coding->destination;
5284   return 0;
5285 }
5286
5287 \f
5288 /*** 10, 11. no-conversion handlers ***/
5289
5290 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5291
5292 static void
5293 decode_coding_raw_text (struct coding_system *coding)
5294 {
5295   bool eol_dos
5296     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5297
5298   coding->chars_at_source = 1;
5299   coding->consumed_char = coding->src_chars;
5300   coding->consumed = coding->src_bytes;
5301   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5302     {
5303       coding->consumed_char--;
5304       coding->consumed--;
5305       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5306     }
5307   else
5308     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5309 }
5310
5311 static bool
5312 encode_coding_raw_text (struct coding_system *coding)
5313 {
5314   bool multibytep = coding->dst_multibyte;
5315   int *charbuf = coding->charbuf;
5316   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5317   unsigned char *dst = coding->destination + coding->produced;
5318   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5319   ptrdiff_t produced_chars = 0;
5320   int c;
5321
5322   if (multibytep)
5323     {
5324       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5325
5326       if (coding->src_multibyte)
5327         while (charbuf < charbuf_end)
5328           {
5329             ASSURE_DESTINATION (safe_room);
5330             c = *charbuf++;
5331             if (ASCII_CHAR_P (c))
5332               EMIT_ONE_ASCII_BYTE (c);
5333             else if (CHAR_BYTE8_P (c))
5334               {
5335                 c = CHAR_TO_BYTE8 (c);
5336                 EMIT_ONE_BYTE (c);
5337               }
5338             else
5339               {
5340                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5341
5342                 CHAR_STRING_ADVANCE (c, p1);
5343                 do
5344                   {
5345                     EMIT_ONE_BYTE (*p0);
5346                     p0++;
5347                   }
5348                 while (p0 < p1);
5349               }
5350           }
5351       else
5352         while (charbuf < charbuf_end)
5353           {
5354             ASSURE_DESTINATION (safe_room);
5355             c = *charbuf++;
5356             EMIT_ONE_BYTE (c);
5357           }
5358     }
5359   else
5360     {
5361       if (coding->src_multibyte)
5362         {
5363           int safe_room = MAX_MULTIBYTE_LENGTH;
5364
5365           while (charbuf < charbuf_end)
5366             {
5367               ASSURE_DESTINATION (safe_room);
5368               c = *charbuf++;
5369               if (ASCII_CHAR_P (c))
5370                 *dst++ = c;
5371               else if (CHAR_BYTE8_P (c))
5372                 *dst++ = CHAR_TO_BYTE8 (c);
5373               else
5374                 CHAR_STRING_ADVANCE (c, dst);
5375             }
5376         }
5377       else
5378         {
5379           ASSURE_DESTINATION (charbuf_end - charbuf);
5380           while (charbuf < charbuf_end && dst < dst_end)
5381             *dst++ = *charbuf++;
5382         }
5383       produced_chars = dst - (coding->destination + coding->produced);
5384     }
5385   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5386   coding->produced_char += produced_chars;
5387   coding->produced = dst - coding->destination;
5388   return 0;
5389 }
5390
5391 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5392    Return true if a text is encoded in a charset-based coding system.  */
5393
5394 static bool
5395 detect_coding_charset (struct coding_system *coding,
5396                        struct coding_detection_info *detect_info)
5397 {
5398   const unsigned char *src = coding->source, *src_base;
5399   const unsigned char *src_end = coding->source + coding->src_bytes;
5400   bool multibytep = coding->src_multibyte;
5401   ptrdiff_t consumed_chars = 0;
5402   Lisp_Object attrs, valids, name;
5403   int found = 0;
5404   ptrdiff_t head_ascii = coding->head_ascii;
5405   bool check_latin_extra = 0;
5406
5407   detect_info->checked |= CATEGORY_MASK_CHARSET;
5408
5409   coding = &coding_categories[coding_category_charset];
5410   attrs = CODING_ID_ATTRS (coding->id);
5411   valids = AREF (attrs, coding_attr_charset_valids);
5412   name = CODING_ID_NAME (coding->id);
5413   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5414                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5415       || strncmp (SSDATA (SYMBOL_NAME (name)),
5416                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5417     check_latin_extra = 1;
5418
5419   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5420     src += head_ascii;
5421
5422   while (1)
5423     {
5424       int c;
5425       Lisp_Object val;
5426       struct charset *charset;
5427       int dim, idx;
5428
5429       src_base = src;
5430       ONE_MORE_BYTE (c);
5431       if (c < 0)
5432         continue;
5433       val = AREF (valids, c);
5434       if (NILP (val))
5435         break;
5436       if (c >= 0x80)
5437         {
5438           if (c < 0xA0
5439               && check_latin_extra
5440               && (!VECTORP (Vlatin_extra_code_table)
5441                   || NILP (AREF (Vlatin_extra_code_table, c))))
5442             break;
5443           found = CATEGORY_MASK_CHARSET;
5444         }
5445       if (INTEGERP (val))
5446         {
5447           charset = CHARSET_FROM_ID (XFASTINT (val));
5448           dim = CHARSET_DIMENSION (charset);
5449           for (idx = 1; idx < dim; idx++)
5450             {
5451               if (src == src_end)
5452                 goto too_short;
5453               ONE_MORE_BYTE (c);
5454               if (c < charset->code_space[(dim - 1 - idx) * 4]
5455                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5456                 break;
5457             }
5458           if (idx < dim)
5459             break;
5460         }
5461       else
5462         {
5463           idx = 1;
5464           for (; CONSP (val); val = XCDR (val))
5465             {
5466               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5467               dim = CHARSET_DIMENSION (charset);
5468               while (idx < dim)
5469                 {
5470                   if (src == src_end)
5471                     goto too_short;
5472                   ONE_MORE_BYTE (c);
5473                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5474                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5475                     break;
5476                   idx++;
5477                 }
5478               if (idx == dim)
5479                 {
5480                   val = Qnil;
5481                   break;
5482                 }
5483             }
5484           if (CONSP (val))
5485             break;
5486         }
5487     }
5488  too_short:
5489   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5490   return 0;
5491
5492  no_more_source:
5493   detect_info->found |= found;
5494   return 1;
5495 }
5496
5497 static void
5498 decode_coding_charset (struct coding_system *coding)
5499 {
5500   const unsigned char *src = coding->source + coding->consumed;
5501   const unsigned char *src_end = coding->source + coding->src_bytes;
5502   const unsigned char *src_base;
5503   int *charbuf = coding->charbuf + coding->charbuf_used;
5504   /* We may produce one charset annotation in one loop and one more at
5505      the end.  */
5506   int *charbuf_end
5507     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5508   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5509   bool multibytep = coding->src_multibyte;
5510   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5511   Lisp_Object valids;
5512   ptrdiff_t char_offset = coding->produced_char;
5513   ptrdiff_t last_offset = char_offset;
5514   int last_id = charset_ascii;
5515   bool eol_dos
5516     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5517   int byte_after_cr = -1;
5518
5519   valids = AREF (attrs, coding_attr_charset_valids);
5520
5521   while (1)
5522     {
5523       int c;
5524       Lisp_Object val;
5525       struct charset *charset;
5526       int dim;
5527       int len = 1;
5528       unsigned code;
5529
5530       src_base = src;
5531       consumed_chars_base = consumed_chars;
5532
5533       if (charbuf >= charbuf_end)
5534         {
5535           if (byte_after_cr >= 0)
5536             src_base--;
5537           break;
5538         }
5539
5540       if (byte_after_cr >= 0)
5541         {
5542           c = byte_after_cr;
5543           byte_after_cr = -1;
5544         }
5545       else
5546         {
5547           ONE_MORE_BYTE (c);
5548           if (eol_dos && c == '\r')
5549             ONE_MORE_BYTE (byte_after_cr);
5550         }
5551       if (c < 0)
5552         goto invalid_code;
5553       code = c;
5554
5555       val = AREF (valids, c);
5556       if (! INTEGERP (val) && ! CONSP (val))
5557         goto invalid_code;
5558       if (INTEGERP (val))
5559         {
5560           charset = CHARSET_FROM_ID (XFASTINT (val));
5561           dim = CHARSET_DIMENSION (charset);
5562           while (len < dim)
5563             {
5564               ONE_MORE_BYTE (c);
5565               code = (code << 8) | c;
5566               len++;
5567             }
5568           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5569                               charset, code, c);
5570         }
5571       else
5572         {
5573           /* VAL is a list of charset IDs.  It is assured that the
5574              list is sorted by charset dimensions (smaller one
5575              comes first).  */
5576           while (CONSP (val))
5577             {
5578               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5579               dim = CHARSET_DIMENSION (charset);
5580               while (len < dim)
5581                 {
5582                   ONE_MORE_BYTE (c);
5583                   code = (code << 8) | c;
5584                   len++;
5585                 }
5586               CODING_DECODE_CHAR (coding, src, src_base,
5587                                   src_end, charset, code, c);
5588               if (c >= 0)
5589                 break;
5590               val = XCDR (val);
5591             }
5592         }
5593       if (c < 0)
5594         goto invalid_code;
5595       if (charset->id != charset_ascii
5596           && last_id != charset->id)
5597         {
5598           if (last_id != charset_ascii)
5599             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5600           last_id = charset->id;
5601           last_offset = char_offset;
5602         }
5603
5604       *charbuf++ = c;
5605       char_offset++;
5606       continue;
5607
5608     invalid_code:
5609       src = src_base;
5610       consumed_chars = consumed_chars_base;
5611       ONE_MORE_BYTE (c);
5612       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5613       char_offset++;
5614     }
5615
5616  no_more_source:
5617   if (last_id != charset_ascii)
5618     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5619   coding->consumed_char += consumed_chars_base;
5620   coding->consumed = src_base - coding->source;
5621   coding->charbuf_used = charbuf - coding->charbuf;
5622 }
5623
5624 static bool
5625 encode_coding_charset (struct coding_system *coding)
5626 {
5627   bool multibytep = coding->dst_multibyte;
5628   int *charbuf = coding->charbuf;
5629   int *charbuf_end = charbuf + coding->charbuf_used;
5630   unsigned char *dst = coding->destination + coding->produced;
5631   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5632   int safe_room = MAX_MULTIBYTE_LENGTH;
5633   ptrdiff_t produced_chars = 0;
5634   Lisp_Object attrs, charset_list;
5635   bool ascii_compatible;
5636   int c;
5637
5638   CODING_GET_INFO (coding, attrs, charset_list);
5639   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5640
5641   while (charbuf < charbuf_end)
5642     {
5643       struct charset *charset;
5644       unsigned code;
5645
5646       ASSURE_DESTINATION (safe_room);
5647       c = *charbuf++;
5648       if (ascii_compatible && ASCII_CHAR_P (c))
5649         EMIT_ONE_ASCII_BYTE (c);
5650       else if (CHAR_BYTE8_P (c))
5651         {
5652           c = CHAR_TO_BYTE8 (c);
5653           EMIT_ONE_BYTE (c);
5654         }
5655       else
5656         {
5657           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5658                                &code, charset);
5659
5660           if (charset)
5661             {
5662               if (CHARSET_DIMENSION (charset) == 1)
5663                 EMIT_ONE_BYTE (code);
5664               else if (CHARSET_DIMENSION (charset) == 2)
5665                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5666               else if (CHARSET_DIMENSION (charset) == 3)
5667                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5668               else
5669                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5670                                  (code >> 8) & 0xFF, code & 0xFF);
5671             }
5672           else
5673             {
5674               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5675                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5676               else
5677                 c = coding->default_char;
5678               EMIT_ONE_BYTE (c);
5679             }
5680         }
5681     }
5682
5683   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5684   coding->produced_char += produced_chars;
5685   coding->produced = dst - coding->destination;
5686   return 0;
5687 }
5688
5689 \f
/*** 10. C library functions ***/
5691
5692 /* Setup coding context CODING from information about CODING_SYSTEM.
5693    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5694    CODING_SYSTEM is invalid, signal an error.  */
5695
5696 void
5697 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5698 {
5699   Lisp_Object attrs;
5700   Lisp_Object eol_type;
5701   Lisp_Object coding_type;
5702   Lisp_Object val;
5703
5704   if (NILP (coding_system))
5705     coding_system = Qundecided;
5706
5707   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5708
5709   attrs = CODING_ID_ATTRS (coding->id);
5710   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5711
5712   coding->mode = 0;
5713   if (VECTORP (eol_type))
5714     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5715                             | CODING_REQUIRE_DETECTION_MASK);
5716   else if (! EQ (eol_type, Qunix))
5717     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5718                             | CODING_REQUIRE_ENCODING_MASK);
5719   else
5720     coding->common_flags = 0;
5721   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5722     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5723   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5724     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5725   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5726     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5727
5728   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5729   coding->max_charset_id = SCHARS (val) - 1;
5730   coding->safe_charsets = SDATA (val);
5731   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5732   coding->carryover_bytes = 0;
5733   coding->raw_destination = 0;
5734
5735   coding_type = CODING_ATTR_TYPE (attrs);
5736   if (EQ (coding_type, Qundecided))
5737     {
5738       coding->detector = NULL;
5739       coding->decoder = decode_coding_raw_text;
5740       coding->encoder = encode_coding_raw_text;
5741       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5742       coding->spec.undecided.inhibit_nbd
5743         = (encode_inhibit_flag
5744            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5745       coding->spec.undecided.inhibit_ied
5746         = (encode_inhibit_flag
5747            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5748       coding->spec.undecided.prefer_utf_8
5749         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5750     }
5751   else if (EQ (coding_type, Qiso_2022))
5752     {
5753       int i;
5754       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5755
5756       /* Invoke graphic register 0 to plane 0.  */
5757       CODING_ISO_INVOCATION (coding, 0) = 0;
5758       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5759       CODING_ISO_INVOCATION (coding, 1)
5760         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5761       /* Setup the initial status of designation.  */
5762       for (i = 0; i < 4; i++)
5763         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5764       /* Not single shifting initially.  */
5765       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5766       /* Beginning of buffer should also be regarded as bol. */
5767       CODING_ISO_BOL (coding) = 1;
5768       coding->detector = detect_coding_iso_2022;
5769       coding->decoder = decode_coding_iso_2022;
5770       coding->encoder = encode_coding_iso_2022;
5771       if (flags & CODING_ISO_FLAG_SAFE)
5772         coding->mode |= CODING_MODE_SAFE_ENCODING;
5773       coding->common_flags
5774         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5775             | CODING_REQUIRE_FLUSHING_MASK);
5776       if (flags & CODING_ISO_FLAG_COMPOSITION)
5777         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5778       if (flags & CODING_ISO_FLAG_DESIGNATION)
5779         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5780       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5781         {
5782           setup_iso_safe_charsets (attrs);
5783           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5784           coding->max_charset_id = SCHARS (val) - 1;
5785           coding->safe_charsets = SDATA (val);
5786         }
5787       CODING_ISO_FLAGS (coding) = flags;
5788       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5789       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5790       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5791       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5792     }
5793   else if (EQ (coding_type, Qcharset))
5794     {
5795       coding->detector = detect_coding_charset;
5796       coding->decoder = decode_coding_charset;
5797       coding->encoder = encode_coding_charset;
5798       coding->common_flags
5799         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5800     }
5801   else if (EQ (coding_type, Qutf_8))
5802     {
5803       val = AREF (attrs, coding_attr_utf_bom);
5804       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5805                                    : EQ (val, Qt) ? utf_with_bom
5806                                    : utf_without_bom);
5807       coding->detector = detect_coding_utf_8;
5808       coding->decoder = decode_coding_utf_8;
5809       coding->encoder = encode_coding_utf_8;
5810       coding->common_flags
5811         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5812       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5813         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5814     }
5815   else if (EQ (coding_type, Qutf_16))
5816     {
5817       val = AREF (attrs, coding_attr_utf_bom);
5818       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5819                                     : EQ (val, Qt) ? utf_with_bom
5820                                     : utf_without_bom);
5821       val = AREF (attrs, coding_attr_utf_16_endian);
5822       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5823                                        : utf_16_little_endian);
5824       CODING_UTF_16_SURROGATE (coding) = 0;
5825       coding->detector = detect_coding_utf_16;
5826       coding->decoder = decode_coding_utf_16;
5827       coding->encoder = encode_coding_utf_16;
5828       coding->common_flags
5829         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5830       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5831         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5832     }
5833   else if (EQ (coding_type, Qccl))
5834     {
5835       coding->detector = detect_coding_ccl;
5836       coding->decoder = decode_coding_ccl;
5837       coding->encoder = encode_coding_ccl;
5838       coding->common_flags
5839         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5840             | CODING_REQUIRE_FLUSHING_MASK);
5841     }
5842   else if (EQ (coding_type, Qemacs_mule))
5843     {
5844       coding->detector = detect_coding_emacs_mule;
5845       coding->decoder = decode_coding_emacs_mule;
5846       coding->encoder = encode_coding_emacs_mule;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5850           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5851         {
5852           Lisp_Object tail, safe_charsets;
5853           int max_charset_id = 0;
5854
5855           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5856                tail = XCDR (tail))
5857             if (max_charset_id < XFASTINT (XCAR (tail)))
5858               max_charset_id = XFASTINT (XCAR (tail));
5859           safe_charsets = make_uninit_string (max_charset_id + 1);
5860           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5861           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5862                tail = XCDR (tail))
5863             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5864           coding->max_charset_id = max_charset_id;
5865           coding->safe_charsets = SDATA (safe_charsets);
5866         }
5867       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5868       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5869     }
5870   else if (EQ (coding_type, Qshift_jis))
5871     {
5872       coding->detector = detect_coding_sjis;
5873       coding->decoder = decode_coding_sjis;
5874       coding->encoder = encode_coding_sjis;
5875       coding->common_flags
5876         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5877     }
5878   else if (EQ (coding_type, Qbig5))
5879     {
5880       coding->detector = detect_coding_big5;
5881       coding->decoder = decode_coding_big5;
5882       coding->encoder = encode_coding_big5;
5883       coding->common_flags
5884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5885     }
5886   else                          /* EQ (coding_type, Qraw_text) */
5887     {
5888       coding->detector = NULL;
5889       coding->decoder = decode_coding_raw_text;
5890       coding->encoder = encode_coding_raw_text;
5891       if (! EQ (eol_type, Qunix))
5892         {
5893           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5894           if (! VECTORP (eol_type))
5895             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5896         }
5897
5898     }
5899
5900   return;
5901 }
5902
5903 /* Return a list of charsets supported by CODING.  */
5904
5905 Lisp_Object
5906 coding_charset_list (struct coding_system *coding)
5907 {
5908   Lisp_Object attrs, charset_list;
5909
5910   CODING_GET_INFO (coding, attrs, charset_list);
5911   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5912     {
5913       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5914
5915       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5916         charset_list = Viso_2022_charset_list;
5917     }
5918   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5919     {
5920       charset_list = Vemacs_mule_charset_list;
5921     }
5922   return charset_list;
5923 }
5924
5925
5926 /* Return a list of charsets supported by CODING-SYSTEM.  */
5927
5928 Lisp_Object
5929 coding_system_charset_list (Lisp_Object coding_system)
5930 {
5931   ptrdiff_t id;
5932   Lisp_Object attrs, charset_list;
5933
5934   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5935   attrs = CODING_ID_ATTRS (id);
5936
5937   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5938     {
5939       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5940
5941       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5942         charset_list = Viso_2022_charset_list;
5943       else
5944         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5945     }
5946   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5947     {
5948       charset_list = Vemacs_mule_charset_list;
5949     }
5950   else
5951     {
5952       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5953     }
5954   return charset_list;
5955 }
5956
5957
5958 /* Return raw-text or one of its subsidiaries that has the same
5959    eol_type as CODING-SYSTEM.  */
5960
5961 Lisp_Object
5962 raw_text_coding_system (Lisp_Object coding_system)
5963 {
5964   Lisp_Object spec, attrs;
5965   Lisp_Object eol_type, raw_text_eol_type;
5966
5967   if (NILP (coding_system))
5968     return Qraw_text;
5969   spec = CODING_SYSTEM_SPEC (coding_system);
5970   attrs = AREF (spec, 0);
5971
5972   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5973     return coding_system;
5974
5975   eol_type = AREF (spec, 2);
5976   if (VECTORP (eol_type))
5977     return Qraw_text;
5978   spec = CODING_SYSTEM_SPEC (Qraw_text);
5979   raw_text_eol_type = AREF (spec, 2);
5980   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5981           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5982           : AREF (raw_text_eol_type, 2));
5983 }
5984
5985 /* Return true if CODING corresponds to raw-text coding-system.  */
5986
5987 bool
5988 raw_text_coding_system_p (struct coding_system *coding)
5989 {
5990   return (coding->decoder == decode_coding_raw_text
5991           && coding->encoder == encode_coding_raw_text) ? true : false;
5992 }
5993
5994
5995 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5996    the subsidiary that has the same eol-spec as PARENT (if it is not
5997    nil and specifies end-of-line format) or the system's setting
5998    (system_eol_type).  */
5999
6000 Lisp_Object
6001 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6002 {
6003   Lisp_Object spec, eol_type;
6004
6005   if (NILP (coding_system))
6006     coding_system = Qraw_text;
6007   else
6008     CHECK_CODING_SYSTEM (coding_system);
6009   spec = CODING_SYSTEM_SPEC (coding_system);
6010   eol_type = AREF (spec, 2);
6011   if (VECTORP (eol_type))
6012     {
6013       Lisp_Object parent_eol_type;
6014
6015       if (! NILP (parent))
6016         {
6017           Lisp_Object parent_spec;
6018
6019           CHECK_CODING_SYSTEM (parent);
6020           parent_spec = CODING_SYSTEM_SPEC (parent);
6021           parent_eol_type = AREF (parent_spec, 2);
6022           if (VECTORP (parent_eol_type))
6023             parent_eol_type = system_eol_type;
6024         }
6025       else
6026         parent_eol_type = system_eol_type;
6027       if (EQ (parent_eol_type, Qunix))
6028         coding_system = AREF (eol_type, 0);
6029       else if (EQ (parent_eol_type, Qdos))
6030         coding_system = AREF (eol_type, 1);
6031       else if (EQ (parent_eol_type, Qmac))
6032         coding_system = AREF (eol_type, 2);
6033     }
6034   return coding_system;
6035 }
6036
6037
6038 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6039    decided for writing to a process.  If not, complement them, and
6040    return a new coding system.  */
6041
6042 Lisp_Object
6043 complement_process_encoding_system (Lisp_Object coding_system)
6044 {
6045   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6046   Lisp_Object spec, attrs;
6047   int i;
6048
6049   for (i = 0; i < 3; i++)
6050     {
6051       if (i == 1)
6052         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6053       else if (i == 2)
6054         coding_system = preferred_coding_system ();
6055       CHECK_CODING_SYSTEM (coding_system);
6056       spec = CODING_SYSTEM_SPEC (coding_system);
6057       if (NILP (spec))
6058         continue;
6059       attrs = AREF (spec, 0);
6060       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6061         coding_base = CODING_ATTR_BASE_NAME (attrs);
6062       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6063         eol_base = coding_system;
6064       if (! NILP (coding_base) && ! NILP (eol_base))
6065         break;
6066     }
6067
6068   if (i > 0)
6069     /* The original CODING_SYSTEM didn't specify text-conversion or
6070        eol-conversion.  Be sure that we return a fully complemented
6071        coding system.  */
6072     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6073   return coding_system;
6074 }
6075
6076
6077 /* Emacs has a mechanism to automatically detect a coding system if it
6078    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6079    it's impossible to distinguish some coding systems accurately
6080    because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
6082
6083    o coding-category-emacs-mule
6084
6085         The category for a coding system which has the same code range
6086         as Emacs' internal format.  Assigned the coding-system (Lisp
6087         symbol) `emacs-mule' by default.
6088
6089    o coding-category-sjis
6090
6091         The category for a coding system which has the same code range
6092         as SJIS.  Assigned the coding-system (Lisp
6093         symbol) `japanese-shift-jis' by default.
6094
6095    o coding-category-iso-7
6096
6097         The category for a coding system which has the same code range
6098         as ISO2022 of 7-bit environment.  This doesn't use any locking
6099         shift and single shift functions.  This can encode/decode all
6100         charsets.  Assigned the coding-system (Lisp symbol)
6101         `iso-2022-7bit' by default.
6102
6103    o coding-category-iso-7-tight
6104
6105         Same as coding-category-iso-7 except that this can
6106         encode/decode only the specified charsets.
6107
6108    o coding-category-iso-8-1
6109
6110         The category for a coding system which has the same code range
6111         as ISO2022 of 8-bit environment and graphic plane 1 used only
6112         for DIMENSION1 charset.  This doesn't use any locking shift
6113         and single shift functions.  Assigned the coding-system (Lisp
6114         symbol) `iso-latin-1' by default.
6115
6116    o coding-category-iso-8-2
6117
6118         The category for a coding system which has the same code range
6119         as ISO2022 of 8-bit environment and graphic plane 1 used only
6120         for DIMENSION2 charset.  This doesn't use any locking shift
6121         and single shift functions.  Assigned the coding-system (Lisp
6122         symbol) `japanese-iso-8bit' by default.
6123
6124    o coding-category-iso-7-else
6125
6126         The category for a coding system which has the same code range
6127         as ISO2022 of 7-bit environment but uses locking shift or
6128         single shift functions.  Assigned the coding-system (Lisp
6129         symbol) `iso-2022-7bit-lock' by default.
6130
6131    o coding-category-iso-8-else
6132
6133         The category for a coding system which has the same code range
6134         as ISO2022 of 8-bit environment but uses locking shift or
6135         single shift functions.  Assigned the coding-system (Lisp
6136         symbol) `iso-2022-8bit-ss2' by default.
6137
6138    o coding-category-big5
6139
6140         The category for a coding system which has the same code range
6141         as BIG5.  Assigned the coding-system (Lisp symbol)
6142         `cn-big5' by default.
6143
6144    o coding-category-utf-8
6145
6146         The category for a coding system which has the same code range
6147         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6148         symbol) `utf-8' by default.
6149
6150    o coding-category-utf-16-be
6151
6152         The category for a coding system in which a text has an
6153         Unicode signature (cf. Unicode Standard) in the order of BIG
6154         endian at the head.  Assigned the coding-system (Lisp symbol)
6155         `utf-16-be' by default.
6156
6157    o coding-category-utf-16-le
6158
6159         The category for a coding system in which a text has an
6160         Unicode signature (cf. Unicode Standard) in the order of
6161         LITTLE endian at the head.  Assigned the coding-system (Lisp
6162         symbol) `utf-16-le' by default.
6163
6164    o coding-category-ccl
6165
6166         The category for a coding system of which encoder/decoder is
6167         written in CCL programs.  The default value is nil, i.e., no
6168         coding system is assigned.
6169
6170    o coding-category-binary
6171
6172         The category for a coding system not categorized in any of the
6173         above.  Assigned the coding-system (Lisp symbol)
6174         `no-conversion' by default.
6175
6176    Each of them is a Lisp symbol and the value is an actual
6177    `coding-system's (this is also a Lisp symbol) assigned by a user.
6178    What Emacs does actually is to detect a category of coding system.
6179    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6180    decide only one possible category, it selects a category of the
6181    highest priority.  Priorities of categories are also specified by a
6182    user in a Lisp variable `coding-category-list'.
6183
6184 */
6185
6186 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6187                                            int eol_seen);
6188
6189
6190 /* Return the number of ASCII characters at the head of the source.
6191    By side effects, set coding->head_ascii and update
6192    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6193    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6194    reliable only when all the source bytes are ASCII.  */
6195
6196 static ptrdiff_t
6197 check_ascii (struct coding_system *coding)
6198 {
6199   const unsigned char *src, *end;
6200   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6201   int eol_seen = coding->eol_seen;
6202
6203   coding_set_source (coding);
6204   src = coding->source;
6205   end = src + coding->src_bytes;
6206
6207   if (inhibit_eol_conversion
6208       || SYMBOLP (eol_type))
6209     {
6210       /* We don't have to check EOL format.  */
6211       while (src < end && !( *src & 0x80))
6212         {
6213           if (*src++ == '\n')
6214             eol_seen |= EOL_SEEN_LF;
6215         }
6216     }
6217   else
6218     {
6219       end--;                /* We look ahead one byte for "CR LF".  */
6220       while (src < end)
6221         {
6222           int c = *src;
6223
6224           if (c & 0x80)
6225             break;
6226           src++;
6227           if (c == '\r')
6228             {
6229               if (*src == '\n')
6230                 {
6231                   eol_seen |= EOL_SEEN_CRLF;
6232                   src++;
6233                 }
6234               else
6235                 eol_seen |= EOL_SEEN_CR;
6236             }
6237           else if (c == '\n')
6238             eol_seen |= EOL_SEEN_LF;
6239         }
6240       if (src == end)
6241         {
6242           int c = *src;
6243
6244           /* All bytes but the last one C are ASCII.  */
6245           if (! (c & 0x80))
6246             {
6247               if (c == '\r')
6248                 eol_seen |= EOL_SEEN_CR;
6249               else if (c  == '\n')
6250                 eol_seen |= EOL_SEEN_LF;
6251               src++;
6252             }
6253         }
6254     }
6255   coding->head_ascii = src - coding->source;
6256   coding->eol_seen = eol_seen;
6257   return (coding->head_ascii);
6258 }
6259
6260
6261 /* Return the number of characters at the source if all the bytes are
6262    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6263    effects, update coding->eol_seen.  The value of coding->eol_seen is
6264    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6265    the value is reliable only when all the source bytes are valid
6266    UTF-8.  */
6267
6268 static ptrdiff_t
6269 check_utf_8 (struct coding_system *coding)
6270 {
6271   const unsigned char *src, *end;
6272   int eol_seen;
6273   ptrdiff_t nchars = coding->head_ascii;
6274
6275   if (coding->head_ascii < 0)
6276     check_ascii (coding);
6277   else
6278     coding_set_source (coding);
6279   src = coding->source + coding->head_ascii;
6280   /* We look ahead one byte for CR LF.  */
6281   end = coding->source + coding->src_bytes - 1;
6282   eol_seen = coding->eol_seen;
6283   while (src < end)
6284     {
6285       int c = *src;
6286
6287       if (UTF_8_1_OCTET_P (*src))
6288         {
6289           src++;
6290           if (c < 0x20)
6291             {
6292               if (c == '\r')
6293                 {
6294                   if (*src == '\n')
6295                     {
6296                       eol_seen |= EOL_SEEN_CRLF;
6297                       src++;
6298                       nchars++;
6299                     }
6300                   else
6301                     eol_seen |= EOL_SEEN_CR;
6302                 }
6303               else if (c == '\n')
6304                 eol_seen |= EOL_SEEN_LF;
6305             }
6306         }
6307       else if (UTF_8_2_OCTET_LEADING_P (c))
6308         {
6309           if (c < 0xC2          /* overlong sequence */
6310               || src + 1 >= end
6311               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6312             return -1;
6313           src += 2;
6314         }
6315       else if (UTF_8_3_OCTET_LEADING_P (c))
6316         {
6317           if (src + 2 >= end
6318               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6319                     && UTF_8_EXTRA_OCTET_P (src[2])))
6320             return -1;
6321           c = (((c & 0xF) << 12)
6322                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6323           if (c < 0x800                       /* overlong sequence */
6324               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6325             return -1;
6326           src += 3;
6327         }
6328       else if (UTF_8_4_OCTET_LEADING_P (c))
6329         {
6330           if (src + 3 >= end
6331               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6332                     && UTF_8_EXTRA_OCTET_P (src[2])
6333                     && UTF_8_EXTRA_OCTET_P (src[3])))
6334             return -1;
6335           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6336                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6337           if (c < 0x10000       /* overlong sequence */
6338               || c >= 0x110000) /* non-Unicode character  */
6339             return -1;
6340           src += 4;
6341         }
6342       else
6343         return -1;
6344       nchars++;
6345     }
6346
6347   if (src == end)
6348     {
6349       if (! UTF_8_1_OCTET_P (*src))
6350         return -1;
6351       nchars++;
6352       if (*src == '\r')
6353         eol_seen |= EOL_SEEN_CR;
6354       else if (*src  == '\n')
6355         eol_seen |= EOL_SEEN_LF;
6356     }
6357   coding->eol_seen = eol_seen;
6358   return nchars;
6359 }
6360
6361
6362 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6363    SOURCE is encoded.  If CATEGORY is one of
6364    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6365    two-byte, else they are encoded by one-byte.
6366
6367    Return one of EOL_SEEN_XXX.  */
6368
6369 #define MAX_EOL_CHECK_COUNT 3
6370
6371 static int
6372 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6373             enum coding_category category)
6374 {
6375   const unsigned char *src = source, *src_end = src + src_bytes;
6376   unsigned char c;
6377   int total  = 0;
6378   int eol_seen = EOL_SEEN_NONE;
6379
6380   if ((1 << category) & CATEGORY_MASK_UTF_16)
6381     {
6382       bool msb = category == (coding_category_utf_16_le
6383                               | coding_category_utf_16_le_nosig);
6384       bool lsb = !msb;
6385
6386       while (src + 1 < src_end)
6387         {
6388           c = src[lsb];
6389           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6390             {
6391               int this_eol;
6392
6393               if (c == '\n')
6394                 this_eol = EOL_SEEN_LF;
6395               else if (src + 3 >= src_end
6396                        || src[msb + 2] != 0
6397                        || src[lsb + 2] != '\n')
6398                 this_eol = EOL_SEEN_CR;
6399               else
6400                 {
6401                   this_eol = EOL_SEEN_CRLF;
6402                   src += 2;
6403                 }
6404
6405               if (eol_seen == EOL_SEEN_NONE)
6406                 /* This is the first end-of-line.  */
6407                 eol_seen = this_eol;
6408               else if (eol_seen != this_eol)
6409                 {
6410                   /* The found type is different from what found before.
6411                      Allow for stray ^M characters in DOS EOL files.  */
6412                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6413                       || (eol_seen == EOL_SEEN_CRLF
6414                           && this_eol == EOL_SEEN_CR))
6415                     eol_seen = EOL_SEEN_CRLF;
6416                   else
6417                     {
6418                       eol_seen = EOL_SEEN_LF;
6419                       break;
6420                     }
6421                 }
6422               if (++total == MAX_EOL_CHECK_COUNT)
6423                 break;
6424             }
6425           src += 2;
6426         }
6427     }
6428   else
6429     while (src < src_end)
6430       {
6431         c = *src++;
6432         if (c == '\n' || c == '\r')
6433           {
6434             int this_eol;
6435
6436             if (c == '\n')
6437               this_eol = EOL_SEEN_LF;
6438             else if (src >= src_end || *src != '\n')
6439               this_eol = EOL_SEEN_CR;
6440             else
6441               this_eol = EOL_SEEN_CRLF, src++;
6442
6443             if (eol_seen == EOL_SEEN_NONE)
6444               /* This is the first end-of-line.  */
6445               eol_seen = this_eol;
6446             else if (eol_seen != this_eol)
6447               {
6448                 /* The found type is different from what found before.
6449                    Allow for stray ^M characters in DOS EOL files.  */
6450                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6451                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6452                   eol_seen = EOL_SEEN_CRLF;
6453                 else
6454                   {
6455                     eol_seen = EOL_SEEN_LF;
6456                     break;
6457                   }
6458               }
6459             if (++total == MAX_EOL_CHECK_COUNT)
6460               break;
6461           }
6462       }
6463   return eol_seen;
6464 }
6465
6466
6467 static Lisp_Object
6468 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6469 {
6470   Lisp_Object eol_type;
6471
6472   eol_type = CODING_ID_EOL_TYPE (coding->id);
6473   if (! VECTORP (eol_type))
6474     /* Already adjusted.  */
6475     return eol_type;
6476   if (eol_seen & EOL_SEEN_LF)
6477     {
6478       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6479       eol_type = Qunix;
6480     }
6481   else if (eol_seen & EOL_SEEN_CRLF)
6482     {
6483       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6484       eol_type = Qdos;
6485     }
6486   else if (eol_seen & EOL_SEEN_CR)
6487     {
6488       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6489       eol_type = Qmac;
6490     }
6491   return eol_type;
6492 }
6493
6494 /* Detect how a text specified in CODING is encoded.  If a coding
6495    system is detected, update fields of CODING by the detected coding
6496    system.  */
6497
6498 static void
6499 detect_coding (struct coding_system *coding)
6500 {
6501   const unsigned char *src, *src_end;
6502   unsigned int saved_mode = coding->mode;
6503   Lisp_Object found = Qnil;
6504   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6505
6506   coding->consumed = coding->consumed_char = 0;
6507   coding->produced = coding->produced_char = 0;
6508   coding_set_source (coding);
6509
6510   src_end = coding->source + coding->src_bytes;
6511
6512   coding->eol_seen = EOL_SEEN_NONE;
6513   /* If we have not yet decided the text encoding type, detect it
6514      now.  */
6515   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6516     {
6517       int c, i;
6518       struct coding_detection_info detect_info;
6519       bool null_byte_found = 0, eight_bit_found = 0;
6520       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6521                                        inhibit_null_byte_detection);
6522       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6523                                        inhibit_iso_escape_detection);
6524       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6525
6526       coding->head_ascii = 0;
6527       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6528       for (src = coding->source; src < src_end; src++)
6529         {
6530           c = *src;
6531           if (c & 0x80)
6532             {
6533               eight_bit_found = 1;
6534               if (null_byte_found)
6535                 break;
6536             }
6537           else if (c < 0x20)
6538             {
6539               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6540                   && ! inhibit_ied
6541                   && ! detect_info.checked)
6542                 {
6543                   if (detect_coding_iso_2022 (coding, &detect_info))
6544                     {
6545                       /* We have scanned the whole data.  */
6546                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6547                         {
6548                           /* We didn't find an 8-bit code.  We may
6549                              have found a null-byte, but it's very
6550                              rare that a binary file conforms to
6551                              ISO-2022.  */
6552                           src = src_end;
6553                           coding->head_ascii = src - coding->source;
6554                         }
6555                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6556                       break;
6557                     }
6558                 }
6559               else if (! c && !inhibit_nbd)
6560                 {
6561                   null_byte_found = 1;
6562                   if (eight_bit_found)
6563                     break;
6564                 }
6565               else if (! disable_ascii_optimization
6566                        && ! inhibit_eol_conversion)
6567                 {
6568                   if (c == '\r')
6569                     {
6570                       if (src < src_end && src[1] == '\n')
6571                         {
6572                           coding->eol_seen |= EOL_SEEN_CRLF;
6573                           src++;
6574                           if (! eight_bit_found)
6575                             coding->head_ascii++;
6576                         }
6577                       else
6578                         coding->eol_seen |= EOL_SEEN_CR;
6579                     }
6580                   else if (c == '\n')
6581                     {
6582                       coding->eol_seen |= EOL_SEEN_LF;
6583                     }
6584                 }
6585
6586               if (! eight_bit_found)
6587                 coding->head_ascii++;
6588             }
6589           else if (! eight_bit_found)
6590             coding->head_ascii++;
6591         }
6592
6593       if (null_byte_found || eight_bit_found
6594           || coding->head_ascii < coding->src_bytes
6595           || detect_info.found)
6596         {
6597           enum coding_category category;
6598           struct coding_system *this;
6599
6600           if (coding->head_ascii == coding->src_bytes)
6601             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6602             for (i = 0; i < coding_category_raw_text; i++)
6603               {
6604                 category = coding_priorities[i];
6605                 this = coding_categories + category;
6606                 if (detect_info.found & (1 << category))
6607                   break;
6608               }
6609           else
6610             {
6611               if (null_byte_found)
6612                 {
6613                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6614                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6615                 }
6616               else if (prefer_utf_8
6617                        && detect_coding_utf_8 (coding, &detect_info))
6618                 {
6619                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6620                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6621                 }
6622               for (i = 0; i < coding_category_raw_text; i++)
6623                 {
6624                   category = coding_priorities[i];
6625                   this = coding_categories + category;
6626                   /* Some of this->detector (e.g. detect_coding_sjis)
6627                      require this information.  */
6628                   coding->id = this->id;
6629                   if (this->id < 0)
6630                     {
6631                       /* No coding system of this category is defined.  */
6632                       detect_info.rejected |= (1 << category);
6633                     }
6634                   else if (category >= coding_category_raw_text)
6635                     continue;
6636                   else if (detect_info.checked & (1 << category))
6637                     {
6638                       if (detect_info.found & (1 << category))
6639                         break;
6640                     }
6641                   else if ((*(this->detector)) (coding, &detect_info)
6642                            && detect_info.found & (1 << category))
6643                     break;
6644                 }
6645             }
6646
6647           if (i < coding_category_raw_text)
6648             {
6649               if (category == coding_category_utf_8_auto)
6650                 {
6651                   Lisp_Object coding_systems;
6652
6653                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6654                                          coding_attr_utf_bom);
6655                   if (CONSP (coding_systems))
6656                     {
6657                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6658                         found = XCAR (coding_systems);
6659                       else
6660                         found = XCDR (coding_systems);
6661                     }
6662                   else
6663                     found = CODING_ID_NAME (this->id);
6664                 }
6665               else if (category == coding_category_utf_16_auto)
6666                 {
6667                   Lisp_Object coding_systems;
6668
6669                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6670                                          coding_attr_utf_bom);
6671                   if (CONSP (coding_systems))
6672                     {
6673                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6674                         found = XCAR (coding_systems);
6675                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6676                         found = XCDR (coding_systems);
6677                     }
6678                   else
6679                     found = CODING_ID_NAME (this->id);
6680                 }
6681               else
6682                 found = CODING_ID_NAME (this->id);
6683             }
6684           else if (null_byte_found)
6685             found = Qno_conversion;
6686           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6687                    == CATEGORY_MASK_ANY)
6688             found = Qraw_text;
6689           else if (detect_info.rejected)
6690             for (i = 0; i < coding_category_raw_text; i++)
6691               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6692                 {
6693                   this = coding_categories + coding_priorities[i];
6694                   found = CODING_ID_NAME (this->id);
6695                   break;
6696                 }
6697         }
6698     }
6699   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6700            == coding_category_utf_8_auto)
6701     {
6702       Lisp_Object coding_systems;
6703       struct coding_detection_info detect_info;
6704
6705       coding_systems
6706         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6707       detect_info.found = detect_info.rejected = 0;
6708       if (check_ascii (coding) == coding->src_bytes)
6709         {
6710           if (CONSP (coding_systems))
6711             found = XCDR (coding_systems);
6712         }
6713       else
6714         {
6715           if (CONSP (coding_systems)
6716               && detect_coding_utf_8 (coding, &detect_info))
6717             {
6718               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6719                 found = XCAR (coding_systems);
6720               else
6721                 found = XCDR (coding_systems);
6722             }
6723         }
6724     }
6725   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6726            == coding_category_utf_16_auto)
6727     {
6728       Lisp_Object coding_systems;
6729       struct coding_detection_info detect_info;
6730
6731       coding_systems
6732         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6733       detect_info.found = detect_info.rejected = 0;
6734       coding->head_ascii = 0;
6735       if (CONSP (coding_systems)
6736           && detect_coding_utf_16 (coding, &detect_info))
6737         {
6738           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6739             found = XCAR (coding_systems);
6740           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6741             found = XCDR (coding_systems);
6742         }
6743     }
6744
6745   if (! NILP (found))
6746     {
6747       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6748                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6749                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6750                            : EOL_SEEN_LF);
6751
6752       setup_coding_system (found, coding);
6753       if (specified_eol != EOL_SEEN_NONE)
6754         adjust_coding_eol_type (coding, specified_eol);
6755     }
6756
6757   coding->mode = saved_mode;
6758 }
6759
6760
6761 static void
6762 decode_eol (struct coding_system *coding)
6763 {
6764   Lisp_Object eol_type;
6765   unsigned char *p, *pbeg, *pend;
6766
6767   eol_type = CODING_ID_EOL_TYPE (coding->id);
6768   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6769     return;
6770
6771   if (NILP (coding->dst_object))
6772     pbeg = coding->destination;
6773   else
6774     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6775   pend = pbeg + coding->produced;
6776
6777   if (VECTORP (eol_type))
6778     {
6779       int eol_seen = EOL_SEEN_NONE;
6780
6781       for (p = pbeg; p < pend; p++)
6782         {
6783           if (*p == '\n')
6784             eol_seen |= EOL_SEEN_LF;
6785           else if (*p == '\r')
6786             {
6787               if (p + 1 < pend && *(p + 1) == '\n')
6788                 {
6789                   eol_seen |= EOL_SEEN_CRLF;
6790                   p++;
6791                 }
6792               else
6793                 eol_seen |= EOL_SEEN_CR;
6794             }
6795         }
6796       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6797       if ((eol_seen & EOL_SEEN_CRLF) != 0
6798           && (eol_seen & EOL_SEEN_CR) != 0
6799           && (eol_seen & EOL_SEEN_LF) == 0)
6800         eol_seen = EOL_SEEN_CRLF;
6801       else if (eol_seen != EOL_SEEN_NONE
6802           && eol_seen != EOL_SEEN_LF
6803           && eol_seen != EOL_SEEN_CRLF
6804           && eol_seen != EOL_SEEN_CR)
6805         eol_seen = EOL_SEEN_LF;
6806       if (eol_seen != EOL_SEEN_NONE)
6807         eol_type = adjust_coding_eol_type (coding, eol_seen);
6808     }
6809
6810   if (EQ (eol_type, Qmac))
6811     {
6812       for (p = pbeg; p < pend; p++)
6813         if (*p == '\r')
6814           *p = '\n';
6815     }
6816   else if (EQ (eol_type, Qdos))
6817     {
6818       ptrdiff_t n = 0;
6819
6820       if (NILP (coding->dst_object))
6821         {
6822           /* Start deleting '\r' from the tail to minimize the memory
6823              movement.  */
6824           for (p = pend - 2; p >= pbeg; p--)
6825             if (*p == '\r')
6826               {
6827                 memmove (p, p + 1, pend-- - p - 1);
6828                 n++;
6829               }
6830         }
6831       else
6832         {
6833           ptrdiff_t pos_byte = coding->dst_pos_byte;
6834           ptrdiff_t pos = coding->dst_pos;
6835           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6836
6837           while (pos < pos_end)
6838             {
6839               p = BYTE_POS_ADDR (pos_byte);
6840               if (*p == '\r' && p[1] == '\n')
6841                 {
6842                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6843                   n++;
6844                   pos_end--;
6845                 }
6846               pos++;
6847               if (coding->dst_multibyte)
6848                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6849               else
6850                 pos_byte++;
6851             }
6852         }
6853       coding->produced -= n;
6854       coding->produced_char -= n;
6855     }
6856 }
6857
6858
6859 /* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
6860    exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
6861    alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).  */
6862 enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6863
6864 /* Return a translation table (or list of them) from coding system
6865    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6866    not ENCODEP). */
6867
6868 static Lisp_Object
6869 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6870 {
6871   Lisp_Object standard, translation_table;
6872   Lisp_Object val;
6873
6874   if (NILP (Venable_character_translation))
6875     {
6876       if (max_lookup)
6877         *max_lookup = 0;
6878       return Qnil;
6879     }
6880   if (encodep)
6881     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6882       standard = Vstandard_translation_table_for_encode;
6883   else
6884     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6885       standard = Vstandard_translation_table_for_decode;
6886   if (NILP (translation_table))
6887     translation_table = standard;
6888   else
6889     {
6890       if (SYMBOLP (translation_table))
6891         translation_table = Fget (translation_table, Qtranslation_table);
6892       else if (CONSP (translation_table))
6893         {
6894           translation_table = Fcopy_sequence (translation_table);
6895           for (val = translation_table; CONSP (val); val = XCDR (val))
6896             if (SYMBOLP (XCAR (val)))
6897               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6898         }
6899       if (CHAR_TABLE_P (standard))
6900         {
6901           if (CONSP (translation_table))
6902             translation_table = nconc2 (translation_table, list1 (standard));
6903           else
6904             translation_table = list2 (translation_table, standard);
6905         }
6906     }
6907
6908   if (max_lookup)
6909     {
6910       *max_lookup = 1;
6911       if (CHAR_TABLE_P (translation_table)
6912           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6913         {
6914           val = XCHAR_TABLE (translation_table)->extras[1];
6915           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6916             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6917         }
6918       else if (CONSP (translation_table))
6919         {
6920           Lisp_Object tail;
6921
6922           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6923             if (CHAR_TABLE_P (XCAR (tail))
6924                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6925               {
6926                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6927                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6928                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6929               }
6930         }
6931     }
6932   return translation_table;
6933 }
6934
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  Whenever a table maps C to a character, C is replaced
   by that character and TRANS is reset to Qnil (with a list, the
   lookup then continues in the following tables using the new C).
   Otherwise TRANS is left holding what the table returned; with a
   list, the first non-nil value of that kind stops the lookup.
   C and TRANS must be lvalues; the arguments are evaluated more than
   once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
6959
6960
6961 /* Return a translation of character(s) at BUF according to TRANS.
6962    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6963    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6964    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6965    translation is found, and Qnil if not found..
6966    If BUF is too short to lookup characters in FROM, return Qt.  */
6967
6968 static Lisp_Object
6969 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6970 {
6971
6972   if (INTEGERP (trans))
6973     return trans;
6974   for (; CONSP (trans); trans = XCDR (trans))
6975     {
6976       Lisp_Object val = XCAR (trans);
6977       Lisp_Object from = XCAR (val);
6978       ptrdiff_t len = ASIZE (from);
6979       ptrdiff_t i;
6980
6981       for (i = 0; i < len; i++)
6982         {
6983           if (buf + i == buf_end)
6984             return Qt;
6985           if (XINT (AREF (from, i)) != buf[i])
6986             break;
6987         }
6988       if (i == len)
6989         return val;
6990     }
6991   return Qnil;
6992 }
6993
6994
6995 static int
6996 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6997                bool last_block)
6998 {
6999   unsigned char *dst = coding->destination + coding->produced;
7000   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7001   ptrdiff_t produced;
7002   ptrdiff_t produced_chars = 0;
7003   int carryover = 0;
7004
7005   if (! coding->chars_at_source)
7006     {
7007       /* Source characters are in coding->charbuf.  */
7008       int *buf = coding->charbuf;
7009       int *buf_end = buf + coding->charbuf_used;
7010
7011       if (EQ (coding->src_object, coding->dst_object)
7012           && ! NILP (coding->dst_object))
7013         {
7014           eassert (growable_destination (coding));
7015           coding_set_source (coding);
7016           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7017         }
7018
7019       while (buf < buf_end)
7020         {
7021           int c = *buf;
7022           ptrdiff_t i;
7023
7024           if (c >= 0)
7025             {
7026               ptrdiff_t from_nchars = 1, to_nchars = 1;
7027               Lisp_Object trans = Qnil;
7028
7029               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7030               if (! NILP (trans))
7031                 {
7032                   trans = get_translation (trans, buf, buf_end);
7033                   if (INTEGERP (trans))
7034                     c = XINT (trans);
7035                   else if (CONSP (trans))
7036                     {
7037                       from_nchars = ASIZE (XCAR (trans));
7038                       trans = XCDR (trans);
7039                       if (INTEGERP (trans))
7040                         c = XINT (trans);
7041                       else
7042                         {
7043                           to_nchars = ASIZE (trans);
7044                           c = XINT (AREF (trans, 0));
7045                         }
7046                     }
7047                   else if (EQ (trans, Qt) && ! last_block)
7048                     break;
7049                 }
7050
7051               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7052                 {
7053                   eassert (growable_destination (coding));
7054                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7055                        / MAX_MULTIBYTE_LENGTH)
7056                       < to_nchars)
7057                     memory_full (SIZE_MAX);
7058                   dst = alloc_destination (coding,
7059                                            buf_end - buf
7060                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7061                                            dst);
7062                   if (EQ (coding->src_object, coding->dst_object))
7063                     {
7064                       coding_set_source (coding);
7065                       dst_end = (((unsigned char *) coding->source)
7066                                  + coding->consumed);
7067                     }
7068                   else
7069                     dst_end = coding->destination + coding->dst_bytes;
7070                 }
7071
7072               for (i = 0; i < to_nchars; i++)
7073                 {
7074                   if (i > 0)
7075                     c = XINT (AREF (trans, i));
7076                   if (coding->dst_multibyte
7077                       || ! CHAR_BYTE8_P (c))
7078                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7079                   else
7080                     *dst++ = CHAR_TO_BYTE8 (c);
7081                 }
7082               produced_chars += to_nchars;
7083               buf += from_nchars;
7084             }
7085           else
7086             /* This is an annotation datum.  (-C) is the length.  */
7087             buf += -c;
7088         }
7089       carryover = buf_end - buf;
7090     }
7091   else
7092     {
7093       /* Source characters are at coding->source.  */
7094       const unsigned char *src = coding->source;
7095       const unsigned char *src_end = src + coding->consumed;
7096
7097       if (EQ (coding->dst_object, coding->src_object))
7098         {
7099           eassert (growable_destination (coding));
7100           dst_end = (unsigned char *) src;
7101         }
7102       if (coding->src_multibyte != coding->dst_multibyte)
7103         {
7104           if (coding->src_multibyte)
7105             {
7106               bool multibytep = 1;
7107               ptrdiff_t consumed_chars = 0;
7108
7109               while (1)
7110                 {
7111                   const unsigned char *src_base = src;
7112                   int c;
7113
7114                   ONE_MORE_BYTE (c);
7115                   if (dst == dst_end)
7116                     {
7117                       eassert (growable_destination (coding));
7118                       if (EQ (coding->src_object, coding->dst_object))
7119                         dst_end = (unsigned char *) src;
7120                       if (dst == dst_end)
7121                         {
7122                           ptrdiff_t offset = src - coding->source;
7123
7124                           dst = alloc_destination (coding, src_end - src + 1,
7125                                                    dst);
7126                           dst_end = coding->destination + coding->dst_bytes;
7127                           coding_set_source (coding);
7128                           src = coding->source + offset;
7129                           src_end = coding->source + coding->consumed;
7130                           if (EQ (coding->src_object, coding->dst_object))
7131                             dst_end = (unsigned char *) src;
7132                         }
7133                     }
7134                   *dst++ = c;
7135                   produced_chars++;
7136                 }
7137             no_more_source:
7138               ;
7139             }
7140           else
7141             while (src < src_end)
7142               {
7143                 bool multibytep = 1;
7144                 int c = *src++;
7145
7146                 if (dst >= dst_end - 1)
7147                   {
7148                     eassert (growable_destination (coding));
7149                     if (EQ (coding->src_object, coding->dst_object))
7150                       dst_end = (unsigned char *) src;
7151                     if (dst >= dst_end - 1)
7152                       {
7153                         ptrdiff_t offset = src - coding->source;
7154                         ptrdiff_t more_bytes;
7155
7156                         if (EQ (coding->src_object, coding->dst_object))
7157                           more_bytes = ((src_end - src) / 2) + 2;
7158                         else
7159                           more_bytes = src_end - src + 2;
7160                         dst = alloc_destination (coding, more_bytes, dst);
7161                         dst_end = coding->destination + coding->dst_bytes;
7162                         coding_set_source (coding);
7163                         src = coding->source + offset;
7164                         src_end = coding->source + coding->consumed;
7165                         if (EQ (coding->src_object, coding->dst_object))
7166                           dst_end = (unsigned char *) src;
7167                       }
7168                   }
7169                 EMIT_ONE_BYTE (c);
7170               }
7171         }
7172       else
7173         {
7174           if (!EQ (coding->src_object, coding->dst_object))
7175             {
7176               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7177
7178               if (require > 0)
7179                 {
7180                   ptrdiff_t offset = src - coding->source;
7181
7182                   dst = alloc_destination (coding, require, dst);
7183                   coding_set_source (coding);
7184                   src = coding->source + offset;
7185                   src_end = coding->source + coding->consumed;
7186                 }
7187             }
7188           produced_chars = coding->consumed_char;
7189           while (src < src_end)
7190             *dst++ = *src++;
7191         }
7192     }
7193
7194   produced = dst - (coding->destination + coding->produced);
7195   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7196     insert_from_gap (produced_chars, produced, 0);
7197   coding->produced += produced;
7198   coding->produced_char += produced_chars;
7199   return carryover;
7200 }
7201
7202 /* Compose text in CODING->object according to the annotation data at
7203    CHARBUF.  CHARBUF is an array:
7204      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7205  */
7206
7207 static void
7208 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7209 {
7210   int len;
7211   ptrdiff_t to;
7212   enum composition_method method;
7213   Lisp_Object components;
7214
7215   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7216   to = pos + charbuf[2];
7217   method = (enum composition_method) (charbuf[4]);
7218
7219   if (method == COMPOSITION_RELATIVE)
7220     components = Qnil;
7221   else
7222     {
7223       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7224       int i, j;
7225
7226       if (method == COMPOSITION_WITH_RULE)
7227         len = charbuf[2] * 3 - 2;
7228       charbuf += MAX_ANNOTATION_LENGTH;
7229       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7230       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7231         {
7232           if (charbuf[i] >= 0)
7233             args[j] = make_number (charbuf[i]);
7234           else
7235             {
7236               i++;
7237               args[j] = make_number (charbuf[i] % 0x100);
7238             }
7239         }
7240       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7241     }
7242   compose_text (pos, to, components, Qnil, coding->dst_object);
7243 }
7244
7245
7246 /* Put `charset' property on text in CODING->object according to
7247    the annotation data at CHARBUF.  CHARBUF is an array:
7248      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7249  */
7250
7251 static void
7252 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7253 {
7254   ptrdiff_t from = pos - charbuf[2];
7255   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7256
7257   Fput_text_property (make_number (from), make_number (pos),
7258                       Qcharset, CHARSET_NAME (charset),
7259                       coding->dst_object);
7260 }
7261
/* Upper bound on the number of units allocated for coding->charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many units decoding functions expect in coding->charbuf at
   most.  Currently, decode_coding_emacs_mule expects the following
   size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size in CODING->charbuf_size.  The area holds SIZE units
   plus MAX_CHARBUF_EXTRA_SIZE, capped at MAX_CHARBUF_SIZE.  The
   enclosing function must have used USE_SAFE_ALLOCA.  CODING is
   parenthesized so that any pointer expression may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7275
7276 static void
7277 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7278 {
7279   int *charbuf = coding->charbuf;
7280   int *charbuf_end = charbuf + coding->charbuf_used;
7281
7282   if (NILP (coding->dst_object))
7283     return;
7284
7285   while (charbuf < charbuf_end)
7286     {
7287       if (*charbuf >= 0)
7288         pos++, charbuf++;
7289       else
7290         {
7291           int len = -*charbuf;
7292
7293           if (len > 2)
7294             switch (charbuf[1])
7295               {
7296               case CODING_ANNOTATE_COMPOSITION_MASK:
7297                 produce_composition (coding, charbuf, pos);
7298                 break;
7299               case CODING_ANNOTATE_CHARSET_MASK:
7300                 produce_charset (coding, charbuf, pos);
7301                 break;
7302               default:
7303                 break;
7304               }
7305           charbuf += len;
7306         }
7307     }
7308 }
7309
7310 /* Decode the data at CODING->src_object into CODING->dst_object.
7311    CODING->src_object is a buffer, a string, or nil.
7312    CODING->dst_object is a buffer.
7313
7314    If CODING->src_object is a buffer, it must be the current buffer.
7315    In this case, if CODING->src_pos is positive, it is a position of
7316    the source text in the buffer, otherwise, the source text is in the
7317    gap area of the buffer, and CODING->src_pos specifies the offset of
7318    the text from GPT (which must be the same as PT).  If this is the
7319    same buffer as CODING->dst_object, CODING->src_pos must be
7320    negative.
7321
7322    If CODING->src_object is a string, CODING->src_pos is an index to
7323    that string.
7324
7325    If CODING->src_object is nil, CODING->source must already point to
7326    the non-relocatable memory area.  In this case, CODING->src_pos is
7327    an offset from CODING->source.
7328
7329    The decoded data is inserted at the current point of the buffer
7330    CODING->dst_object.

        The consumed/produced counters of CODING are reset on entry and
        updated as decoding proceeds.  Source bytes that the decoder
        leaves unconsumed are either flushed as binary characters (when
        CODING_MODE_LAST_BLOCK is set in CODING->mode) or saved in
        CODING->carryover (at most sizeof CODING->carryover bytes); in
        both cases CODING->consumed is then set to CODING->src_bytes.
        All buffer-specific steps below are guarded by BUFFERP
        (CODING->dst_object).
7331 */
7332
7333 static void
7334 decode_coding (struct coding_system *coding)
7335 {
7336   Lisp_Object attrs;
7337   Lisp_Object undo_list;
7338   Lisp_Object translation_table;
7339   struct ccl_spec cclspec;
7340   int carryover;
7341   int i;
7342
7343   USE_SAFE_ALLOCA;
7344
       /* If the source text of a buffer straddles the gap, move the gap
          to the start of the source text.  */
7345   if (BUFFERP (coding->src_object)
7346       && coding->src_pos > 0
7347       && coding->src_pos < GPT
7348       && coding->src_pos + coding->src_chars > GPT)
7349     move_gap_both (coding->src_pos, coding->src_pos_byte);
7350
       /* Placeholder; replaced by the buffer's real undo list below when
          the destination is a buffer.  */
7351   undo_list = Qt;
7352   if (BUFFERP (coding->dst_object))
7353     {
7354       set_buffer_internal (XBUFFER (coding->dst_object));
7355       if (GPT != PT)
7356         move_gap_both (PT, PT_BYTE);
7357
7358       /* We must disable undo_list in order to record the whole insert
7359          transaction via record_insert at the end.  But doing so also
7360          disables the recording of the first change to the undo_list.
7361          Therefore we check for first change here and record it via
7362          record_first_change if needed.  */
7363       if (MODIFF <= SAVE_MODIFF)
7364         record_first_change ();
7365
7366       undo_list = BVAR (current_buffer, undo_list);
7367       bset_undo_list (current_buffer, Qt);
7368     }
7369
       /* Start from a clean conversion state.  */
7370   coding->consumed = coding->consumed_char = 0;
7371   coding->produced = coding->produced_char = 0;
7372   coding->chars_at_source = 0;
7373   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7374
       /* Allocate coding->charbuf, sized from the number of source
          bytes (capped by the macro).  */
7375   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);
7376
7377   attrs = CODING_ID_ATTRS (coding->id);
7378   translation_table = get_translation_table (attrs, 0, NULL);
7379
7380   carryover = 0;
       /* The CCL decoder keeps its program state in a spec that lives on
          this function's stack.  */
7381   if (coding->decoder == decode_coding_ccl)
7382     {
7383       coding->spec.ccl = &cclspec;
7384       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7385     }
       /* Each iteration: the decoder fills coding->charbuf (after the
          CARRYOVER elements kept from the previous round), produce_chars
          writes them to the destination and returns how many trailing
          elements of charbuf it left unprocessed; those are moved to the
          front of charbuf for the next round.  */
7386   do
7387     {
7388       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7389
7390       coding_set_source (coding);
7391       coding->annotated = 0;
7392       coding->charbuf_used = carryover;
7393       (*(coding->decoder)) (coding);
7394       coding_set_destination (coding);
7395       carryover = produce_chars (coding, translation_table, 0);
7396       if (coding->annotated)
7397         produce_annotation (coding, pos);
7398       for (i = 0; i < carryover; i++)
7399         coding->charbuf[i]
7400           = coding->charbuf[coding->charbuf_used - carryover + i];
7401     }
       /* Continue while the destination was too small, or while source
          bytes remain and the last round ended in success or merely hit
          invalid source.  */
7402   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7403          || (coding->consumed < coding->src_bytes
7404              && (coding->result == CODING_RESULT_SUCCESS
7405                  || coding->result == CODING_RESULT_INVALID_SRC)));
7406
       /* Flush the elements still carried over.  NOTE(review): the final
          argument 1 presumably marks this as the last block -- confirm
          against produce_chars.  */
7407   if (carryover > 0)
7408     {
7409       coding_set_destination (coding);
7410       coding->charbuf_used = carryover;
7411       produce_chars (coding, translation_table, 1);
7412     }
7413
7414   coding->carryover_bytes = 0;
       /* Handle source bytes the decoder did not consume.  */
7415   if (coding->consumed < coding->src_bytes)
7416     {
7417       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7418       const unsigned char *src;
7419
7420       coding_set_source (coding);
7421       coding_set_destination (coding);
7422       src = coding->source + coding->consumed;
7423
7424       if (coding->mode & CODING_MODE_LAST_BLOCK)
7425         {
7426           /* Flush out unprocessed data as binary chars.  We are sure
7427              that the number of data is less than the size of
7428              coding->charbuf.  */
7429           coding->charbuf_used = 0;
7430           coding->chars_at_source = 0;
7431
7432           while (nbytes-- > 0)
7433             {
7434               int c = *src++;
7435
                   /* Bytes with the high bit set become raw 8-bit
                      characters.  */
7436               if (c & 0x80)
7437                 c = BYTE8_TO_CHAR (c);
7438               coding->charbuf[coding->charbuf_used++] = c;
7439             }
7440           produce_chars (coding, Qnil, 1);
7441         }
7442       else
7443         {
7444           /* Record unprocessed bytes in coding->carryover.  We are
7445              sure that the number of data is less than the size of
7446              coding->carryover.  */
7447           unsigned char *p = coding->carryover;
7448
               /* Defensive clamp: anything beyond the carryover array is
                  not saved.  */
7449           if (nbytes > sizeof coding->carryover)
7450             nbytes = sizeof coding->carryover;
7451           coding->carryover_bytes = nbytes;
7452           while (nbytes-- > 0)
7453             *p++ = *src++;
7454         }
7455       coding->consumed = coding->src_bytes;
7456     }
7457
       /* Convert end-of-line sequences unless the coding system's EOL
          type is unix or EOL conversion is inhibited.  */
7458   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7459       && !inhibit_eol_conversion)
7460     decode_eol (coding);
       /* Re-enable undo recording and record the whole insertion as a
          single change.  */
7461   if (BUFFERP (coding->dst_object))
7462     {
7463       bset_undo_list (current_buffer, undo_list);
7464       record_insert (coding->dst_pos, coding->produced_char);
7465     }
7466
7467   SAFE_FREE ();
7468 }
7469
7470
7471 /* Extract an annotation datum from a composition starting at POS and
7472    ending before LIMIT of CODING->src_object (buffer or string), store
7473    the data in BUF, set *STOP to a starting position of the next
7474    composition (if any) or to LIMIT, and return the address of the
7475    next element of BUF.
7476
7477    If such an annotation is not found, set *STOP to a starting
7478    position of a composition after POS (if any) or to LIMIT, and
7479    return BUF.  */
7480
7481 static int *
7482 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7483                                struct coding_system *coding, int *buf,
7484                                ptrdiff_t *stop)
7485 {
7486   ptrdiff_t start, end;
7487   Lisp_Object prop;
7488
7489   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7490       || end > limit)
7491     *stop = limit;
7492   else if (start > pos)
7493     *stop = start;
7494   else
7495     {
7496       if (start == pos)
7497         {
7498           /* We found a composition.  Store the corresponding
7499              annotation data in BUF.  */
7500           int *head = buf;
7501           enum composition_method method = composition_method (prop);
7502           int nchars = COMPOSITION_LENGTH (prop);
7503
7504           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7505           if (method != COMPOSITION_RELATIVE)
7506             {
7507               Lisp_Object components;
7508               ptrdiff_t i, len, i_byte;
7509
7510               components = COMPOSITION_COMPONENTS (prop);
7511               if (VECTORP (components))
7512                 {
7513                   len = ASIZE (components);
7514                   for (i = 0; i < len; i++)
7515                     *buf++ = XINT (AREF (components, i));
7516                 }
7517               else if (STRINGP (components))
7518                 {
7519                   len = SCHARS (components);
7520                   i = i_byte = 0;
7521                   while (i < len)
7522                     {
7523                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7524                       buf++;
7525                     }
7526                 }
7527               else if (INTEGERP (components))
7528                 {
7529                   len = 1;
7530                   *buf++ = XINT (components);
7531                 }
7532               else if (CONSP (components))
7533                 {
7534                   for (len = 0; CONSP (components);
7535                        len++, components = XCDR (components))
7536                     *buf++ = XINT (XCAR (components));
7537                 }
7538               else
7539                 emacs_abort ();
7540               *head -= len;
7541             }
7542         }
7543
7544       if (find_composition (end, limit, &start, &end, &prop,
7545                             coding->src_object)
7546           && end <= limit)
7547         *stop = start;
7548       else
7549         *stop = limit;
7550     }
7551   return buf;
7552 }
7553
7554
7555 /* Extract an annotation datum from a text property `charset' at POS of
7556    CODING->src_object (buffer of string), store the data in BUF, set
7557    *STOP to the position where the value of `charset' property changes
7558    (limiting by LIMIT), and return the address of the next element of
7559    BUF.
7560
7561    If the property value is nil, set *STOP to the position where the
7562    property value is non-nil (limiting by LIMIT), and return BUF.  */
7563
7564 static int *
7565 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7566                            struct coding_system *coding, int *buf,
7567                            ptrdiff_t *stop)
7568 {
7569   Lisp_Object val, next;
7570   int id;
7571
7572   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7573   if (! NILP (val) && CHARSETP (val))
7574     id = XINT (CHARSET_SYMBOL_ID (val));
7575   else
7576     id = -1;
7577   ADD_CHARSET_DATA (buf, 0, id);
7578   next = Fnext_single_property_change (make_number (pos), Qcharset,
7579                                        coding->src_object,
7580                                        make_number (limit));
7581   *stop = XINT (next);
7582   return buf;
7583 }
7584
7585
/* Read characters from CODING->source, continuing after the
   CODING->consumed bytes already read, and store them in
   CODING->charbuf for the encoder.  `\r' is mapped to `\n' under
   CODING_MODE_SELECTIVE_DISPLAY, end-of-line conversion is applied,
   and, if TRANSLATION_TABLE is non-nil, characters are translated
   through it.  MAX_LOOKUP is the number of ints allocated for the
   look-ahead buffer handed to get_translation.  Charset annotation
   data are added where CODING_ANNOTATE_CHARSET_MASK requests them.

   The loop stops when the end of the source is reached, when charbuf
   (less some reserved slack) is full, or when a translation result
   cannot be stored.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated and
   CODING->chars_at_source is cleared.  */
7586 static void
7587 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7588                int max_lookup)
7589 {
7590   int *buf = coding->charbuf;
7591   int *buf_end = coding->charbuf + coding->charbuf_size;
7592   const unsigned char *src = coding->source + coding->consumed;
7593   const unsigned char *src_end = coding->source + coding->src_bytes;
7594   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7595   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7596   bool multibytep = coding->src_multibyte;
7597   Lisp_Object eol_type;
7598   int c;
7599   ptrdiff_t stop, stop_composition, stop_charset;
7600   int *lookup_buf = NULL;
7601
       /* The look-ahead buffer is only needed when translating; it is
          taken from the stack.  */
7602   if (! NILP (translation_table))
7603     lookup_buf = alloca (sizeof (int) * max_lookup);
7604
       /* Treat the EOL type as unix when conversion is inhibited or when
          the type is a vector (presumably a not-yet-decided EOL type --
          confirm).  */
7605   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7606   if (VECTORP (eol_type))
7607     eol_type = Qunix;
7608
7609   /* Note: composition handling is not yet implemented.  */
       /* Clearing the flag makes the composition test below always
          choose END_POS for STOP_COMPOSITION.  */
7610   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7611
       /* STOP is the next position at which annotation handling must
          run; with no source object there is nothing to annotate.  */
7612   if (NILP (coding->src_object))
7613     stop = stop_composition = stop_charset = end_pos;
7614   else
7615     {
7616       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7617         stop = stop_composition = pos;
7618       else
7619         stop = stop_composition = end_pos;
7620       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7621         stop = stop_charset = pos;
7622       else
7623         stop_charset = end_pos;
7624     }
7625
7626   /* Compensate for CRLF and conversion.  */
       /* I.e. keep slack at the end of charbuf so that one iteration may
          store more than one element (an extra `\r', annotation
          data).  */
7627   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7628   while (buf < buf_end)
7629     {
7630       Lisp_Object trans;
7631
7632       if (pos == stop)
7633         {
7634           if (pos == end_pos)
7635             break;
7636           if (pos == stop_composition)
7637             buf = handle_composition_annotation (pos, end_pos, coding,
7638                                                  buf, &stop_composition);
7639           if (pos == stop_charset)
7640             buf = handle_charset_annotation (pos, end_pos, coding,
7641                                              buf, &stop_charset);
               /* Next stop is whichever annotation boundary comes
                  first.  */
7642           stop = (stop_composition < stop_charset
7643                   ? stop_composition : stop_charset);
7644         }
7645
           /* Fetch the next character C and advance SRC and POS.  For a
              unibyte source the raw-text and CCL encoders take each byte
              as is; otherwise a valid multibyte sequence is read as one
              character and any other byte becomes a raw 8-bit
              character.  */
7646       if (! multibytep)
7647         {
7648           int bytes;
7649
7650           if (coding->encoder == encode_coding_raw_text
7651               || coding->encoder == encode_coding_ccl)
7652             c = *src++, pos++;
7653           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7654             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7655           else
7656             c = BYTE8_TO_CHAR (*src), src++, pos++;
7657         }
7658       else
7659         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7660       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7661         c = '\n';
           /* End-of-line conversion: for dos store `\r' ahead of the
              `\n'; for any other non-unix type replace `\n' by `\r'.  */
7662       if (! EQ (eol_type, Qunix))
7663         {
7664           if (c == '\n')
7665             {
7666               if (EQ (eol_type, Qdos))
7667                 *buf++ = '\r';
7668               else
7669                 c = '\r';
7670             }
7671         }
7672
7673       trans = Qnil;
7674       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7675       if (NILP (trans))
7676         *buf++ = c;
7677       else
7678         {
7679           ptrdiff_t from_nchars = 1, to_nchars = 1;
7680           int *lookup_buf_end;
7681           const unsigned char *p = src;
7682           int i;
7683
               /* Collect C plus up to MAX_LOOKUP - 1 following characters
                  (without consuming them) for a multi-character
                  lookup.  */
7684           lookup_buf[0] = c;
7685           for (i = 1; i < max_lookup && p < src_end; i++)
7686             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7687           lookup_buf_end = lookup_buf + i;
7688           trans = get_translation (trans, lookup_buf, lookup_buf_end);
               /* TRANS is either a single character, or a cons whose car
                  is a vector (its size is the number of source characters
                  matched) and whose cdr is a character or a vector of
                  replacement characters.  */
7689           if (INTEGERP (trans))
7690             c = XINT (trans);
7691           else if (CONSP (trans))
7692             {
7693               from_nchars = ASIZE (XCAR (trans));
7694               trans = XCDR (trans);
7695               if (INTEGERP (trans))
7696                 c = XINT (trans);
7697               else
7698                 {
7699                   to_nchars = ASIZE (trans);
                       /* NOTE(review): C has already been consumed from
                          SRC here, so leaving the loop appears to drop
                          it -- confirm.  */
7700                   if (buf_end - buf < to_nchars)
7701                     break;
7702                   c = XINT (AREF (trans, 0));
7703                 }
7704             }
7705           else
                 /* NOTE(review): same concern as above -- C was consumed
                    but is not stored.  */
7706             break;
7707           *buf++ = c;
7708           for (i = 1; i < to_nchars; i++)
7709             *buf++ = XINT (AREF (trans, i));
               /* Skip the additional source characters covered by the
                  match.  */
7710           for (i = 1; i < from_nchars; i++, pos++)
7711             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7712         }
7713     }
7714
7715   coding->consumed = src - coding->source;
7716   coding->consumed_char = pos - coding->src_pos;
7717   coding->charbuf_used = buf - coding->charbuf;
7718   coding->chars_at_source = 0;
7719 }
7720
7721
7722 /* Encode the text at CODING->src_object into CODING->dst_object.
7723    CODING->src_object is a buffer or a string.
7724    CODING->dst_object is a buffer or nil.
7725
7726    If CODING->src_object is a buffer, it must be the current buffer.
7727    In this case, if CODING->src_pos is positive, it is a position of
7728    the source text in the buffer, otherwise. the source text is in the
7729    gap area of the buffer, and coding->src_pos specifies the offset of
7730    the text from GPT (which must be the same as PT).  If this is the
7731    same buffer as CODING->dst_object, CODING->src_pos must be
7732    negative and CODING should not have `pre-write-conversion'.
7733
7734    If CODING->src_object is a string, CODING should not have
7735    `pre-write-conversion'.
7736
7737    If CODING->dst_object is a buffer, the encoded data is inserted at
7738    the current point of that buffer.
7739
7740    If CODING->dst_object is nil, the encoded data is placed at the
7741    memory area specified by CODING->destination.  */
7742
7743 static void
7744 encode_coding (struct coding_system *coding)
7745 {
7746   Lisp_Object attrs;
7747   Lisp_Object translation_table;
7748   int max_lookup;
7749   struct ccl_spec cclspec;
7750
7751   USE_SAFE_ALLOCA;
7752
7753   attrs = CODING_ID_ATTRS (coding->id);
7754   if (coding->encoder == encode_coding_raw_text)
7755     translation_table = Qnil, max_lookup = 0;
7756   else
7757     translation_table = get_translation_table (attrs, 1, &max_lookup);
7758
7759   if (BUFFERP (coding->dst_object))
7760     {
7761       set_buffer_internal (XBUFFER (coding->dst_object));
7762       coding->dst_multibyte
7763         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7764     }
7765
7766   coding->consumed = coding->consumed_char = 0;
7767   coding->produced = coding->produced_char = 0;
7768   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7769
7770   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7771
7772   if (coding->encoder == encode_coding_ccl)
7773     {
7774       coding->spec.ccl = &cclspec;
7775       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7776     }
7777   do {
7778     coding_set_source (coding);
7779     consume_chars (coding, translation_table, max_lookup);
7780     coding_set_destination (coding);
7781     (*(coding->encoder)) (coding);
7782   } while (coding->consumed_char < coding->src_chars);
7783
7784   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7785     insert_from_gap (coding->produced_char, coding->produced, 0);
7786
7787   SAFE_FREE ();
7788 }
7789
7790
7791 /* Name (or base name) of work buffer for code conversion.  */
7792 static Lisp_Object Vcode_conversion_workbuf_name;
7793
7794 /* A working buffer used by the top level conversion.  Once it is
7795    created, it is never destroyed.  It has the name
7796    Vcode_conversion_workbuf_name.  The other working buffers are
7797    destroyed after the use is finished, and their names are modified
7798    versions of Vcode_conversion_workbuf_name.  */
7799 static Lisp_Object Vcode_conversion_reused_workbuf;
7800
7801 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
     /* It is set by make_conversion_work_buffer when that function hands
        out the reused buffer, and cleared by code_conversion_restore.  */
7802 static bool reused_workbuf_in_use;
7803
7804
7805 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7806    multibyteness of returning buffer.  */
7807
7808 static Lisp_Object
7809 make_conversion_work_buffer (bool multibyte)
7810 {
7811   Lisp_Object name, workbuf;
7812   struct buffer *current;
7813
7814   if (reused_workbuf_in_use)
7815     {
7816       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7817       workbuf = Fget_buffer_create (name);
7818     }
7819   else
7820     {
7821       reused_workbuf_in_use = 1;
7822       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7823         Vcode_conversion_reused_workbuf
7824           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7825       workbuf = Vcode_conversion_reused_workbuf;
7826     }
7827   current = current_buffer;
7828   set_buffer_internal (XBUFFER (workbuf));
7829   /* We can't allow modification hooks to run in the work buffer.  For
7830      instance, directory_files_internal assumes that file decoding
7831      doesn't compile new regexps.  */
7832   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7833   Ferase_buffer ();
7834   bset_undo_list (current_buffer, Qt);
7835   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7836   set_buffer_internal (current);
7837   return workbuf;
7838 }
7839
7840
7841 static void
7842 code_conversion_restore (Lisp_Object arg)
7843 {
7844   Lisp_Object current, workbuf;
7845
7846   current = XCAR (arg);
7847   workbuf = XCDR (arg);
7848   if (! NILP (workbuf))
7849     {
7850       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7851         reused_workbuf_in_use = 0;
7852       else
7853         Fkill_buffer (workbuf);
7854     }
7855   set_buffer_internal (XBUFFER (current));
7856 }
7857
7858 Lisp_Object
7859 code_conversion_save (bool with_work_buf, bool multibyte)
7860 {
7861   Lisp_Object workbuf = Qnil;
7862
7863   if (with_work_buf)
7864     workbuf = make_conversion_work_buffer (multibyte);
7865   record_unwind_protect (code_conversion_restore,
7866                          Fcons (Fcurrent_buffer (), workbuf));
7867   return workbuf;
7868 }
7869
/* Decode, using CODING, the BYTES bytes (CHARS characters) of text
   that sit at the end of the gap of the current buffer, and insert
   the result at point of that same buffer.

   When the text needs no real decoding (see the conditions below),
   only end-of-line conversion is performed, in place, and the bytes
   are adopted with insert_from_gap.  Otherwise the text is decoded by
   decode_coding with CODING_MODE_LAST_BLOCK set, and the coding
   system's post-read function, if any, is called.

   CODING->produced and CODING->produced_char are set to the size of
   the inserted text.  */
7870 void
7871 decode_coding_gap (struct coding_system *coding,
7872                    ptrdiff_t chars, ptrdiff_t bytes)
7873 {
7874   ptrdiff_t count = SPECPDL_INDEX ();
7875   Lisp_Object attrs;
7876
       /* Source and destination are both the current buffer; negative
          source positions denote text in the gap (see decode_coding).  */
7877   coding->src_object = Fcurrent_buffer ();
7878   coding->src_chars = chars;
7879   coding->src_bytes = bytes;
7880   coding->src_pos = -chars;
7881   coding->src_pos_byte = -bytes;
       /* Fewer characters than bytes means the source is multibyte.  */
7882   coding->src_multibyte = chars < bytes;
7883   coding->dst_object = coding->src_object;
7884   coding->dst_pos = PT;
7885   coding->dst_pos_byte = PT_BYTE;
7886   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7887
       /* Reset what detection may record: -1 / EOL_SEEN_NONE mean "not
          determined yet".  */
7888   coding->head_ascii = -1;
7889   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7890   coding->eol_seen = EOL_SEEN_NONE;
7891   if (CODING_REQUIRE_DETECTION (coding))
7892     detect_coding (coding);
7893   attrs = CODING_ID_ATTRS (coding->id);
       /* Fast path: a unibyte source under an ASCII-compatible coding
          system with no post-read function and no translation table.  */
7894   if (! disable_ascii_optimization
7895       && ! coding->src_multibyte
7896       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7897       && NILP (CODING_ATTR_POST_READ (attrs))
7898       && NILP (get_translation_table (attrs, 0, NULL)))
7899     {
           /* Use the count found during detection, if any; otherwise
              scan now.  */
7900       chars = coding->head_ascii;
7901       if (chars < 0)
7902         chars = check_ascii (coding);
7903       if (chars != bytes)
7904         {
7905           /* There exists a non-ASCII byte.  */
               /* The fast path still applies if the coding system is
                  utf-8 and the whole source was detected as UTF-8.  */
7906           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7907               && coding->detected_utf8_bytes == coding->src_bytes)
7908             {
7909               if (coding->detected_utf8_chars >= 0)
7910                 chars = coding->detected_utf8_chars;
7911               else
7912                 chars = check_utf_8 (coding);
                   /* Drop a leading BOM unless the coding system is
                      defined as BOM-less.  The source is addressed from
                      the end of the gap, so shrinking the byte counts
                      skips the first three bytes.  */
7913               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7914                   && coding->head_ascii == 0
7915                   && coding->source[0] == UTF_8_BOM_1
7916                   && coding->source[1] == UTF_8_BOM_2
7917                   && coding->source[2] == UTF_8_BOM_3)
7918                 {
7919                   chars--;
7920                   bytes -= 3;
7921                   coding->src_bytes -= 3;
7922                 }
7923             }
7924           else
                 /* Not eligible for the fast path.  */
7925             chars = -1;
7926         }
7927       if (chars >= 0)
7928         {
7929           Lisp_Object eol_type;
7930
7931           eol_type = CODING_ID_EOL_TYPE (coding->id);
               /* A vector EOL type is resolved from the EOL format seen,
                  if one was seen.  */
7932           if (VECTORP (eol_type))
7933             {
7934               if (coding->eol_seen != EOL_SEEN_NONE)
7935                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7936             }
7937           if (EQ (eol_type, Qmac))
7938             {
                   /* Replace every `\r' by `\n' in place.  */
7939               unsigned char *src_end = GAP_END_ADDR;
7940               unsigned char *src = src_end - coding->src_bytes;
7941
7942               while (src < src_end)
7943                 {
7944                   if (*src++ == '\r')
7945                     src[-1] = '\n';
7946                 }
7947             }
7948           else if (EQ (eol_type, Qdos))
7949             {
                   /* Copy backwards from the end of the gap, dropping
                      the `\r' of each `\r\n' pair; the text stays flush
                      with the end of the gap.  */
7950               unsigned char *src = GAP_END_ADDR;
7951               unsigned char *src_beg = src - coding->src_bytes;
7952               unsigned char *dst = src;
7953               ptrdiff_t diff;
7954
7955               while (src_beg < src)
7956                 {
7957                   *--dst = *--src;
7958                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7959                     src--;
7960                 }
                   /* DIFF is the number of `\r' bytes removed.  */
7961               diff = dst - src;
7962               bytes -= diff;
7963               chars -= diff;
7964             }
7965           coding->produced = bytes;
7966           coding->produced_char = chars;
7967           insert_from_gap (chars, bytes, 1);
7968           return;
7969         }
7970     }
       /* General path.  Register an unwind handler that restores the
          current buffer; no work buffer is requested.  */
7971   code_conversion_save (0, 0);
7972
7973   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* The inhibit_shrinking flag is raised only for the duration of
          the decoding.  */
7974   current_buffer->text->inhibit_shrinking = 1;
7975   decode_coding (coding);
7976   current_buffer->text->inhibit_shrinking = 0;
7977
7978   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7979     {
7980       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7981       Lisp_Object val;
7982
           /* Call the post-read function at the start of the decoded
              text with the number of decoded characters, then account
              for any change it made to the buffer size.  */
7983       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7984       val = call1 (CODING_ATTR_POST_READ (attrs),
7985                    make_number (coding->produced_char));
7986       CHECK_NATNUM (val);
7987       coding->produced_char += Z - prev_Z;
7988       coding->produced += Z_BYTE - prev_Z_BYTE;
7989     }
7990
7991   unbind_to (count, Qnil);
7992 }
7993
7994
7995 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7996    SRC_OBJECT into DST_OBJECT by coding context CODING.
7997
7998    SRC_OBJECT is a buffer, a string, or Qnil.
7999
8000    If it is a buffer, the text is at point of the buffer.  FROM and TO
8001    are positions in the buffer.
8002
8003    If it is a string, the text is at the beginning of the string.
8004    FROM and TO are indices to the string.
8005
8006    If it is nil, the text is at coding->source.  FROM and TO are
8007    indices to coding->source.
8008
8009    DST_OBJECT is a buffer, Qt, or Qnil.
8010
8011    If it is a buffer, the decoded text is inserted at point of the
8012    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8013    is deleted.
8014
8015    If it is Qt, a string is made from the decoded text, and
8016    set in CODING->dst_object.
8017
8018    If it is Qnil, the decoded text is stored at CODING->destination.
8019    The caller must allocate CODING->dst_bytes bytes at
8020    CODING->destination by xmalloc.  If the decoded text is longer than
8021    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8022  */
8023
8024 void
8025 decode_coding_object (struct coding_system *coding,
8026                       Lisp_Object src_object,
8027                       ptrdiff_t from, ptrdiff_t from_byte,
8028                       ptrdiff_t to, ptrdiff_t to_byte,
8029                       Lisp_Object dst_object)
8030 {
8031   ptrdiff_t count = SPECPDL_INDEX ();
8032   unsigned char *destination IF_LINT (= NULL);
8033   ptrdiff_t dst_bytes IF_LINT (= 0);
8034   ptrdiff_t chars = to - from;
8035   ptrdiff_t bytes = to_byte - from_byte;
8036   Lisp_Object attrs;
       /* saved_pt stays negative unless point is saved for in-place
          decoding below; the sign is later used as the "restore point"
          flag.  */
8037   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8038   bool need_marker_adjustment = 0;
8039   Lisp_Object old_deactivate_mark;
8040
       /* Remember Vdeactivate_mark; it is put back just before
          returning.  */
8041   old_deactivate_mark = Vdeactivate_mark;
8042
       /* Raw-memory destination: keep the caller's buffer and its size
          for the copy-out step after decoding.  */
8043   if (NILP (dst_object))
8044     {
8045       destination = coding->destination;
8046       dst_bytes = coding->dst_bytes;
8047     }
8048
8049   coding->src_object = src_object;
8050   coding->src_chars = chars;
8051   coding->src_bytes = bytes;
       /* The source is flagged multibyte exactly when it has more bytes
          than characters.  */
8052   coding->src_multibyte = chars < bytes;
8053
8054   if (STRINGP (src_object))
8055     {
8056       coding->src_pos = from;
8057       coding->src_pos_byte = from_byte;
8058     }
8059   else if (BUFFERP (src_object))
8060     {
8061       set_buffer_internal (XBUFFER (src_object));
           /* Put the gap at FROM unless it is already there.  */
8062       if (from != GPT)
8063         move_gap_both (from, from_byte);
8064       if (EQ (src_object, dst_object))
8065         {
8066           struct Lisp_Marker *tail;
8067
               /* In-place decoding.  Flag every marker that sits on the
                  relevant edge of the region (FROM for markers with
                  insertion_type set, TO for the others) so it can be
                  repositioned once the decoded length is known.  */
8068           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8069             {
8070               tail->need_adjustment
8071                 = tail->charpos == (tail->insertion_type ? from : to);
8072               need_marker_adjustment |= tail->need_adjustment;
8073             }
8074           saved_pt = PT, saved_pt_byte = PT_BYTE;
8075           TEMP_SET_PT_BOTH (from, from_byte);
               /* NOTE(review): inhibit_shrinking presumably keeps the
                  deleted bytes available in the gap so they can still
                  be read as the source; the negative src_pos values
                  below presumably encode "source lies in the gap" --
                  confirm against the decoder's source setup.  */
8076           current_buffer->text->inhibit_shrinking = 1;
8077           del_range_both (from, from_byte, to, to_byte, 1);
8078           coding->src_pos = -chars;
8079           coding->src_pos_byte = -bytes;
8080         }
8081       else
8082         {
8083           coding->src_pos = from;
8084           coding->src_pos_byte = from_byte;
8085         }
8086     }
8087
       /* Run detection first when CODING asks for it; ATTRS is read
          only afterwards.  */
8088   if (CODING_REQUIRE_DETECTION (coding))
8089     detect_coding (coding);
8090   attrs = CODING_ID_ATTRS (coding->id);
8091
       /* A string result, or a raw destination combined with a
          post-read function, is produced in the object returned by
          code_conversion_save (1, ...), starting at BEG (later code
          treats that object as a buffer).  */
8092   if (EQ (dst_object, Qt)
8093       || (! NILP (CODING_ATTR_POST_READ (attrs))
8094           && NILP (dst_object)))
8095     {
8096       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8097       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8098       coding->dst_pos = BEG;
8099       coding->dst_pos_byte = BEG_BYTE;
8100     }
8101   else if (BUFFERP (dst_object))
8102     {
           /* Decode straight into DST_OBJECT at its point; the
              destination is multibyte iff that buffer is.  */
8103       code_conversion_save (0, 0);
8104       coding->dst_object = dst_object;
8105       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8106       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8107       coding->dst_multibyte
8108         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8109     }
8110   else
8111     {
8112       code_conversion_save (0, 0);
8113       coding->dst_object = Qnil;
8114       /* Most callers presume this will return a multibyte result, and they
8115          won't use `binary' or `raw-text' anyway, so let's not worry about
8116          CODING_FOR_UNIBYTE.  */
8117       coding->dst_multibyte = 1;
8118     }
8119
8120   decode_coding (coding);
8121
8122   if (BUFFERP (coding->dst_object))
8123     set_buffer_internal (XBUFFER (coding->dst_object));
8124
       /* Call the post-read function with point at the start of the
          decoded text and the decoded character count as argument,
          then fold whatever size change it caused (measured via Z and
          Z_BYTE) into the produced counts.  */
8125   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8126     {
8127       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8128       Lisp_Object val;
8129
8130       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8131       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8132                         make_number (coding->produced_char));
8133       CHECK_NATNUM (val);
8134       coding->produced_char += Z - prev_Z;
8135       coding->produced += Z_BYTE - prev_Z_BYTE;
8136     }
8137
8138   if (EQ (dst_object, Qt))
8139     {
           /* Hand the result back as a string built from the current
              buffer's text.  */
8140       coding->dst_object = Fbuffer_string ();
8141     }
8142   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8143     {
           /* Raw destination that was decoded through a buffer: copy
              the bytes out, growing DESTINATION when it is too small.
              NOTE(review): when dst_bytes >= coding->produced nothing
              is copied here -- confirm that no caller relies on that
              case.  */
8144       set_buffer_internal (XBUFFER (coding->dst_object));
8145       if (dst_bytes < coding->produced)
8146         {
8147           eassert (coding->produced > 0);
8148           destination = xrealloc (destination, coding->produced);
               /* Move the gap out of the produced text before the
                  single memcpy from BEGV_ADDR.  */
8149           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8150             move_gap_both (BEGV, BEGV_BYTE);
8151           memcpy (destination, BEGV_ADDR, coding->produced);
8152           coding->destination = destination;
8153         }
8154     }
8155
8156   if (saved_pt >= 0)
8157     {
8158       /* This is the case of:
8159          (BUFFERP (src_object) && EQ (src_object, dst_object))
8160          As we have moved PT while replacing the original buffer
8161          contents, we must recover it now.  */
8162       set_buffer_internal (XBUFFER (src_object));
8163       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region: unchanged.  Point inside the
              region: moved to FROM.  Point after the region: shifted
              by the size difference (character and byte deltas in a
              multibyte buffer, the byte delta for both in a unibyte
              one).  */
8164       if (saved_pt < from)
8165         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8166       else if (saved_pt < from + chars)
8167         TEMP_SET_PT_BOTH (from, from_byte);
8168       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8169         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8170                           saved_pt_byte + (coding->produced - bytes));
8171       else
8172         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8173                           saved_pt_byte + (coding->produced - bytes));
8174
8175       if (need_marker_adjustment)
8176         {
8177           struct Lisp_Marker *tail;
8178
               /* Flagged markers with insertion_type set go to FROM;
                  the others go to the end of the decoded text.  */
8179           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8180             if (tail->need_adjustment)
8181               {
8182                 tail->need_adjustment = 0;
8183                 if (tail->insertion_type)
8184                   {
8185                     tail->bytepos = from_byte;
8186                     tail->charpos = from;
8187                   }
8188                 else
8189                   {
8190                     tail->bytepos = from_byte + coding->produced;
8191                     tail->charpos
8192                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8193                          ? tail->bytepos : from + coding->produced_char);
8194                   }
8195               }
8196         }
8197     }
8198
8199   Vdeactivate_mark = old_deactivate_mark;
       /* Unwind to COUNT, passing coding->dst_object as the value
          argument of unbind_to.  */
8200   unbind_to (count, coding->dst_object);
8201 }
8202
8203
8204 void
8205 encode_coding_object (struct coding_system *coding,
8206                       Lisp_Object src_object,
8207                       ptrdiff_t from, ptrdiff_t from_byte,
8208                       ptrdiff_t to, ptrdiff_t to_byte,
8209                       Lisp_Object dst_object)
8210 {
       /* Encode the text in the range FROM/FROM_BYTE .. TO/TO_BYTE of
          SRC_OBJECT by coding context CODING.

          SRC_OBJECT is handled as a string, as a buffer, or otherwise
          as text located at coding->source.

          If DST_OBJECT is a buffer, the encoded text goes at FROM when
          it is the same object as SRC_OBJECT (the source text is
          deleted first), else at that buffer's point.  If it is Qt,
          the result ends up in CODING->dst_object as a unibyte string,
          unless CODING->raw_destination is set, in which case
          CODING->dst_object is Qnil and the bytes stay in
          CODING->destination.  For any other DST_OBJECT this function
          sets CODING->dst_object to Qnil and allocates nothing.  */
8211   ptrdiff_t count = SPECPDL_INDEX ();
8212   ptrdiff_t chars = to - from;
8213   ptrdiff_t bytes = to_byte - from_byte;
8214   Lisp_Object attrs;
       /* saved_pt stays negative unless point is saved for in-place
          encoding; the sign is later used as the "restore point"
          flag.  */
8215   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8216   bool need_marker_adjustment = 0;
8217   bool kill_src_buffer = 0;
8218   Lisp_Object old_deactivate_mark;
8219
       /* Remember Vdeactivate_mark; it is put back just before
          returning.  */
8220   old_deactivate_mark = Vdeactivate_mark;
8221
8222   coding->src_object = src_object;
8223   coding->src_chars = chars;
8224   coding->src_bytes = bytes;
       /* The source is flagged multibyte exactly when it has more bytes
          than characters.  */
8225   coding->src_multibyte = chars < bytes;
8226
8227   attrs = CODING_ID_ATTRS (coding->id);
8228
8229   if (EQ (src_object, dst_object))
8230     {
8231       struct Lisp_Marker *tail;
8232
           /* Flag every marker that sits on the relevant edge of the
              region (FROM for markers with insertion_type set, TO for
              the others) so it can be repositioned after encoding.
              NOTE(review): the scan walks current_buffer's markers, so
              it presumably relies on SRC_OBJECT being the current
              buffer; the EQ test is also true when both arguments are
              the same non-buffer object -- confirm with callers.  */
8233       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8234         {
8235           tail->need_adjustment
8236             = tail->charpos == (tail->insertion_type ? from : to);
8237           need_marker_adjustment |= tail->need_adjustment;
8238         }
8239     }
8240
8241   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8242     {
           /* A pre-write function exists: copy the source text into
              the buffer returned by code_conversion_save (1, ...) and
              let the function work on that copy.  */
8243       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8244       set_buffer_internal (XBUFFER (coding->src_object));
8245       if (STRINGP (src_object))
8246         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8247       else if (BUFFERP (src_object))
8248         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8249       else
8250         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8251
8252       if (EQ (src_object, dst_object))
8253         {
               /* In-place encoding: save point and delete the original
                  text, then return to the copy.  */
8254           set_buffer_internal (XBUFFER (src_object));
8255           saved_pt = PT, saved_pt_byte = PT_BYTE;
8256           del_range_both (from, from_byte, to, to_byte, 1);
8257           set_buffer_internal (XBUFFER (coding->src_object));
8258         }
8259
8260       safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8261                   make_number (BEG), make_number (Z));
           /* If the pre-write function left a different buffer
              current, that buffer becomes the source and is killed at
              the end of this function.  */
8262       if (XBUFFER (coding->src_object) != current_buffer)
8263         kill_src_buffer = 1;
8264       coding->src_object = Fcurrent_buffer ();
8265       if (BEG != GPT)
8266         move_gap_both (BEG, BEG_BYTE);
           /* The whole text of the (possibly new) current buffer is
              the source from here on.  */
8267       coding->src_chars = Z - BEG;
8268       coding->src_bytes = Z_BYTE - BEG_BYTE;
8269       coding->src_pos = BEG;
8270       coding->src_pos_byte = BEG_BYTE;
8271       coding->src_multibyte = Z < Z_BYTE;
8272     }
8273   else if (STRINGP (src_object))
8274     {
8275       code_conversion_save (0, 0);
8276       coding->src_pos = from;
8277       coding->src_pos_byte = from_byte;
8278     }
8279   else if (BUFFERP (src_object))
8280     {
8281       code_conversion_save (0, 0);
8282       set_buffer_internal (XBUFFER (src_object));
8283       if (EQ (src_object, dst_object))
8284         {
               /* In-place encoding without a pre-write function: the
                  value returned by del_range_1 becomes the source, read
                  from position 0 (presumably the deleted text as a
                  string -- confirm).  */
8285           saved_pt = PT, saved_pt_byte = PT_BYTE;
8286           coding->src_object = del_range_1 (from, to, 1, 1);
8287           coding->src_pos = 0;
8288           coding->src_pos_byte = 0;
8289         }
8290       else
8291         {
               /* Move the gap to FROM when it currently lies after
                  FROM and at or before TO.  */
8292           if (from < GPT && to >= GPT)
8293             move_gap_both (from, from_byte);
8294           coding->src_pos = from;
8295           coding->src_pos_byte = from_byte;
8296         }
8297     }
8298   else
8299     {
8300       code_conversion_save (0, 0);
8301       coding->src_pos = from;
8302       coding->src_pos_byte = from_byte;
8303     }
8304
8305   if (BUFFERP (dst_object))
8306     {
8307       coding->dst_object = dst_object;
8308       if (EQ (src_object, dst_object))
8309         {
8310           coding->dst_pos = from;
8311           coding->dst_pos_byte = from_byte;
8312         }
8313       else
8314         {
8315           struct buffer *current = current_buffer;
8316
               /* Briefly switch to the destination buffer to read its
                  point and put its gap there.  */
8317           set_buffer_temp (XBUFFER (dst_object));
8318           coding->dst_pos = PT;
8319           coding->dst_pos_byte = PT_BYTE;
8320           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8321           set_buffer_temp (current);
8322         }
8323       coding->dst_multibyte
8324         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8325     }
8326   else if (EQ (dst_object, Qt))
8327     {
           /* Start with one byte per source character (at least one
              byte); presumably the encoder enlarges the area when
              needed -- confirm.  */
8328       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8329       coding->dst_object = Qnil;
8330       coding->destination = xmalloc (dst_bytes);
8331       coding->dst_bytes = dst_bytes;
8332       coding->dst_multibyte = 0;
8333     }
8334   else
8335     {
8336       coding->dst_object = Qnil;
8337       coding->dst_multibyte = 0;
8338     }
8339
8340   encode_coding (coding);
8341
8342   if (EQ (dst_object, Qt))
8343     {
8344       if (BUFFERP (coding->dst_object))
8345         coding->dst_object = Fbuffer_string ();
8346       else if (coding->raw_destination)
8347         /* This is used to avoid creating huge Lisp string.
8348            NOTE: caller who sets `raw_destination' is also
8349            responsible for freeing `destination' buffer.  */
8350         coding->dst_object = Qnil;
8351       else
8352         {
               /* Wrap the produced bytes in a unibyte string and
                  release the malloc'ed area.  */
8353           coding->dst_object
8354             = make_unibyte_string ((char *) coding->destination,
8355                                    coding->produced);
8356           xfree (coding->destination);
8357         }
8358     }
8359
8360   if (saved_pt >= 0)
8361     {
8362       /* This is the case of:
8363          (BUFFERP (src_object) && EQ (src_object, dst_object))
8364          As we have moved PT while replacing the original buffer
8365          contents, we must recover it now.  */
8366       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region: unchanged.  Point inside the
              region: moved to FROM.  Point after the region: shifted
              by the size difference (character and byte deltas in a
              multibyte buffer, the byte delta for both in a unibyte
              one).  */
8367       if (saved_pt < from)
8368         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8369       else if (saved_pt < from + chars)
8370         TEMP_SET_PT_BOTH (from, from_byte);
8371       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8372         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8373                           saved_pt_byte + (coding->produced - bytes));
8374       else
8375         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8376                           saved_pt_byte + (coding->produced - bytes));
8377
8378       if (need_marker_adjustment)
8379         {
8380           struct Lisp_Marker *tail;
8381
               /* Flagged markers with insertion_type set go to FROM;
                  the others go to the end of the encoded text.  */
8382           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8383             if (tail->need_adjustment)
8384               {
8385                 tail->need_adjustment = 0;
8386                 if (tail->insertion_type)
8387                   {
8388                     tail->bytepos = from_byte;
8389                     tail->charpos = from;
8390                   }
8391                 else
8392                   {
8393                     tail->bytepos = from_byte + coding->produced;
8394                     tail->charpos
8395                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8396                          ? tail->bytepos : from + coding->produced_char);
8397                   }
8398               }
8399         }
8400     }
8401
       /* Dispose of the buffer the pre-write function switched to.  */
8402   if (kill_src_buffer)
8403     Fkill_buffer (coding->src_object);
8404
8405   Vdeactivate_mark = old_deactivate_mark;
8406   unbind_to (count, Qnil);
8407 }
8408
8409
8410 Lisp_Object
8411 preferred_coding_system (void)
8412 {
8413   int id = coding_categories[coding_priorities[0]].id;
8414
8415   return CODING_ID_NAME (id);
8416 }
8417
8418 #if defined (WINDOWSNT) || defined (CYGWIN)
8419
8420 Lisp_Object
8421 from_unicode (Lisp_Object str)
8422 {
8423   CHECK_STRING (str);
8424   if (!STRING_MULTIBYTE (str) &&
8425       SBYTES (str) & 1)
8426     {
8427       str = Fsubstring (str, make_number (0), make_number (-1));
8428     }
8429
8430   return code_convert_string_norecord (str, Qutf_16le, 0);
8431 }
8432
8433 Lisp_Object
8434 from_unicode_buffer (const wchar_t *wstr)
8435 {
8436     return from_unicode (
8437         make_unibyte_string (
8438             (char *) wstr,
8439             /* we get one of the two final 0 bytes for free. */
8440             1 + sizeof (wchar_t) * wcslen (wstr)));
8441 }
8442
8443 wchar_t *
8444 to_unicode (Lisp_Object str, Lisp_Object *buf)
8445 {
8446   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8447   /* We need to make another copy (in addition to the one made by
8448      code_convert_string_norecord) to ensure that the final string is
8449      _doubly_ zero terminated --- that is, that the string is
8450      terminated by two zero bytes and one utf-16le null character.
8451      Because strings are already terminated with a single zero byte,
8452      we just add one additional zero. */
8453   str = make_uninit_string (SBYTES (*buf) + 1);
8454   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8455   SDATA (str) [SBYTES (*buf)] = '\0';
8456   *buf = str;
8457   return WCSDATA (*buf);
8458 }
8459
8460 #endif /* WINDOWSNT || CYGWIN */
8461
8462 \f
8463 #ifdef emacs
8464 /*** 11. Emacs Lisp library functions ***/
8465
8466 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8467        doc: /* Return t if OBJECT is nil or a coding-system.
8468 See the documentation of `define-coding-system' for information
8469 about coding-system objects.  */)
8470   (Lisp_Object object)
8471 {
8472   if (NILP (object)
8473       || CODING_SYSTEM_ID (object) >= 0)
8474     return Qt;
8475   if (! SYMBOLP (object)
8476       || NILP (Fget (object, Qcoding_system_define_form)))
8477     return Qnil;
8478   return Qt;
8479 }
8480
8481 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8482        Sread_non_nil_coding_system, 1, 1, 0,
8483        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8484   (Lisp_Object prompt)
8485 {
8486   Lisp_Object val;
8487   do
8488     {
8489       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8490                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8491     }
8492   while (SCHARS (val) == 0);
8493   return (Fintern (val, Qnil));
8494 }
8495
8496 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8497        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8498 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8499 Ignores case when completing coding systems (all Emacs coding systems
8500 are lower-case).  */)
8501   (Lisp_Object prompt, Lisp_Object default_coding_system)
8502 {
8503   Lisp_Object val;
8504   ptrdiff_t count = SPECPDL_INDEX ();
8505
8506   if (SYMBOLP (default_coding_system))
8507     default_coding_system = SYMBOL_NAME (default_coding_system);
8508   specbind (Qcompletion_ignore_case, Qt);
8509   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8510                           Qt, Qnil, Qcoding_system_history,
8511                           default_coding_system, Qnil);
8512   unbind_to (count, Qnil);
8513   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8514 }
8515
8516 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8517        1, 1, 0,
8518        doc: /* Check validity of CODING-SYSTEM.
8519 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8520 It is valid if it is nil or a symbol defined as a coding system by the
8521 function `define-coding-system'.  */)
8522   (Lisp_Object coding_system)
8523 {
8524   Lisp_Object define_form;
8525
8526   define_form = Fget (coding_system, Qcoding_system_define_form);
8527   if (! NILP (define_form))
8528     {
8529       Fput (coding_system, Qcoding_system_define_form, Qnil);
8530       safe_eval (define_form);
8531     }
8532   if (!NILP (Fcoding_system_p (coding_system)))
8533     return coding_system;
8534   xsignal1 (Qcoding_system_error, coding_system);
8535 }
8536
8537 \f
8538 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8539    HIGHEST, return the coding system of the highest
8540    priority among the detected coding systems.  Otherwise return a
8541    list of detected coding systems sorted by their priorities.  If
8542    MULTIBYTEP, it is assumed that the bytes are in correct
8543    multibyte form but contain only ASCII and eight-bit chars.
8544    Otherwise, the bytes are raw bytes.
8545
8546    CODING-SYSTEM controls the detection as below:
8547
8548    If it is nil, detect both text-format and eol-format.  If the
8549    text-format part of CODING-SYSTEM is already specified
8550    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8551    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8552    detect only text-format.  */
8553
8554 Lisp_Object
8555 detect_coding_system (const unsigned char *src,
8556                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8557                       bool highest, bool multibytep,
8558                       Lisp_Object coding_system)
8559 {
8560   const unsigned char *src_end = src + src_bytes;
8561   Lisp_Object attrs, eol_type;
8562   Lisp_Object val = Qnil;
8563   struct coding_system coding;
8564   ptrdiff_t id;
8565   struct coding_detection_info detect_info;
8566   enum coding_category base_category;
8567   bool null_byte_found = 0, eight_bit_found = 0;
8568
       /* A nil CODING_SYSTEM means `undecided'.  After setup,
          CODING_SYSTEM is replaced by the base name taken from the
          coding system's attributes.  */
8569   if (NILP (coding_system))
8570     coding_system = Qundecided;
8571   setup_coding_system (coding_system, &coding);
8572   attrs = CODING_ID_ATTRS (coding.id);
8573   eol_type = CODING_ID_EOL_TYPE (coding.id);
8574   coding_system = CODING_ATTR_BASE_NAME (attrs);
8575
8576   coding.source = src;
8577   coding.src_chars = src_chars;
8578   coding.src_bytes = src_bytes;
8579   coding.src_multibyte = multibytep;
8580   coding.consumed = 0;
8581   coding.mode |= CODING_MODE_LAST_BLOCK;
8582   coding.head_ascii = 0;
8583
8584   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8585
8586   /* At first, detect text-format if necessary.  */
8587   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8588   if (base_category == coding_category_undecided)
8589     {
8590       enum coding_category category IF_LINT (= 0);
8591       struct coding_system *this IF_LINT (= NULL);
8592       int c, i;
8593       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8594                                        inhibit_null_byte_detection);
8595       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8596                                        inhibit_iso_escape_detection);
8597       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8598
8599       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii is advanced only while no eight-bit
              byte has been seen.  The scan stops early once both a
              null byte and an eight-bit byte have been found, or when
              the ISO-2022 detector reports success.  */
8600       for (; src < src_end; src++)
8601         {
8602           c = *src;
8603           if (c & 0x80)
8604             {
8605               eight_bit_found = 1;
8606               if (null_byte_found)
8607                 break;
8608             }
8609           else if (c < 0x20)
8610             {
                   /* The ISO-2022 detector is tried only on the first
                      ESC/SI/SO seen while nothing has been checked
                      yet, and only if escape detection is not
                      inhibited.  */
8611               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8612                   && ! inhibit_ied
8613                   && ! detect_info.checked)
8614                 {
8615                   if (detect_coding_iso_2022 (&coding, &detect_info))
8616                     {
8617                       /* We have scanned the whole data.  */
8618                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8619                         {
8620                           /* We didn't find an 8-bit code.  We may
8621                              have found a null-byte, but it's very
8622                              rare that a binary file conforms to
8623                              ISO-2022.  */
8624                           src = src_end;
8625                           coding.head_ascii = src - coding.source;
8626                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
8627                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8628                       break;
8629                     }
8630                 }
8631               else if (! c && !inhibit_nbd)
8632                 {
8633                   null_byte_found = 1;
8634                   if (eight_bit_found)
8635                     break;
8636                 }
8637               if (! eight_bit_found)
8638                 coding.head_ascii++;
8639             }
8640           else if (! eight_bit_found)
8641             coding.head_ascii++;
8642         }
8643
           /* Consult the per-category detectors only if the scan saw
              something other than plain 7-bit text, or the ISO-2022
              detector already found a candidate.  */
8644       if (null_byte_found || eight_bit_found
8645           || coding.head_ascii < coding.src_bytes
8646           || detect_info.found)
8647         {
8648           if (coding.head_ascii == coding.src_bytes)
8649             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8650             for (i = 0; i < coding_category_raw_text; i++)
8651               {
8652                 category = coding_priorities[i];
8653                 this = coding_categories + category;
8654                 if (detect_info.found & (1 << category))
8655                   break;
8656               }
8657           else
8658             {
                   /* A null byte leaves only the UTF-16 categories
                      open; a successful UTF-8 check (when UTF-8 is
                      preferred) leaves only the UTF-8 categories
                      open.  */
8659               if (null_byte_found)
8660                 {
8661                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8662                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8663                 }
8664               else if (prefer_utf_8
8665                        && detect_coding_utf_8 (&coding, &detect_info))
8666                 {
8667                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8668                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8669                 }
                   /* Walk the categories in priority order, running
                      the detector of each category not checked yet.
                      With HIGHEST, stop at the first category that is
                      found.  */
8670               for (i = 0; i < coding_category_raw_text; i++)
8671                 {
8672                   category = coding_priorities[i];
8673                   this = coding_categories + category;
8674
8675                   if (this->id < 0)
8676                     {
8677                       /* No coding system of this category is defined.  */
8678                       detect_info.rejected |= (1 << category);
8679                     }
8680                   else if (category >= coding_category_raw_text)
8681                     continue;
8682                   else if (detect_info.checked & (1 << category))
8683                     {
8684                       if (highest
8685                           && (detect_info.found & (1 << category)))
8686                         break;
8687                     }
8688                   else if ((*(this->detector)) (&coding, &detect_info)
8689                            && highest
8690                            && (detect_info.found & (1 << category)))
8691                     {
                           /* Narrow utf-16-auto to the concrete byte
                              order that was found.  */
8692                       if (category == coding_category_utf_16_auto)
8693                         {
8694                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8695                             category = coding_category_utf_16_le;
8696                           else
8697                             category = coding_category_utf_16_be;
8698                         }
8699                       break;
8700                     }
8701                 }
8702             }
8703         }
8704
           /* Everything rejected, or a null byte seen: report
              `no-conversion'.  */
8705       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8706           || null_byte_found)
8707         {
8708           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8709           id = CODING_SYSTEM_ID (Qno_conversion);
8710           val = list1 (make_number (id));
8711         }
           /* Nothing rejected and nothing found: report the coding
              system of the `undecided' category.  */
8712       else if (! detect_info.rejected && ! detect_info.found)
8713         {
8714           detect_info.found = CATEGORY_MASK_ANY;
8715           id = coding_categories[coding_category_undecided].id;
8716           val = list1 (make_number (id));
8717         }
8718       else if (highest)
8719         {
               /* Use the category the detection loop stopped at if
                  anything was found; otherwise take the first
                  category in priority order that was not rejected.  */
8720           if (detect_info.found)
8721             {
8722               detect_info.found = 1 << category;
8723               val = list1 (make_number (this->id));
8724             }
8725           else
8726             for (i = 0; i < coding_category_raw_text; i++)
8727               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8728                 {
8729                   detect_info.found = 1 << coding_priorities[i];
8730                   id = coding_categories[coding_priorities[i]].id;
8731                   val = list1 (make_number (id));
8732                   break;
8733                 }
8734         }
8735       else
8736         {
8737           int mask = detect_info.rejected | detect_info.found;
8738           int found = 0;
8739
               /* First pass, lowest priority first: categories that
                  were neither rejected nor found.
                  NOTE(review): VAL is overwritten with list1 on each
                  hit rather than extended, so only the last such
                  category visited (the highest-priority one) stays in
                  the list -- confirm this is intended.  */
8740           for (i = coding_category_raw_text - 1; i >= 0; i--)
8741             {
8742               category = coding_priorities[i];
8743               if (! (mask & (1 << category)))
8744                 {
8745                   found |= 1 << category;
8746                   id = coding_categories[category].id;
8747                   if (id >= 0)
8748                     val = list1 (make_number (id));
8749                 }
8750             }
               /* Second pass, again lowest priority first: push the
                  found categories in front, so the list ends up with
                  the highest-priority found category first.  */
8751           for (i = coding_category_raw_text - 1; i >= 0; i--)
8752             {
8753               category = coding_priorities[i];
8754               if (detect_info.found & (1 << category))
8755                 {
8756                   id = coding_categories[category].id;
8757                   val = Fcons (make_number (id), val);
8758                 }
8759             }
8760           detect_info.found |= found;
8761         }
8762     }
8763   else if (base_category == coding_category_utf_8_auto)
8764     {
           /* Only decide between the signature and no-signature
              UTF-8 variants.  */
8765       if (detect_coding_utf_8 (&coding, &detect_info))
8766         {
8767           struct coding_system *this;
8768
8769           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8770             this = coding_categories + coding_category_utf_8_sig;
8771           else
8772             this = coding_categories + coding_category_utf_8_nosig;
8773           val = list1 (make_number (this->id));
8774         }
8775     }
8776   else if (base_category == coding_category_utf_16_auto)
8777     {
           /* Only decide among the UTF-16 variants.  */
8778       if (detect_coding_utf_16 (&coding, &detect_info))
8779         {
8780           struct coding_system *this;
8781
8782           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8783             this = coding_categories + coding_category_utf_16_le;
8784           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8785             this = coding_categories + coding_category_utf_16_be;
8786           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8787             this = coding_categories + coding_category_utf_16_be_nosig;
8788           else
8789             this = coding_categories + coding_category_utf_16_le_nosig;
8790           val = list1 (make_number (this->id));
8791         }
8792     }
8793   else
8794     {
           /* The text format is already fixed: report CODING itself.  */
8795       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8796       val = list1 (make_number (coding.id));
8797     }
8798
8799   /* Then, detect eol-format if necessary.  */
8800   {
8801     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8802     Lisp_Object tail;
8803
         /* A vector EOL_TYPE is treated as "eol format still open":
            detect_eol is run separately for the non-UTF-16 candidates
            and for each UTF-16 byte order that was found.  With a null
            byte present, LF is assumed without scanning.  */
8804     if (VECTORP (eol_type))
8805       {
8806         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8807           {
8808             if (null_byte_found)
8809               normal_eol = EOL_SEEN_LF;
8810             else
8811               normal_eol = detect_eol (coding.source, src_bytes,
8812                                        coding_category_raw_text);
8813           }
8814         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8815                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8816           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8817                                       coding_category_utf_16_be);
8818         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8819                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8820           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8821                                       coding_category_utf_16_le);
8822       }
8823     else
8824       {
             /* The eol format is fixed by CODING-SYSTEM: unix -> LF,
                dos -> CRLF, anything else -> CR.  */
8825         if (EQ (eol_type, Qunix))
8826           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8827         else if (EQ (eol_type, Qdos))
8828           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8829         else
8830           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8831       }
8832
         /* Replace each coding-system id in VAL: by the matching
            element of the candidate's eol-type vector (index 0 for
            LF, 1 for CRLF, 2 for CR) when the eol format is known,
            otherwise by the coding system's own name.  */
8833     for (tail = val; CONSP (tail); tail = XCDR (tail))
8834       {
8835         enum coding_category category;
8836         int this_eol;
8837
8838         id = XINT (XCAR (tail));
8839         attrs = CODING_ID_ATTRS (id);
8840         category = XINT (CODING_ATTR_CATEGORY (attrs));
8841         eol_type = CODING_ID_EOL_TYPE (id);
8842         if (VECTORP (eol_type))
8843           {
8844             if (category == coding_category_utf_16_be
8845                 || category == coding_category_utf_16_be_nosig)
8846               this_eol = utf_16_be_eol;
8847             else if (category == coding_category_utf_16_le
8848                      || category == coding_category_utf_16_le_nosig)
8849               this_eol = utf_16_le_eol;
8850             else
8851               this_eol = normal_eol;
8852
8853             if (this_eol == EOL_SEEN_LF)
8854               XSETCAR (tail, AREF (eol_type, 0));
8855             else if (this_eol == EOL_SEEN_CRLF)
8856               XSETCAR (tail, AREF (eol_type, 1));
8857             else if (this_eol == EOL_SEEN_CR)
8858               XSETCAR (tail, AREF (eol_type, 2));
8859             else
8860               XSETCAR (tail, CODING_ID_NAME (id));
8861           }
8862         else
8863           XSETCAR (tail, CODING_ID_NAME (id));
8864       }
8865   }
8866
       /* With HIGHEST, return just the first element (or nil if the
          list is empty); otherwise return the whole list.  */
8867   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8868 }
8869
8870
8871 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8872        2, 3, 0,
8873        doc: /* Detect coding system of the text in the region between START and END.
8874 Return a list of possible coding systems ordered by priority.
8875 The coding systems to try and their priorities follows what
8876 the function `coding-system-priority-list' (which see) returns.
8877
8878 If only ASCII characters are found (except for such ISO-2022 control
8879 characters as ESC), it returns a list of single element `undecided'
8880 or its subsidiary coding system according to a detected end-of-line
8881 format.
8882
8883 If optional argument HIGHEST is non-nil, return the coding system of
8884 highest priority.  */)
8885   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8886 {
8887   ptrdiff_t from, to;
8888   ptrdiff_t from_byte, to_byte;
8889
8890   validate_region (&start, &end);
8891   from = XINT (start), to = XINT (end);
8892   from_byte = CHAR_TO_BYTE (from);
8893   to_byte = CHAR_TO_BYTE (to);
8894
8895   if (from < GPT && to >= GPT)
8896     move_gap_both (to, to_byte);
8897
8898   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8899                                to - from, to_byte - from_byte,
8900                                !NILP (highest),
8901                                !NILP (BVAR (current_buffer
8902                                       , enable_multibyte_characters)),
8903                                Qnil);
8904 }
8905
8906 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8907        1, 2, 0,
8908        doc: /* Detect coding system of the text in STRING.
8909 Return a list of possible coding systems ordered by priority.
8910 The coding systems to try and their priorities follows what
8911 the function `coding-system-priority-list' (which see) returns.
8912
8913 If only ASCII characters are found (except for such ISO-2022 control
8914 characters as ESC), it returns a list of single element `undecided'
8915 or its subsidiary coding system according to a detected end-of-line
8916 format.
8917
8918 If optional argument HIGHEST is non-nil, return the coding system of
8919 highest priority.  */)
8920   (Lisp_Object string, Lisp_Object highest)
8921 {
8922   CHECK_STRING (string);
8923
8924   return detect_coding_system (SDATA (string),
8925                                SCHARS (string), SBYTES (string),
8926                                !NILP (highest), STRING_MULTIBYTE (string),
8927                                Qnil);
8928 }
8929
8930
8931 static bool
8932 char_encodable_p (int c, Lisp_Object attrs)
8933 {
8934   Lisp_Object tail;
8935   struct charset *charset;
8936   Lisp_Object translation_table;
8937
8938   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8939   if (! NILP (translation_table))
8940     c = translate_char (translation_table, c);
8941   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8942        CONSP (tail); tail = XCDR (tail))
8943     {
8944       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8945       if (CHAR_CHARSET_P (c, charset))
8946         break;
8947     }
8948   return (! NILP (tail));
8949 }
8950
8951
8952 /* Return a list of coding systems that safely encode the text between
8953    START and END.  If EXCLUDE is non-nil, it is a list of coding
8954    systems not to check.  The returned list doesn't contain any such
8955    coding systems.  In any case, if the text contains only ASCII or is
8956    unibyte, return t.  */
8957
8958 DEFUN ("find-coding-systems-region-internal",
8959        Ffind_coding_systems_region_internal,
8960        Sfind_coding_systems_region_internal, 2, 3, 0,
8961        doc: /* Internal use only.  */)
8962   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8963 {
8964   Lisp_Object coding_attrs_list, safe_codings;
8965   ptrdiff_t start_byte, end_byte;
8966   const unsigned char *p, *pbeg, *pend;
8967   int c;
8968   Lisp_Object tail, elt, work_table;
8969
8970   if (STRINGP (start))
8971     {
8972       if (!STRING_MULTIBYTE (start)
8973           || SCHARS (start) == SBYTES (start))
8974         return Qt;
8975       start_byte = 0;
8976       end_byte = SBYTES (start);
8977     }
8978   else
8979     {
8980       CHECK_NUMBER_COERCE_MARKER (start);
8981       CHECK_NUMBER_COERCE_MARKER (end);
8982       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8983         args_out_of_range (start, end);
8984       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8985         return Qt;
8986       start_byte = CHAR_TO_BYTE (XINT (start));
8987       end_byte = CHAR_TO_BYTE (XINT (end));
8988       if (XINT (end) - XINT (start) == end_byte - start_byte)
8989         return Qt;
8990
8991       if (XINT (start) < GPT && XINT (end) > GPT)
8992         {
8993           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8994             move_gap_both (XINT (start), start_byte);
8995           else
8996             move_gap_both (XINT (end), end_byte);
8997         }
8998     }
8999
9000   coding_attrs_list = Qnil;
9001   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9002     if (NILP (exclude)
9003         || NILP (Fmemq (XCAR (tail), exclude)))
9004       {
9005         Lisp_Object attrs;
9006
9007         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9008         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9009           {
9010             ASET (attrs, coding_attr_trans_tbl,
9011                   get_translation_table (attrs, 1, NULL));
9012             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9013           }
9014       }
9015
9016   if (STRINGP (start))
9017     p = pbeg = SDATA (start);
9018   else
9019     p = pbeg = BYTE_POS_ADDR (start_byte);
9020   pend = p + (end_byte - start_byte);
9021
9022   while (p < pend && ASCII_CHAR_P (*p)) p++;
9023   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9024
9025   work_table = Fmake_char_table (Qnil, Qnil);
9026   while (p < pend)
9027     {
9028       if (ASCII_CHAR_P (*p))
9029         p++;
9030       else
9031         {
9032           c = STRING_CHAR_ADVANCE (p);
9033           if (!NILP (char_table_ref (work_table, c)))
9034             /* This character was already checked.  Ignore it.  */
9035             continue;
9036
9037           charset_map_loaded = 0;
9038           for (tail = coding_attrs_list; CONSP (tail);)
9039             {
9040               elt = XCAR (tail);
9041               if (NILP (elt))
9042                 tail = XCDR (tail);
9043               else if (char_encodable_p (c, elt))
9044                 tail = XCDR (tail);
9045               else if (CONSP (XCDR (tail)))
9046                 {
9047                   XSETCAR (tail, XCAR (XCDR (tail)));
9048                   XSETCDR (tail, XCDR (XCDR (tail)));
9049                 }
9050               else
9051                 {
9052                   XSETCAR (tail, Qnil);
9053                   tail = XCDR (tail);
9054                 }
9055             }
9056           if (charset_map_loaded)
9057             {
9058               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9059
9060               if (STRINGP (start))
9061                 pbeg = SDATA (start);
9062               else
9063                 pbeg = BYTE_POS_ADDR (start_byte);
9064               p = pbeg + p_offset;
9065               pend = pbeg + pend_offset;
9066             }
9067           char_table_set (work_table, c, Qt);
9068         }
9069     }
9070
9071   safe_codings = list2 (Qraw_text, Qno_conversion);
9072   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9073     if (! NILP (XCAR (tail)))
9074       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9075
9076   return safe_codings;
9077 }
9078
9079
9080 DEFUN ("unencodable-char-position", Funencodable_char_position,
9081        Sunencodable_char_position, 3, 5, 0,
9082        doc: /* Return position of first un-encodable character in a region.
9083 START and END specify the region and CODING-SYSTEM specifies the
9084 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9085
9086 If optional 4th argument COUNT is non-nil, it specifies at most how
9087 many un-encodable characters to search.  In this case, the value is a
9088 list of positions.
9089
9090 If optional 5th argument STRING is non-nil, it is a string to search
9091 for un-encodable characters.  In that case, START and END are indexes
9092 to the string and treated as in `substring'.  */)
9093   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9094    Lisp_Object count, Lisp_Object string)
9095 {
9096   EMACS_INT n;
9097   struct coding_system coding;
9098   Lisp_Object attrs, charset_list, translation_table;
9099   Lisp_Object positions;
9100   ptrdiff_t from, to;
9101   const unsigned char *p, *stop, *pend;
9102   bool ascii_compatible;
9103
9104   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9105   attrs = CODING_ID_ATTRS (coding.id);
9106   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9107     return Qnil;
9108   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9109   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9110   translation_table = get_translation_table (attrs, 1, NULL);
9111
9112   if (NILP (string))
9113     {
9114       validate_region (&start, &end);
9115       from = XINT (start);
9116       to = XINT (end);
9117       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9118           || (ascii_compatible
9119               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9120         return Qnil;
9121       p = CHAR_POS_ADDR (from);
9122       pend = CHAR_POS_ADDR (to);
9123       if (from < GPT && to >= GPT)
9124         stop = GPT_ADDR;
9125       else
9126         stop = pend;
9127     }
9128   else
9129     {
9130       CHECK_STRING (string);
9131       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9132       if (! STRING_MULTIBYTE (string))
9133         return Qnil;
9134       p = SDATA (string) + string_char_to_byte (string, from);
9135       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9136       if (ascii_compatible && (to - from) == (pend - p))
9137         return Qnil;
9138     }
9139
9140   if (NILP (count))
9141     n = 1;
9142   else
9143     {
9144       CHECK_NATNUM (count);
9145       n = XINT (count);
9146     }
9147
9148   positions = Qnil;
9149   charset_map_loaded = 0;
9150   while (1)
9151     {
9152       int c;
9153
9154       if (ascii_compatible)
9155         while (p < stop && ASCII_CHAR_P (*p))
9156           p++, from++;
9157       if (p >= stop)
9158         {
9159           if (p >= pend)
9160             break;
9161           stop = pend;
9162           p = GAP_END_ADDR;
9163         }
9164
9165       c = STRING_CHAR_ADVANCE (p);
9166       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9167           && ! char_charset (translate_char (translation_table, c),
9168                              charset_list, NULL))
9169         {
9170           positions = Fcons (make_number (from), positions);
9171           n--;
9172           if (n == 0)
9173             break;
9174         }
9175
9176       from++;
9177       if (charset_map_loaded && NILP (string))
9178         {
9179           p = CHAR_POS_ADDR (from);
9180           pend = CHAR_POS_ADDR (to);
9181           if (from < GPT && to >= GPT)
9182             stop = GPT_ADDR;
9183           else
9184             stop = pend;
9185           charset_map_loaded = 0;
9186         }
9187     }
9188
9189   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9190 }
9191
9192
9193 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9194        Scheck_coding_systems_region, 3, 3, 0,
9195        doc: /* Check if the region is encodable by coding systems.
9196
9197 START and END are buffer positions specifying the region.
9198 CODING-SYSTEM-LIST is a list of coding systems to check.
9199
9200 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9201 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9202 whole region, POS0, POS1, ... are buffer positions where non-encodable
9203 characters are found.
9204
9205 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9206 value is nil.
9207
9208 START may be a string.  In that case, check if the string is
9209 encodable, and the value contains indices to the string instead of
9210 buffer positions.  END is ignored.
9211
9212 If the current buffer (or START if it is a string) is unibyte, the value
9213 is nil.  */)
9214   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9215 {
9216   Lisp_Object list;
9217   ptrdiff_t start_byte, end_byte;
9218   ptrdiff_t pos;
9219   const unsigned char *p, *pbeg, *pend;
9220   int c;
9221   Lisp_Object tail, elt, attrs;
9222
9223   if (STRINGP (start))
9224     {
9225       if (!STRING_MULTIBYTE (start)
9226           || SCHARS (start) == SBYTES (start))
9227         return Qnil;
9228       start_byte = 0;
9229       end_byte = SBYTES (start);
9230       pos = 0;
9231     }
9232   else
9233     {
9234       CHECK_NUMBER_COERCE_MARKER (start);
9235       CHECK_NUMBER_COERCE_MARKER (end);
9236       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9237         args_out_of_range (start, end);
9238       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9239         return Qnil;
9240       start_byte = CHAR_TO_BYTE (XINT (start));
9241       end_byte = CHAR_TO_BYTE (XINT (end));
9242       if (XINT (end) - XINT (start) == end_byte - start_byte)
9243         return Qnil;
9244
9245       if (XINT (start) < GPT && XINT (end) > GPT)
9246         {
9247           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9248             move_gap_both (XINT (start), start_byte);
9249           else
9250             move_gap_both (XINT (end), end_byte);
9251         }
9252       pos = XINT (start);
9253     }
9254
9255   list = Qnil;
9256   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9257     {
9258       elt = XCAR (tail);
9259       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9260       ASET (attrs, coding_attr_trans_tbl,
9261             get_translation_table (attrs, 1, NULL));
9262       list = Fcons (list2 (elt, attrs), list);
9263     }
9264
9265   if (STRINGP (start))
9266     p = pbeg = SDATA (start);
9267   else
9268     p = pbeg = BYTE_POS_ADDR (start_byte);
9269   pend = p + (end_byte - start_byte);
9270
9271   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9272   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9273
9274   while (p < pend)
9275     {
9276       if (ASCII_CHAR_P (*p))
9277         p++;
9278       else
9279         {
9280           c = STRING_CHAR_ADVANCE (p);
9281
9282           charset_map_loaded = 0;
9283           for (tail = list; CONSP (tail); tail = XCDR (tail))
9284             {
9285               elt = XCDR (XCAR (tail));
9286               if (! char_encodable_p (c, XCAR (elt)))
9287                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9288             }
9289           if (charset_map_loaded)
9290             {
9291               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9292
9293               if (STRINGP (start))
9294                 pbeg = SDATA (start);
9295               else
9296                 pbeg = BYTE_POS_ADDR (start_byte);
9297               p = pbeg + p_offset;
9298               pend = pbeg + pend_offset;
9299             }
9300         }
9301       pos++;
9302     }
9303
9304   tail = list;
9305   list = Qnil;
9306   for (; CONSP (tail); tail = XCDR (tail))
9307     {
9308       elt = XCAR (tail);
9309       if (CONSP (XCDR (XCDR (elt))))
9310         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9311                       list);
9312     }
9313
9314   return list;
9315 }
9316
9317
9318 static Lisp_Object
9319 code_convert_region (Lisp_Object start, Lisp_Object end,
9320                      Lisp_Object coding_system, Lisp_Object dst_object,
9321                      bool encodep, bool norecord)
9322 {
9323   struct coding_system coding;
9324   ptrdiff_t from, from_byte, to, to_byte;
9325   Lisp_Object src_object;
9326
9327   if (NILP (coding_system))
9328     coding_system = Qno_conversion;
9329   else
9330     CHECK_CODING_SYSTEM (coding_system);
9331   src_object = Fcurrent_buffer ();
9332   if (NILP (dst_object))
9333     dst_object = src_object;
9334   else if (! EQ (dst_object, Qt))
9335     CHECK_BUFFER (dst_object);
9336
9337   validate_region (&start, &end);
9338   from = XFASTINT (start);
9339   from_byte = CHAR_TO_BYTE (from);
9340   to = XFASTINT (end);
9341   to_byte = CHAR_TO_BYTE (to);
9342
9343   setup_coding_system (coding_system, &coding);
9344   coding.mode |= CODING_MODE_LAST_BLOCK;
9345
9346   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9347     {
9348       struct buffer *buf = XBUFFER (dst_object);
9349       ptrdiff_t buf_pt = BUF_PT (buf);
9350
9351       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9352     }
9353
9354   if (encodep)
9355     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9356                           dst_object);
9357   else
9358     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9359                           dst_object);
9360   if (! norecord)
9361     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9362
9363   return (BUFFERP (dst_object)
9364           ? make_number (coding.produced_char)
9365           : coding.dst_object);
9366 }
9367
9368
9369 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9370        3, 4, "r\nzCoding system: ",
9371        doc: /* Decode the current region from the specified coding system.
9372 When called from a program, takes four arguments:
9373         START, END, CODING-SYSTEM, and DESTINATION.
9374 START and END are buffer positions.
9375
9376 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9377 If nil, the region between START and END is replaced by the decoded text.
9378 If buffer, the decoded text is inserted in that buffer after point (point
9379 does not move).
9380 In those cases, the length of the decoded text is returned.
9381 If DESTINATION is t, the decoded text is returned.
9382
9383 This function sets `last-coding-system-used' to the precise coding system
9384 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9385 not fully specified.)  */)
9386   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9387 {
9388   return code_convert_region (start, end, coding_system, destination, 0, 0);
9389 }
9390
9391 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9392        3, 4, "r\nzCoding system: ",
9393        doc: /* Encode the current region by specified coding system.
9394 When called from a program, takes four arguments:
9395         START, END, CODING-SYSTEM and DESTINATION.
9396 START and END are buffer positions.
9397
9398 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9399 If nil, the region between START and END is replace by the encoded text.
9400 If buffer, the encoded text is inserted in that buffer after point (point
9401 does not move).
9402 In those cases, the length of the encoded text is returned.
9403 If DESTINATION is t, the encoded text is returned.
9404
9405 This function sets `last-coding-system-used' to the precise coding system
9406 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9407 not fully specified.)  */)
9408   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9409 {
9410   return code_convert_region (start, end, coding_system, destination, 1, 0);
9411 }
9412
9413 Lisp_Object
9414 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9415                      Lisp_Object dst_object, bool encodep, bool nocopy,
9416                      bool norecord)
9417 {
9418   struct coding_system coding;
9419   ptrdiff_t chars, bytes;
9420
9421   CHECK_STRING (string);
9422   if (NILP (coding_system))
9423     {
9424       if (! norecord)
9425         Vlast_coding_system_used = Qno_conversion;
9426       if (NILP (dst_object))
9427         return (nocopy ? Fcopy_sequence (string) : string);
9428     }
9429
9430   if (NILP (coding_system))
9431     coding_system = Qno_conversion;
9432   else
9433     CHECK_CODING_SYSTEM (coding_system);
9434   if (NILP (dst_object))
9435     dst_object = Qt;
9436   else if (! EQ (dst_object, Qt))
9437     CHECK_BUFFER (dst_object);
9438
9439   setup_coding_system (coding_system, &coding);
9440   coding.mode |= CODING_MODE_LAST_BLOCK;
9441   chars = SCHARS (string);
9442   bytes = SBYTES (string);
9443
9444   if (BUFFERP (dst_object))
9445     {
9446       struct buffer *buf = XBUFFER (dst_object);
9447       ptrdiff_t buf_pt = BUF_PT (buf);
9448
9449       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9450     }
9451
9452   if (encodep)
9453     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9454   else
9455     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9456   if (! norecord)
9457     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9458
9459   return (BUFFERP (dst_object)
9460           ? make_number (coding.produced_char)
9461           : coding.dst_object);
9462 }
9463
9464
9465 /* Encode or decode STRING according to CODING_SYSTEM.
9466    Do not set Vlast_coding_system_used.
9467
9468    This function is called only from macros DECODE_FILE and
9469    ENCODE_FILE, thus we ignore character composition.  */
9470
9471 Lisp_Object
9472 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9473                               bool encodep)
9474 {
9475   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9476 }
9477
9478 /* Encode or decode a file name, to or from a unibyte string suitable
9479    for passing to C library functions.  */
9480 Lisp_Object
9481 decode_file_name (Lisp_Object fname)
9482 {
9483 #ifdef WINDOWSNT
9484   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9485      converts the file names either to UTF-16LE or to the system ANSI
9486      codepage internally, depending on the underlying OS; see w32.c.  */
9487   if (! NILP (Fcoding_system_p (Qutf_8)))
9488     return code_convert_string_norecord (fname, Qutf_8, 0);
9489   return fname;
9490 #else  /* !WINDOWSNT */
9491   if (! NILP (Vfile_name_coding_system))
9492     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9493   else if (! NILP (Vdefault_file_name_coding_system))
9494     return code_convert_string_norecord (fname,
9495                                          Vdefault_file_name_coding_system, 0);
9496   else
9497     return fname;
9498 #endif
9499 }
9500
9501 Lisp_Object
9502 encode_file_name (Lisp_Object fname)
9503 {
9504   /* This is especially important during bootstrap and dumping, when
9505      file-name encoding is not yet known, and therefore any non-ASCII
9506      file names are unibyte strings, and could only be thrashed if we
9507      try to encode them.  */
9508   if (!STRING_MULTIBYTE (fname))
9509     return fname;
9510 #ifdef WINDOWSNT
9511   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9512      converts the file names either to UTF-16LE or to the system ANSI
9513      codepage internally, depending on the underlying OS; see w32.c.  */
9514   if (! NILP (Fcoding_system_p (Qutf_8)))
9515     return code_convert_string_norecord (fname, Qutf_8, 1);
9516   return fname;
9517 #else  /* !WINDOWSNT */
9518   if (! NILP (Vfile_name_coding_system))
9519     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9520   else if (! NILP (Vdefault_file_name_coding_system))
9521     return code_convert_string_norecord (fname,
9522                                          Vdefault_file_name_coding_system, 1);
9523   else
9524     return fname;
9525 #endif
9526 }
9527
9528 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9529        2, 4, 0,
9530        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9531
9532 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9533 if the decoding operation is trivial.
9534
9535 Optional fourth arg BUFFER non-nil means that the decoded text is
9536 inserted in that buffer after point (point does not move).  In this
9537 case, the return value is the length of the decoded text.
9538
9539 This function sets `last-coding-system-used' to the precise coding system
9540 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9541 not fully specified.)  */)
9542   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9543 {
9544   return code_convert_string (string, coding_system, buffer,
9545                               0, ! NILP (nocopy), 0);
9546 }
9547
9548 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9549        2, 4, 0,
9550        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9551
9552 Optional third arg NOCOPY non-nil means it is OK to return STRING
9553 itself if the encoding operation is trivial.
9554
9555 Optional fourth arg BUFFER non-nil means that the encoded text is
9556 inserted in that buffer after point (point does not move).  In this
9557 case, the return value is the length of the encoded text.
9558
9559 This function sets `last-coding-system-used' to the precise coding system
9560 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9561 not fully specified.)  */)
9562   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9563 {
9564   return code_convert_string (string, coding_system, buffer,
9565                               1, ! NILP (nocopy), 0);
9566 }
9567
9568 \f
9569 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9570        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9571 Return the corresponding character.  */)
9572   (Lisp_Object code)
9573 {
9574   Lisp_Object spec, attrs, val;
9575   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9576   EMACS_INT ch;
9577   int c;
9578
9579   CHECK_NATNUM (code);
9580   ch = XFASTINT (code);
9581   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9582   attrs = AREF (spec, 0);
9583
9584   if (ASCII_CHAR_P (ch)
9585       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9586     return code;
9587
9588   val = CODING_ATTR_CHARSET_LIST (attrs);
9589   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9590   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9591   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9592
9593   if (ch <= 0x7F)
9594     {
9595       c = ch;
9596       charset = charset_roman;
9597     }
9598   else if (ch >= 0xA0 && ch < 0xDF)
9599     {
9600       c = ch - 0x80;
9601       charset = charset_kana;
9602     }
9603   else
9604     {
9605       EMACS_INT c1 = ch >> 8;
9606       int c2 = ch & 0xFF;
9607
9608       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9609           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9610         error ("Invalid code: %"pI"d", ch);
9611       c = ch;
9612       SJIS_TO_JIS (c);
9613       charset = charset_kanji;
9614     }
9615   c = DECODE_CHAR (charset, c);
9616   if (c < 0)
9617     error ("Invalid code: %"pI"d", ch);
9618   return make_number (c);
9619 }
9620
9621
9622 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9623        doc: /* Encode a Japanese character CH to shift_jis encoding.
9624 Return the corresponding code in SJIS.  */)
9625   (Lisp_Object ch)
9626 {
9627   Lisp_Object spec, attrs, charset_list;
9628   int c;
9629   struct charset *charset;
9630   unsigned code;
9631
9632   CHECK_CHARACTER (ch);
9633   c = XFASTINT (ch);
9634   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9635   attrs = AREF (spec, 0);
9636
9637   if (ASCII_CHAR_P (c)
9638       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9639     return ch;
9640
9641   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9642   charset = char_charset (c, charset_list, &code);
9643   if (code == CHARSET_INVALID_CODE (charset))
9644     error ("Can't encode by shift_jis encoding: %c", c);
9645   JIS_TO_SJIS (code);
9646
9647   return make_number (code);
9648 }
9649
9650 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9651        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9652 Return the corresponding character.  */)
9653   (Lisp_Object code)
9654 {
9655   Lisp_Object spec, attrs, val;
9656   struct charset *charset_roman, *charset_big5, *charset;
9657   EMACS_INT ch;
9658   int c;
9659
9660   CHECK_NATNUM (code);
9661   ch = XFASTINT (code);
9662   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9663   attrs = AREF (spec, 0);
9664
9665   if (ASCII_CHAR_P (ch)
9666       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9667     return code;
9668
9669   val = CODING_ATTR_CHARSET_LIST (attrs);
9670   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9671   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9672
9673   if (ch <= 0x7F)
9674     {
9675       c = ch;
9676       charset = charset_roman;
9677     }
9678   else
9679     {
9680       EMACS_INT b1 = ch >> 8;
9681       int b2 = ch & 0x7F;
9682       if (b1 < 0xA1 || b1 > 0xFE
9683           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9684         error ("Invalid code: %"pI"d", ch);
9685       c = ch;
9686       charset = charset_big5;
9687     }
9688   c = DECODE_CHAR (charset, c);
9689   if (c < 0)
9690     error ("Invalid code: %"pI"d", ch);
9691   return make_number (c);
9692 }
9693
9694 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9695        doc: /* Encode the Big5 character CH to BIG5 coding system.
9696 Return the corresponding character code in Big5.  */)
9697   (Lisp_Object ch)
9698 {
9699   Lisp_Object spec, attrs, charset_list;
9700   struct charset *charset;
9701   int c;
9702   unsigned code;
9703
9704   CHECK_CHARACTER (ch);
9705   c = XFASTINT (ch);
9706   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9707   attrs = AREF (spec, 0);
9708   if (ASCII_CHAR_P (c)
9709       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9710     return ch;
9711
9712   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9713   charset = char_charset (c, charset_list, &code);
9714   if (code == CHARSET_INVALID_CODE (charset))
9715     error ("Can't encode by Big5 encoding: %c", c);
9716
9717   return make_number (code);
9718 }
9719
9720 \f
9721 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9722        Sset_terminal_coding_system_internal, 1, 2, 0,
9723        doc: /* Internal use only.  */)
9724   (Lisp_Object coding_system, Lisp_Object terminal)
9725 {
9726   struct terminal *term = decode_live_terminal (terminal);
9727   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9728   CHECK_SYMBOL (coding_system);
9729   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9730   /* We had better not send unsafe characters to terminal.  */
9731   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9732   /* Character composition should be disabled.  */
9733   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9734   terminal_coding->src_multibyte = 1;
9735   terminal_coding->dst_multibyte = 0;
9736   tset_charset_list
9737     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9738             ? coding_charset_list (terminal_coding)
9739             : list1 (make_number (charset_ascii))));
9740   return Qnil;
9741 }
9742
9743 DEFUN ("set-safe-terminal-coding-system-internal",
9744        Fset_safe_terminal_coding_system_internal,
9745        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9746        doc: /* Internal use only.  */)
9747   (Lisp_Object coding_system)
9748 {
9749   CHECK_SYMBOL (coding_system);
9750   setup_coding_system (Fcheck_coding_system (coding_system),
9751                        &safe_terminal_coding);
9752   /* Character composition should be disabled.  */
9753   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9754   safe_terminal_coding.src_multibyte = 1;
9755   safe_terminal_coding.dst_multibyte = 0;
9756   return Qnil;
9757 }
9758
9759 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9760        Sterminal_coding_system, 0, 1, 0,
9761        doc: /* Return coding system specified for terminal output on the given terminal.
9762 TERMINAL may be a terminal object, a frame, or nil for the selected
9763 frame's terminal device.  */)
9764   (Lisp_Object terminal)
9765 {
9766   struct coding_system *terminal_coding
9767     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9768   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9769
9770   /* For backward compatibility, return nil if it is `undecided'.  */
9771   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9772 }
9773
9774 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9775        Sset_keyboard_coding_system_internal, 1, 2, 0,
9776        doc: /* Internal use only.  */)
9777   (Lisp_Object coding_system, Lisp_Object terminal)
9778 {
9779   struct terminal *t = decode_live_terminal (terminal);
9780   CHECK_SYMBOL (coding_system);
9781   if (NILP (coding_system))
9782     coding_system = Qno_conversion;
9783   else
9784     Fcheck_coding_system (coding_system);
9785   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9786   /* Character composition should be disabled.  */
9787   TERMINAL_KEYBOARD_CODING (t)->common_flags
9788     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9789   return Qnil;
9790 }
9791
9792 DEFUN ("keyboard-coding-system",
9793        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9794        doc: /* Return coding system specified for decoding keyboard input.  */)
9795   (Lisp_Object terminal)
9796 {
9797   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9798                          (decode_live_terminal (terminal))->id);
9799 }
9800
9801 \f
9802 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9803        Sfind_operation_coding_system,  1, MANY, 0,
9804        doc: /* Choose a coding system for an operation based on the target name.
9805 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9806 DECODING-SYSTEM is the coding system to use for decoding
9807 (in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9808 for encoding (in case OPERATION does encoding).
9809
9810 The first argument OPERATION specifies an I/O primitive:
9811   For file I/O, `insert-file-contents' or `write-region'.
9812   For process I/O, `call-process', `call-process-region', or `start-process'.
9813   For network I/O, `open-network-stream'.
9814
9815 The remaining arguments should be the same arguments that were passed
9816 to the primitive.  Depending on which primitive, one of those arguments
9817 is selected as the TARGET.  For example, if OPERATION does file I/O,
9818 whichever argument specifies the file name is TARGET.
9819
9820 TARGET has a meaning which depends on OPERATION:
9821   For file I/O, TARGET is a file name (except for the special case below).
9822   For process I/O, TARGET is a process name.
9823   For network I/O, TARGET is a service name or a port number.
9824
9825 This function looks up what is specified for TARGET in
9826 `file-coding-system-alist', `process-coding-system-alist',
9827 or `network-coding-system-alist' depending on OPERATION.
9828 They may specify a coding system, a cons of coding systems,
9829 or a function symbol to call.
9830 In the last case, we call the function with one argument,
9831 which is a list of all the arguments given to this function.
9832 If the function can't decide a coding system, it can return
9833 `undecided' so that the normal code-detection is performed.
9834
9835 If OPERATION is `insert-file-contents', the argument corresponding to
9836 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9837 file name to look up, and BUFFER is a buffer that contains the file's
9838 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9839 function to call for FILENAME, that function should examine the
9840 contents of BUFFER instead of reading the file.
9841
9842 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9843   (ptrdiff_t nargs, Lisp_Object *args)
9844 {
9845   Lisp_Object operation, target_idx, target, val;
9846   register Lisp_Object chain;
9847
9848   if (nargs < 2)
9849     error ("Too few arguments");
9850   operation = args[0];
9851   if (!SYMBOLP (operation)
9852       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9853     error ("Invalid first argument");
9854   if (nargs <= 1 + XFASTINT (target_idx))
9855     error ("Too few arguments for operation `%s'",
9856            SDATA (SYMBOL_NAME (operation)));
9857   target = args[XFASTINT (target_idx) + 1];
9858   if (!(STRINGP (target)
9859         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9860             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9861         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9862     error ("Invalid argument %"pI"d of operation `%s'",
9863            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9864   if (CONSP (target))
9865     target = XCAR (target);
9866
9867   chain = ((EQ (operation, Qinsert_file_contents)
9868             || EQ (operation, Qwrite_region))
9869            ? Vfile_coding_system_alist
9870            : (EQ (operation, Qopen_network_stream)
9871               ? Vnetwork_coding_system_alist
9872               : Vprocess_coding_system_alist));
9873   if (NILP (chain))
9874     return Qnil;
9875
9876   for (; CONSP (chain); chain = XCDR (chain))
9877     {
9878       Lisp_Object elt;
9879
9880       elt = XCAR (chain);
9881       if (CONSP (elt)
9882           && ((STRINGP (target)
9883                && STRINGP (XCAR (elt))
9884                && fast_string_match (XCAR (elt), target) >= 0)
9885               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9886         {
9887           val = XCDR (elt);
9888           /* Here, if VAL is both a valid coding system and a valid
9889              function symbol, we return VAL as a coding system.  */
9890           if (CONSP (val))
9891             return val;
9892           if (! SYMBOLP (val))
9893             return Qnil;
9894           if (! NILP (Fcoding_system_p (val)))
9895             return Fcons (val, val);
9896           if (! NILP (Ffboundp (val)))
9897             {
9898               /* We use call1 rather than safe_call1
9899                  so as to get bug reports about functions called here
9900                  which don't handle the current interface.  */
9901               val = call1 (val, Flist (nargs, args));
9902               if (CONSP (val))
9903                 return val;
9904               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9905                 return Fcons (val, val);
9906             }
9907           return Qnil;
9908         }
9909     }
9910   return Qnil;
9911 }
9912
9913 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9914        Sset_coding_system_priority, 0, MANY, 0,
9915        doc: /* Assign higher priority to the coding systems given as arguments.
9916 If multiple coding systems belong to the same category,
9917 all but the first one are ignored.
9918
9919 usage: (set-coding-system-priority &rest coding-systems)  */)
9920   (ptrdiff_t nargs, Lisp_Object *args)
9921 {
9922   ptrdiff_t i, j;
9923   bool changed[coding_category_max];
9924   enum coding_category priorities[coding_category_max];
9925
9926   memset (changed, 0, sizeof changed);
9927
9928   for (i = j = 0; i < nargs; i++)
9929     {
9930       enum coding_category category;
9931       Lisp_Object spec, attrs;
9932
9933       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9934       attrs = AREF (spec, 0);
9935       category = XINT (CODING_ATTR_CATEGORY (attrs));
9936       if (changed[category])
9937         /* Ignore this coding system because a coding system of the
9938            same category already had a higher priority.  */
9939         continue;
9940       changed[category] = 1;
9941       priorities[j++] = category;
9942       if (coding_categories[category].id >= 0
9943           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9944         setup_coding_system (args[i], &coding_categories[category]);
9945       Fset (AREF (Vcoding_category_table, category), args[i]);
9946     }
9947
9948   /* Now we have decided top J priorities.  Reflect the order of the
9949      original priorities to the remaining priorities.  */
9950
9951   for (i = j, j = 0; i < coding_category_max; i++, j++)
9952     {
9953       while (j < coding_category_max
9954              && changed[coding_priorities[j]])
9955         j++;
9956       if (j == coding_category_max)
9957         emacs_abort ();
9958       priorities[i] = coding_priorities[j];
9959     }
9960
9961   memcpy (coding_priorities, priorities, sizeof priorities);
9962
9963   /* Update `coding-category-list'.  */
9964   Vcoding_category_list = Qnil;
9965   for (i = coding_category_max; i-- > 0; )
9966     Vcoding_category_list
9967       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9968                Vcoding_category_list);
9969
9970   return Qnil;
9971 }
9972
9973 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9974        Scoding_system_priority_list, 0, 1, 0,
9975        doc: /* Return a list of coding systems ordered by their priorities.
9976 The list contains a subset of coding systems; i.e. coding systems
9977 assigned to each coding category (see `coding-category-list').
9978
9979 HIGHESTP non-nil means just return the highest priority one.  */)
9980   (Lisp_Object highestp)
9981 {
9982   int i;
9983   Lisp_Object val;
9984
9985   for (i = 0, val = Qnil; i < coding_category_max; i++)
9986     {
9987       enum coding_category category = coding_priorities[i];
9988       int id = coding_categories[category].id;
9989       Lisp_Object attrs;
9990
9991       if (id < 0)
9992         continue;
9993       attrs = CODING_ID_ATTRS (id);
9994       if (! NILP (highestp))
9995         return CODING_ATTR_BASE_NAME (attrs);
9996       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9997     }
9998   return Fnreverse (val);
9999 }
10000
10001 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10002
10003 static Lisp_Object
10004 make_subsidiaries (Lisp_Object base)
10005 {
10006   Lisp_Object subsidiaries;
10007   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10008   USE_SAFE_ALLOCA;
10009   char *buf = SAFE_ALLOCA (base_name_len + 6);
10010   int i;
10011
10012   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10013   subsidiaries = make_uninit_vector (3);
10014   for (i = 0; i < 3; i++)
10015     {
10016       strcpy (buf + base_name_len, suffixes[i]);
10017       ASET (subsidiaries, i, intern (buf));
10018     }
10019   SAFE_FREE ();
10020   return subsidiaries;
10021 }
10022
10023
10024 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10025        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10026        doc: /* For internal use only.
10027 usage: (define-coding-system-internal ...)  */)
10028   (ptrdiff_t nargs, Lisp_Object *args)
10029 {
10030   Lisp_Object name;
10031   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10032   Lisp_Object attrs;            /* Vector of attributes.  */
10033   Lisp_Object eol_type;
10034   Lisp_Object aliases;
10035   Lisp_Object coding_type, charset_list, safe_charsets;
10036   enum coding_category category;
10037   Lisp_Object tail, val;
10038   int max_charset_id = 0;
10039   int i;
10040
10041   if (nargs < coding_arg_max)
10042     goto short_args;
10043
10044   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10045
10046   name = args[coding_arg_name];
10047   CHECK_SYMBOL (name);
10048   ASET (attrs, coding_attr_base_name, name);
10049
10050   val = args[coding_arg_mnemonic];
10051   if (! STRINGP (val))
10052     CHECK_CHARACTER (val);
10053   ASET (attrs, coding_attr_mnemonic, val);
10054
10055   coding_type = args[coding_arg_coding_type];
10056   CHECK_SYMBOL (coding_type);
10057   ASET (attrs, coding_attr_type, coding_type);
10058
10059   charset_list = args[coding_arg_charset_list];
10060   if (SYMBOLP (charset_list))
10061     {
10062       if (EQ (charset_list, Qiso_2022))
10063         {
10064           if (! EQ (coding_type, Qiso_2022))
10065             error ("Invalid charset-list");
10066           charset_list = Viso_2022_charset_list;
10067         }
10068       else if (EQ (charset_list, Qemacs_mule))
10069         {
10070           if (! EQ (coding_type, Qemacs_mule))
10071             error ("Invalid charset-list");
10072           charset_list = Vemacs_mule_charset_list;
10073         }
10074       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10075         {
10076           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10077             error ("Invalid charset-list");
10078           if (max_charset_id < XFASTINT (XCAR (tail)))
10079             max_charset_id = XFASTINT (XCAR (tail));
10080         }
10081     }
10082   else
10083     {
10084       charset_list = Fcopy_sequence (charset_list);
10085       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10086         {
10087           struct charset *charset;
10088
10089           val = XCAR (tail);
10090           CHECK_CHARSET_GET_CHARSET (val, charset);
10091           if (EQ (coding_type, Qiso_2022)
10092               ? CHARSET_ISO_FINAL (charset) < 0
10093               : EQ (coding_type, Qemacs_mule)
10094               ? CHARSET_EMACS_MULE_ID (charset) < 0
10095               : 0)
10096             error ("Can't handle charset `%s'",
10097                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10098
10099           XSETCAR (tail, make_number (charset->id));
10100           if (max_charset_id < charset->id)
10101             max_charset_id = charset->id;
10102         }
10103     }
10104   ASET (attrs, coding_attr_charset_list, charset_list);
10105
10106   safe_charsets = make_uninit_string (max_charset_id + 1);
10107   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10108   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10109     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10110   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10111
10112   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10113
10114   val = args[coding_arg_decode_translation_table];
10115   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10116     CHECK_SYMBOL (val);
10117   ASET (attrs, coding_attr_decode_tbl, val);
10118
10119   val = args[coding_arg_encode_translation_table];
10120   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10121     CHECK_SYMBOL (val);
10122   ASET (attrs, coding_attr_encode_tbl, val);
10123
10124   val = args[coding_arg_post_read_conversion];
10125   CHECK_SYMBOL (val);
10126   ASET (attrs, coding_attr_post_read, val);
10127
10128   val = args[coding_arg_pre_write_conversion];
10129   CHECK_SYMBOL (val);
10130   ASET (attrs, coding_attr_pre_write, val);
10131
10132   val = args[coding_arg_default_char];
10133   if (NILP (val))
10134     ASET (attrs, coding_attr_default_char, make_number (' '));
10135   else
10136     {
10137       CHECK_CHARACTER (val);
10138       ASET (attrs, coding_attr_default_char, val);
10139     }
10140
10141   val = args[coding_arg_for_unibyte];
10142   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10143
10144   val = args[coding_arg_plist];
10145   CHECK_LIST (val);
10146   ASET (attrs, coding_attr_plist, val);
10147
10148   if (EQ (coding_type, Qcharset))
10149     {
10150       /* Generate a lisp vector of 256 elements.  Each element is nil,
10151          integer, or a list of charset IDs.
10152
10153          If Nth element is nil, the byte code N is invalid in this
10154          coding system.
10155
10156          If Nth element is a number NUM, N is the first byte of a
10157          charset whose ID is NUM.
10158
10159          If Nth element is a list of charset IDs, N is the first byte
10160          of one of them.  The list is sorted by dimensions of the
10161          charsets.  A charset of smaller dimension comes first. */
10162       val = Fmake_vector (make_number (256), Qnil);
10163
10164       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10165         {
10166           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10167           int dim = CHARSET_DIMENSION (charset);
10168           int idx = (dim - 1) * 4;
10169
10170           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10171             ASET (attrs, coding_attr_ascii_compat, Qt);
10172
10173           for (i = charset->code_space[idx];
10174                i <= charset->code_space[idx + 1]; i++)
10175             {
10176               Lisp_Object tmp, tmp2;
10177               int dim2;
10178
10179               tmp = AREF (val, i);
10180               if (NILP (tmp))
10181                 tmp = XCAR (tail);
10182               else if (NUMBERP (tmp))
10183                 {
10184                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10185                   if (dim < dim2)
10186                     tmp = list2 (XCAR (tail), tmp);
10187                   else
10188                     tmp = list2 (tmp, XCAR (tail));
10189                 }
10190               else
10191                 {
10192                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10193                     {
10194                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10195                       if (dim < dim2)
10196                         break;
10197                     }
10198                   if (NILP (tmp2))
10199                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10200                   else
10201                     {
10202                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10203                       XSETCAR (tmp2, XCAR (tail));
10204                     }
10205                 }
10206               ASET (val, i, tmp);
10207             }
10208         }
10209       ASET (attrs, coding_attr_charset_valids, val);
10210       category = coding_category_charset;
10211     }
10212   else if (EQ (coding_type, Qccl))
10213     {
10214       Lisp_Object valids;
10215
10216       if (nargs < coding_arg_ccl_max)
10217         goto short_args;
10218
10219       val = args[coding_arg_ccl_decoder];
10220       CHECK_CCL_PROGRAM (val);
10221       if (VECTORP (val))
10222         val = Fcopy_sequence (val);
10223       ASET (attrs, coding_attr_ccl_decoder, val);
10224
10225       val = args[coding_arg_ccl_encoder];
10226       CHECK_CCL_PROGRAM (val);
10227       if (VECTORP (val))
10228         val = Fcopy_sequence (val);
10229       ASET (attrs, coding_attr_ccl_encoder, val);
10230
10231       val = args[coding_arg_ccl_valids];
10232       valids = Fmake_string (make_number (256), make_number (0));
10233       for (tail = val; CONSP (tail); tail = XCDR (tail))
10234         {
10235           int from, to;
10236
10237           val = XCAR (tail);
10238           if (INTEGERP (val))
10239             {
10240               if (! (0 <= XINT (val) && XINT (val) <= 255))
10241                 args_out_of_range_3 (val, make_number (0), make_number (255));
10242               from = to = XINT (val);
10243             }
10244           else
10245             {
10246               CHECK_CONS (val);
10247               CHECK_NATNUM_CAR (val);
10248               CHECK_NUMBER_CDR (val);
10249               if (XINT (XCAR (val)) > 255)
10250                 args_out_of_range_3 (XCAR (val),
10251                                      make_number (0), make_number (255));
10252               from = XINT (XCAR (val));
10253               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10254                 args_out_of_range_3 (XCDR (val),
10255                                      XCAR (val), make_number (255));
10256               to = XINT (XCDR (val));
10257             }
10258           for (i = from; i <= to; i++)
10259             SSET (valids, i, 1);
10260         }
10261       ASET (attrs, coding_attr_ccl_valids, valids);
10262
10263       category = coding_category_ccl;
10264     }
10265   else if (EQ (coding_type, Qutf_16))
10266     {
10267       Lisp_Object bom, endian;
10268
10269       ASET (attrs, coding_attr_ascii_compat, Qnil);
10270
10271       if (nargs < coding_arg_utf16_max)
10272         goto short_args;
10273
10274       bom = args[coding_arg_utf16_bom];
10275       if (! NILP (bom) && ! EQ (bom, Qt))
10276         {
10277           CHECK_CONS (bom);
10278           val = XCAR (bom);
10279           CHECK_CODING_SYSTEM (val);
10280           val = XCDR (bom);
10281           CHECK_CODING_SYSTEM (val);
10282         }
10283       ASET (attrs, coding_attr_utf_bom, bom);
10284
10285       endian = args[coding_arg_utf16_endian];
10286       CHECK_SYMBOL (endian);
10287       if (NILP (endian))
10288         endian = Qbig;
10289       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10290         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10291       ASET (attrs, coding_attr_utf_16_endian, endian);
10292
10293       category = (CONSP (bom)
10294                   ? coding_category_utf_16_auto
10295                   : NILP (bom)
10296                   ? (EQ (endian, Qbig)
10297                      ? coding_category_utf_16_be_nosig
10298                      : coding_category_utf_16_le_nosig)
10299                   : (EQ (endian, Qbig)
10300                      ? coding_category_utf_16_be
10301                      : coding_category_utf_16_le));
10302     }
10303   else if (EQ (coding_type, Qiso_2022))
10304     {
10305       Lisp_Object initial, reg_usage, request, flags;
10306
10307       if (nargs < coding_arg_iso2022_max)
10308         goto short_args;
10309
10310       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10311       CHECK_VECTOR (initial);
10312       for (i = 0; i < 4; i++)
10313         {
10314           val = AREF (initial, i);
10315           if (! NILP (val))
10316             {
10317               struct charset *charset;
10318
10319               CHECK_CHARSET_GET_CHARSET (val, charset);
10320               ASET (initial, i, make_number (CHARSET_ID (charset)));
10321               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10322                 ASET (attrs, coding_attr_ascii_compat, Qt);
10323             }
10324           else
10325             ASET (initial, i, make_number (-1));
10326         }
10327
10328       reg_usage = args[coding_arg_iso2022_reg_usage];
10329       CHECK_CONS (reg_usage);
10330       CHECK_NUMBER_CAR (reg_usage);
10331       CHECK_NUMBER_CDR (reg_usage);
10332
10333       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10334       for (tail = request; CONSP (tail); tail = XCDR (tail))
10335         {
10336           int id;
10337           Lisp_Object tmp1;
10338
10339           val = XCAR (tail);
10340           CHECK_CONS (val);
10341           tmp1 = XCAR (val);
10342           CHECK_CHARSET_GET_ID (tmp1, id);
10343           CHECK_NATNUM_CDR (val);
10344           if (XINT (XCDR (val)) >= 4)
10345             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10346           XSETCAR (val, make_number (id));
10347         }
10348
10349       flags = args[coding_arg_iso2022_flags];
10350       CHECK_NATNUM (flags);
10351       i = XINT (flags) & INT_MAX;
10352       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10353         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10354       flags = make_number (i);
10355
10356       ASET (attrs, coding_attr_iso_initial, initial);
10357       ASET (attrs, coding_attr_iso_usage, reg_usage);
10358       ASET (attrs, coding_attr_iso_request, request);
10359       ASET (attrs, coding_attr_iso_flags, flags);
10360       setup_iso_safe_charsets (attrs);
10361
10362       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10363         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10364                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10365                     ? coding_category_iso_7_else
10366                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10367                     ? coding_category_iso_7
10368                     : coding_category_iso_7_tight);
10369       else
10370         {
10371           int id = XINT (AREF (initial, 1));
10372
10373           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10374                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10375                        || id < 0)
10376                       ? coding_category_iso_8_else
10377                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10378                       ? coding_category_iso_8_1
10379                       : coding_category_iso_8_2);
10380         }
10381       if (category != coding_category_iso_8_1
10382           && category != coding_category_iso_8_2)
10383         ASET (attrs, coding_attr_ascii_compat, Qnil);
10384     }
10385   else if (EQ (coding_type, Qemacs_mule))
10386     {
10387       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10388         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10389       ASET (attrs, coding_attr_ascii_compat, Qt);
10390       category = coding_category_emacs_mule;
10391     }
10392   else if (EQ (coding_type, Qshift_jis))
10393     {
10394
10395       struct charset *charset;
10396
10397       if (XINT (Flength (charset_list)) != 3
10398           && XINT (Flength (charset_list)) != 4)
10399         error ("There should be three or four charsets");
10400
10401       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10402       if (CHARSET_DIMENSION (charset) != 1)
10403         error ("Dimension of charset %s is not one",
10404                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10405       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10406         ASET (attrs, coding_attr_ascii_compat, Qt);
10407
10408       charset_list = XCDR (charset_list);
10409       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10410       if (CHARSET_DIMENSION (charset) != 1)
10411         error ("Dimension of charset %s is not one",
10412                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10413
10414       charset_list = XCDR (charset_list);
10415       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10416       if (CHARSET_DIMENSION (charset) != 2)
10417         error ("Dimension of charset %s is not two",
10418                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10419
10420       charset_list = XCDR (charset_list);
10421       if (! NILP (charset_list))
10422         {
10423           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10424           if (CHARSET_DIMENSION (charset) != 2)
10425             error ("Dimension of charset %s is not two",
10426                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10427         }
10428
10429       category = coding_category_sjis;
10430       Vsjis_coding_system = name;
10431     }
10432   else if (EQ (coding_type, Qbig5))
10433     {
10434       struct charset *charset;
10435
10436       if (XINT (Flength (charset_list)) != 2)
10437         error ("There should be just two charsets");
10438
10439       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10440       if (CHARSET_DIMENSION (charset) != 1)
10441         error ("Dimension of charset %s is not one",
10442                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10443       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10444         ASET (attrs, coding_attr_ascii_compat, Qt);
10445
10446       charset_list = XCDR (charset_list);
10447       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10448       if (CHARSET_DIMENSION (charset) != 2)
10449         error ("Dimension of charset %s is not two",
10450                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10451
10452       category = coding_category_big5;
10453       Vbig5_coding_system = name;
10454     }
10455   else if (EQ (coding_type, Qraw_text))
10456     {
10457       category = coding_category_raw_text;
10458       ASET (attrs, coding_attr_ascii_compat, Qt);
10459     }
10460   else if (EQ (coding_type, Qutf_8))
10461     {
10462       Lisp_Object bom;
10463
10464       if (nargs < coding_arg_utf8_max)
10465         goto short_args;
10466
10467       bom = args[coding_arg_utf8_bom];
10468       if (! NILP (bom) && ! EQ (bom, Qt))
10469         {
10470           CHECK_CONS (bom);
10471           val = XCAR (bom);
10472           CHECK_CODING_SYSTEM (val);
10473           val = XCDR (bom);
10474           CHECK_CODING_SYSTEM (val);
10475         }
10476       ASET (attrs, coding_attr_utf_bom, bom);
10477       if (NILP (bom))
10478         ASET (attrs, coding_attr_ascii_compat, Qt);
10479
10480       category = (CONSP (bom) ? coding_category_utf_8_auto
10481                   : NILP (bom) ? coding_category_utf_8_nosig
10482                   : coding_category_utf_8_sig);
10483     }
10484   else if (EQ (coding_type, Qundecided))
10485     {
10486       if (nargs < coding_arg_undecided_max)
10487         goto short_args;
10488       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10489             args[coding_arg_undecided_inhibit_null_byte_detection]);
10490       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10491             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10492       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10493             args[coding_arg_undecided_prefer_utf_8]);
10494       category = coding_category_undecided;
10495     }
10496   else
10497     error ("Invalid coding system type: %s",
10498            SDATA (SYMBOL_NAME (coding_type)));
10499
10500   ASET (attrs, coding_attr_category, make_number (category));
10501   ASET (attrs, coding_attr_plist,
10502         Fcons (QCcategory,
10503                Fcons (AREF (Vcoding_category_table, category),
10504                       CODING_ATTR_PLIST (attrs))));
10505   ASET (attrs, coding_attr_plist,
10506         Fcons (QCascii_compatible_p,
10507                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10508                       CODING_ATTR_PLIST (attrs))));
10509
10510   eol_type = args[coding_arg_eol_type];
10511   if (! NILP (eol_type)
10512       && ! EQ (eol_type, Qunix)
10513       && ! EQ (eol_type, Qdos)
10514       && ! EQ (eol_type, Qmac))
10515     error ("Invalid eol-type");
10516
10517   aliases = list1 (name);
10518
10519   if (NILP (eol_type))
10520     {
10521       eol_type = make_subsidiaries (name);
10522       for (i = 0; i < 3; i++)
10523         {
10524           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10525
10526           this_name = AREF (eol_type, i);
10527           this_aliases = list1 (this_name);
10528           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10529           this_spec = make_uninit_vector (3);
10530           ASET (this_spec, 0, attrs);
10531           ASET (this_spec, 1, this_aliases);
10532           ASET (this_spec, 2, this_eol_type);
10533           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10534           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10535           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10536           if (NILP (val))
10537             Vcoding_system_alist
10538               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10539                        Vcoding_system_alist);
10540         }
10541     }
10542
10543   spec_vec = make_uninit_vector (3);
10544   ASET (spec_vec, 0, attrs);
10545   ASET (spec_vec, 1, aliases);
10546   ASET (spec_vec, 2, eol_type);
10547
10548   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10549   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10550   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10551   if (NILP (val))
10552     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10553                                   Vcoding_system_alist);
10554
10555   {
10556     int id = coding_categories[category].id;
10557
10558     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10559       setup_coding_system (name, &coding_categories[category]);
10560   }
10561
10562   return Qnil;
10563
10564  short_args:
10565   return Fsignal (Qwrong_number_of_arguments,
10566                   Fcons (intern ("define-coding-system-internal"),
10567                          make_number (nargs)));
10568 }
10569
10570
10571 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10572        3, 3, 0,
10573        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10574   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10575 {
10576   Lisp_Object spec, attrs;
10577
10578   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10579   attrs = AREF (spec, 0);
10580   if (EQ (prop, QCmnemonic))
10581     {
10582       if (! STRINGP (val))
10583         CHECK_CHARACTER (val);
10584       ASET (attrs, coding_attr_mnemonic, val);
10585     }
10586   else if (EQ (prop, QCdefault_char))
10587     {
10588       if (NILP (val))
10589         val = make_number (' ');
10590       else
10591         CHECK_CHARACTER (val);
10592       ASET (attrs, coding_attr_default_char, val);
10593     }
10594   else if (EQ (prop, QCdecode_translation_table))
10595     {
10596       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10597         CHECK_SYMBOL (val);
10598       ASET (attrs, coding_attr_decode_tbl, val);
10599     }
10600   else if (EQ (prop, QCencode_translation_table))
10601     {
10602       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10603         CHECK_SYMBOL (val);
10604       ASET (attrs, coding_attr_encode_tbl, val);
10605     }
10606   else if (EQ (prop, QCpost_read_conversion))
10607     {
10608       CHECK_SYMBOL (val);
10609       ASET (attrs, coding_attr_post_read, val);
10610     }
10611   else if (EQ (prop, QCpre_write_conversion))
10612     {
10613       CHECK_SYMBOL (val);
10614       ASET (attrs, coding_attr_pre_write, val);
10615     }
10616   else if (EQ (prop, QCascii_compatible_p))
10617     {
10618       ASET (attrs, coding_attr_ascii_compat, val);
10619     }
10620
10621   ASET (attrs, coding_attr_plist,
10622         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10623   return val;
10624 }
10625
10626
10627 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10628        Sdefine_coding_system_alias, 2, 2, 0,
10629        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10630   (Lisp_Object alias, Lisp_Object coding_system)
10631 {
10632   Lisp_Object spec, aliases, eol_type, val;
10633
10634   CHECK_SYMBOL (alias);
10635   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10636   aliases = AREF (spec, 1);
10637   /* ALIASES should be a list of length more than zero, and the first
10638      element is a base coding system.  Append ALIAS at the tail of the
10639      list.  */
10640   while (!NILP (XCDR (aliases)))
10641     aliases = XCDR (aliases);
10642   XSETCDR (aliases, list1 (alias));
10643
10644   eol_type = AREF (spec, 2);
10645   if (VECTORP (eol_type))
10646     {
10647       Lisp_Object subsidiaries;
10648       int i;
10649
10650       subsidiaries = make_subsidiaries (alias);
10651       for (i = 0; i < 3; i++)
10652         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10653                                      AREF (eol_type, i));
10654     }
10655
10656   Fputhash (alias, spec, Vcoding_system_hash_table);
10657   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10658   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10659   if (NILP (val))
10660     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10661                                   Vcoding_system_alist);
10662
10663   return Qnil;
10664 }
10665
10666 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10667        1, 1, 0,
10668        doc: /* Return the base of CODING-SYSTEM.
10669 Any alias or subsidiary coding system is not a base coding system.  */)
10670   (Lisp_Object coding_system)
10671 {
10672   Lisp_Object spec, attrs;
10673
10674   if (NILP (coding_system))
10675     return (Qno_conversion);
10676   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10677   attrs = AREF (spec, 0);
10678   return CODING_ATTR_BASE_NAME (attrs);
10679 }
10680
10681 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10682        1, 1, 0,
10683        doc: /* Return the property list of CODING-SYSTEM.  */)
10684   (Lisp_Object coding_system)
10685 {
10686   Lisp_Object spec, attrs;
10687
10688   if (NILP (coding_system))
10689     coding_system = Qno_conversion;
10690   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10691   attrs = AREF (spec, 0);
10692   return CODING_ATTR_PLIST (attrs);
10693 }
10694
10695
10696 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10697        1, 1, 0,
10698        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10699   (Lisp_Object coding_system)
10700 {
10701   Lisp_Object spec;
10702
10703   if (NILP (coding_system))
10704     coding_system = Qno_conversion;
10705   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10706   return AREF (spec, 1);
10707 }
10708
10709 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10710        Scoding_system_eol_type, 1, 1, 0,
10711        doc: /* Return eol-type of CODING-SYSTEM.
10712 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10713
10714 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10715 and CR respectively.
10716
10717 A vector value indicates that a format of end-of-line should be
10718 detected automatically.  Nth element of the vector is the subsidiary
10719 coding system whose eol-type is N.  */)
10720   (Lisp_Object coding_system)
10721 {
10722   Lisp_Object spec, eol_type;
10723   int n;
10724
10725   if (NILP (coding_system))
10726     coding_system = Qno_conversion;
10727   if (! CODING_SYSTEM_P (coding_system))
10728     return Qnil;
10729   spec = CODING_SYSTEM_SPEC (coding_system);
10730   eol_type = AREF (spec, 2);
10731   if (VECTORP (eol_type))
10732     return Fcopy_sequence (eol_type);
10733   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10734   return make_number (n);
10735 }
10736
10737 #endif /* emacs */
10738
10739 \f
10740 /*** 12. Postamble ***/
10741
10742 void
10743 init_coding_once (void)
10744 {
10745   int i;
10746
10747   for (i = 0; i < coding_category_max; i++)
10748     {
10749       coding_categories[i].id = -1;
10750       coding_priorities[i] = i;
10751     }
10752
10753   /* ISO2022 specific initialize routine.  */
10754   for (i = 0; i < 0x20; i++)
10755     iso_code_class[i] = ISO_control_0;
10756   for (i = 0x21; i < 0x7F; i++)
10757     iso_code_class[i] = ISO_graphic_plane_0;
10758   for (i = 0x80; i < 0xA0; i++)
10759     iso_code_class[i] = ISO_control_1;
10760   for (i = 0xA1; i < 0xFF; i++)
10761     iso_code_class[i] = ISO_graphic_plane_1;
10762   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10763   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10764   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10765   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10766   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10767   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10768   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10769   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10770   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10771
10772   for (i = 0; i < 256; i++)
10773     {
10774       emacs_mule_bytes[i] = 1;
10775     }
10776   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10777   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10778   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10779   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10780 }
10781
10782 #ifdef emacs
10783
10784 void
10785 syms_of_coding (void)
10786 {
10787   staticpro (&Vcoding_system_hash_table);
10788   Vcoding_system_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
10789
10790   staticpro (&Vsjis_coding_system);
10791   Vsjis_coding_system = Qnil;
10792
10793   staticpro (&Vbig5_coding_system);
10794   Vbig5_coding_system = Qnil;
10795
10796   staticpro (&Vcode_conversion_reused_workbuf);
10797   Vcode_conversion_reused_workbuf = Qnil;
10798
10799   staticpro (&Vcode_conversion_workbuf_name);
10800   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10801
10802   reused_workbuf_in_use = 0;
10803
10804   DEFSYM (Qcharset, "charset");
10805   DEFSYM (Qtarget_idx, "target-idx");
10806   DEFSYM (Qcoding_system_history, "coding-system-history");
10807   Fset (Qcoding_system_history, Qnil);
10808
10809   /* Target FILENAME is the first argument.  */
10810   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10811   /* Target FILENAME is the third argument.  */
10812   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10813
10814   DEFSYM (Qcall_process, "call-process");
10815   /* Target PROGRAM is the first argument.  */
10816   Fput (Qcall_process, Qtarget_idx, make_number (0));
10817
10818   DEFSYM (Qcall_process_region, "call-process-region");
10819   /* Target PROGRAM is the third argument.  */
10820   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10821
10822   DEFSYM (Qstart_process, "start-process");
10823   /* Target PROGRAM is the third argument.  */
10824   Fput (Qstart_process, Qtarget_idx, make_number (2));
10825
10826   DEFSYM (Qopen_network_stream, "open-network-stream");
10827   /* Target SERVICE is the fourth argument.  */
10828   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10829
10830   DEFSYM (Qunix, "unix");
10831   DEFSYM (Qdos, "dos");
10832   DEFSYM (Qmac, "mac");
10833
10834   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10835   DEFSYM (Qundecided, "undecided");
10836   DEFSYM (Qno_conversion, "no-conversion");
10837   DEFSYM (Qraw_text, "raw-text");
10838
10839   DEFSYM (Qiso_2022, "iso-2022");
10840
10841   DEFSYM (Qutf_8, "utf-8");
10842   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10843
10844 #if defined (WINDOWSNT) || defined (CYGWIN)
10845   /* No, not utf-16-le: that one has a BOM.  */
10846   DEFSYM (Qutf_16le, "utf-16le");
10847 #endif
10848
10849   DEFSYM (Qutf_16, "utf-16");
10850   DEFSYM (Qbig, "big");
10851   DEFSYM (Qlittle, "little");
10852
10853   DEFSYM (Qshift_jis, "shift-jis");
10854   DEFSYM (Qbig5, "big5");
10855
10856   DEFSYM (Qcoding_system_p, "coding-system-p");
10857
10858   /* Error signaled when there's a problem with detecting a coding system.  */
10859   DEFSYM (Qcoding_system_error, "coding-system-error");
10860   Fput (Qcoding_system_error, Qerror_conditions,
10861         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10862   Fput (Qcoding_system_error, Qerror_message,
10863         build_pure_c_string ("Invalid coding system"));
10864
10865   DEFSYM (Qtranslation_table, "translation-table");
10866   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10867   DEFSYM (Qtranslation_table_id, "translation-table-id");
10868
10869   /* Coding system emacs-mule and raw-text are for converting only
10870      end-of-line format.  */
10871   DEFSYM (Qemacs_mule, "emacs-mule");
10872
10873   DEFSYM (QCcategory, ":category");
10874   DEFSYM (QCmnemonic, ":mnemonic");
10875   DEFSYM (QCdefault_char, ":default-char");
10876   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10877   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10878   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10879   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10880   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10881
10882   Vcoding_category_table
10883     = Fmake_vector (make_number (coding_category_max), Qnil);
10884   staticpro (&Vcoding_category_table);
10885   /* Followings are target of code detection.  */
10886   ASET (Vcoding_category_table, coding_category_iso_7,
10887         intern_c_string ("coding-category-iso-7"));
10888   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10889         intern_c_string ("coding-category-iso-7-tight"));
10890   ASET (Vcoding_category_table, coding_category_iso_8_1,
10891         intern_c_string ("coding-category-iso-8-1"));
10892   ASET (Vcoding_category_table, coding_category_iso_8_2,
10893         intern_c_string ("coding-category-iso-8-2"));
10894   ASET (Vcoding_category_table, coding_category_iso_7_else,
10895         intern_c_string ("coding-category-iso-7-else"));
10896   ASET (Vcoding_category_table, coding_category_iso_8_else,
10897         intern_c_string ("coding-category-iso-8-else"));
10898   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10899         intern_c_string ("coding-category-utf-8-auto"));
10900   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10901         intern_c_string ("coding-category-utf-8"));
10902   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10903         intern_c_string ("coding-category-utf-8-sig"));
10904   ASET (Vcoding_category_table, coding_category_utf_16_be,
10905         intern_c_string ("coding-category-utf-16-be"));
10906   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10907         intern_c_string ("coding-category-utf-16-auto"));
10908   ASET (Vcoding_category_table, coding_category_utf_16_le,
10909         intern_c_string ("coding-category-utf-16-le"));
10910   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10911         intern_c_string ("coding-category-utf-16-be-nosig"));
10912   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10913         intern_c_string ("coding-category-utf-16-le-nosig"));
10914   ASET (Vcoding_category_table, coding_category_charset,
10915         intern_c_string ("coding-category-charset"));
10916   ASET (Vcoding_category_table, coding_category_sjis,
10917         intern_c_string ("coding-category-sjis"));
10918   ASET (Vcoding_category_table, coding_category_big5,
10919         intern_c_string ("coding-category-big5"));
10920   ASET (Vcoding_category_table, coding_category_ccl,
10921         intern_c_string ("coding-category-ccl"));
10922   ASET (Vcoding_category_table, coding_category_emacs_mule,
10923         intern_c_string ("coding-category-emacs-mule"));
10924   /* Followings are NOT target of code detection.  */
10925   ASET (Vcoding_category_table, coding_category_raw_text,
10926         intern_c_string ("coding-category-raw-text"));
10927   ASET (Vcoding_category_table, coding_category_undecided,
10928         intern_c_string ("coding-category-undecided"));
10929
10930   DEFSYM (Qinsufficient_source, "insufficient-source");
10931   DEFSYM (Qinvalid_source, "invalid-source");
10932   DEFSYM (Qinterrupted, "interrupted");
10933
10934   /* If a symbol has this property, evaluate the value to define the
10935      symbol as a coding system.  */
10936   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10937
10938   defsubr (&Scoding_system_p);
10939   defsubr (&Sread_coding_system);
10940   defsubr (&Sread_non_nil_coding_system);
10941   defsubr (&Scheck_coding_system);
10942   defsubr (&Sdetect_coding_region);
10943   defsubr (&Sdetect_coding_string);
10944   defsubr (&Sfind_coding_systems_region_internal);
10945   defsubr (&Sunencodable_char_position);
10946   defsubr (&Scheck_coding_systems_region);
10947   defsubr (&Sdecode_coding_region);
10948   defsubr (&Sencode_coding_region);
10949   defsubr (&Sdecode_coding_string);
10950   defsubr (&Sencode_coding_string);
10951   defsubr (&Sdecode_sjis_char);
10952   defsubr (&Sencode_sjis_char);
10953   defsubr (&Sdecode_big5_char);
10954   defsubr (&Sencode_big5_char);
10955   defsubr (&Sset_terminal_coding_system_internal);
10956   defsubr (&Sset_safe_terminal_coding_system_internal);
10957   defsubr (&Sterminal_coding_system);
10958   defsubr (&Sset_keyboard_coding_system_internal);
10959   defsubr (&Skeyboard_coding_system);
10960   defsubr (&Sfind_operation_coding_system);
10961   defsubr (&Sset_coding_system_priority);
10962   defsubr (&Sdefine_coding_system_internal);
10963   defsubr (&Sdefine_coding_system_alias);
10964   defsubr (&Scoding_system_put);
10965   defsubr (&Scoding_system_base);
10966   defsubr (&Scoding_system_plist);
10967   defsubr (&Scoding_system_aliases);
10968   defsubr (&Scoding_system_eol_type);
10969   defsubr (&Scoding_system_priority_list);
10970
10971   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10972                doc: /* List of coding systems.
10973
10974 Do not alter the value of this variable manually.  This variable should be
10975 updated by the functions `define-coding-system' and
10976 `define-coding-system-alias'.  */);
10977   Vcoding_system_list = Qnil;
10978
10979   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10980                doc: /* Alist of coding system names.
10981 Each element is one element list of coding system name.
10982 This variable is given to `completing-read' as COLLECTION argument.
10983
10984 Do not alter the value of this variable manually.  This variable should be
10985 updated by the functions `make-coding-system' and
10986 `define-coding-system-alias'.  */);
10987   Vcoding_system_alist = Qnil;
10988
10989   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10990                doc: /* List of coding-categories (symbols) ordered by priority.
10991
10992 On detecting a coding system, Emacs tries code detection algorithms
10993 associated with each coding-category one by one in this order.  When
10994 one algorithm agrees with a byte sequence of source text, the coding
10995 system bound to the corresponding coding-category is selected.
10996
10997 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10998   {
10999     int i;
11000
11001     Vcoding_category_list = Qnil;
11002     for (i = coding_category_max - 1; i >= 0; i--)
11003       Vcoding_category_list
11004         = Fcons (AREF (Vcoding_category_table, i),
11005                  Vcoding_category_list);
11006   }
11007
11008   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11009                doc: /* Specify the coding system for read operations.
11010 It is useful to bind this variable with `let', but do not set it globally.
11011 If the value is a coding system, it is used for decoding on read operation.
11012 If not, an appropriate element is used from one of the coding system alists.
11013 There are three such tables: `file-coding-system-alist',
11014 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11015   Vcoding_system_for_read = Qnil;
11016
11017   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11018                doc: /* Specify the coding system for write operations.
11019 Programs bind this variable with `let', but you should not set it globally.
11020 If the value is a coding system, it is used for encoding of output,
11021 when writing it to a file and when sending it to a file or subprocess.
11022
11023 If this does not specify a coding system, an appropriate element
11024 is used from one of the coding system alists.
11025 There are three such tables: `file-coding-system-alist',
11026 `process-coding-system-alist', and `network-coding-system-alist'.
11027 For output to files, if the above procedure does not specify a coding system,
11028 the value of `buffer-file-coding-system' is used.  */);
11029   Vcoding_system_for_write = Qnil;
11030
11031   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11032                doc: /*
11033 Coding system used in the latest file or process I/O.  */);
11034   Vlast_coding_system_used = Qnil;
11035
11036   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11037                doc: /*
11038 Error status of the last code conversion.
11039
11040 When an error was detected in the last code conversion, this variable
11041 is set to one of the following symbols.
11042   `insufficient-source'
11043   `inconsistent-eol'
11044   `invalid-source'
11045   `interrupted'
11046   `insufficient-memory'
11047 When no error was detected, the value doesn't change.  So, to check
11048 the error status of a code conversion by this variable, you must
11049 explicitly set this variable to nil before performing code
11050 conversion.  */);
11051   Vlast_code_conversion_error = Qnil;
11052
11053   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11054                doc: /*
11055 Non-nil means always inhibit code conversion of end-of-line format.
11056 See info node `Coding Systems' and info node `Text and Binary' concerning
11057 such conversion.  */);
11058   inhibit_eol_conversion = 0;
11059
11060   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11061                doc: /*
11062 Non-nil means process buffer inherits coding system of process output.
11063 Bind it to t if the process output is to be treated as if it were a file
11064 read from some filesystem.  */);
11065   inherit_process_coding_system = 0;
11066
11067   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11068                doc: /*
11069 Alist to decide a coding system to use for a file I/O operation.
11070 The format is ((PATTERN . VAL) ...),
11071 where PATTERN is a regular expression matching a file name,
11072 VAL is a coding system, a cons of coding systems, or a function symbol.
11073 If VAL is a coding system, it is used for both decoding and encoding
11074 the file contents.
11075 If VAL is a cons of coding systems, the car part is used for decoding,
11076 and the cdr part is used for encoding.
11077 If VAL is a function symbol, the function must return a coding system
11078 or a cons of coding systems which are used as above.  The function is
11079 called with an argument that is a list of the arguments with which
11080 `find-operation-coding-system' was called.  If the function can't decide
11081 a coding system, it can return `undecided' so that the normal
11082 code-detection is performed.
11083
11084 See also the function `find-operation-coding-system'
11085 and the variable `auto-coding-alist'.  */);
11086   Vfile_coding_system_alist = Qnil;
11087
11088   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11089                doc: /*
11090 Alist to decide a coding system to use for a process I/O operation.
11091 The format is ((PATTERN . VAL) ...),
11092 where PATTERN is a regular expression matching a program name,
11093 VAL is a coding system, a cons of coding systems, or a function symbol.
11094 If VAL is a coding system, it is used for both decoding what received
11095 from the program and encoding what sent to the program.
11096 If VAL is a cons of coding systems, the car part is used for decoding,
11097 and the cdr part is used for encoding.
11098 If VAL is a function symbol, the function must return a coding system
11099 or a cons of coding systems which are used as above.
11100
11101 See also the function `find-operation-coding-system'.  */);
11102   Vprocess_coding_system_alist = Qnil;
11103
11104   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11105                doc: /*
11106 Alist to decide a coding system to use for a network I/O operation.
11107 The format is ((PATTERN . VAL) ...),
11108 where PATTERN is a regular expression matching a network service name
11109 or is a port number to connect to,
11110 VAL is a coding system, a cons of coding systems, or a function symbol.
11111 If VAL is a coding system, it is used for both decoding what received
11112 from the network stream and encoding what sent to the network stream.
11113 If VAL is a cons of coding systems, the car part is used for decoding,
11114 and the cdr part is used for encoding.
11115 If VAL is a function symbol, the function must return a coding system
11116 or a cons of coding systems which are used as above.
11117
11118 See also the function `find-operation-coding-system'.  */);
11119   Vnetwork_coding_system_alist = Qnil;
11120
11121   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11122                doc: /* Coding system to use with system messages.
11123 Also used for decoding keyboard input on X Window system, and for
11124 encoding standard output and error streams.  */);
11125   Vlocale_coding_system = Qnil;
11126
11127   /* The eol mnemonics are reset in startup.el system-dependently.  */
11128   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11129                doc: /*
11130 String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11131   eol_mnemonic_unix = build_pure_c_string (":");
11132
11133   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11134                doc: /*
11135 String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11136   eol_mnemonic_dos = build_pure_c_string ("\\");
11137
11138   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11139                doc: /*
11140 String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11141   eol_mnemonic_mac = build_pure_c_string ("/");
11142
11143   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11144                doc: /*
11145 String displayed in mode line when end-of-line format is not yet determined.  */);
11146   eol_mnemonic_undecided = build_pure_c_string (":");
11147
11148   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11149                doc: /*
11150 Non-nil enables character translation while encoding and decoding.  */);
11151   Venable_character_translation = Qt;
11152
11153   DEFVAR_LISP ("standard-translation-table-for-decode",
11154                Vstandard_translation_table_for_decode,
11155                doc: /* Table for translating characters while decoding.  */);
11156   Vstandard_translation_table_for_decode = Qnil;
11157
11158   DEFVAR_LISP ("standard-translation-table-for-encode",
11159                Vstandard_translation_table_for_encode,
11160                doc: /* Table for translating characters while encoding.  */);
11161   Vstandard_translation_table_for_encode = Qnil;
11162
11163   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11164                doc: /* Alist of charsets vs revision numbers.
11165 While encoding, if a charset (car part of an element) is found,
11166 designate it with the escape sequence identifying revision (cdr part
11167 of the element).  */);
11168   Vcharset_revision_table = Qnil;
11169
11170   DEFVAR_LISP ("default-process-coding-system",
11171                Vdefault_process_coding_system,
11172                doc: /* Cons of coding systems used for process I/O by default.
11173 The car part is used for decoding a process output,
11174 the cdr part is used for encoding a text to be sent to a process.  */);
11175   Vdefault_process_coding_system = Qnil;
11176
11177   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11178                doc: /*
11179 Table of extra Latin codes in the range 128..159 (inclusive).
11180 This is a vector of length 256.
11181 If Nth element is non-nil, the existence of code N in a file
11182 (or output of subprocess) doesn't prevent it to be detected as
11183 a coding system of ISO 2022 variant which has a flag
11184 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11185 or reading output of a subprocess.
11186 Only 128th through 159th elements have a meaning.  */);
11187   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11188
11189   DEFVAR_LISP ("select-safe-coding-system-function",
11190                Vselect_safe_coding_system_function,
11191                doc: /*
11192 Function to call to select safe coding system for encoding a text.
11193
11194 If set, this function is called to force a user to select a proper
11195 coding system which can encode the text in the case that a default
11196 coding system used in each operation can't encode the text.  The
11197 function should take care that the buffer is not modified while
11198 the coding system is being selected.
11199
11200 The default value is `select-safe-coding-system' (which see).  */);
11201   Vselect_safe_coding_system_function = Qnil;
11202
11203   DEFVAR_BOOL ("coding-system-require-warning",
11204                coding_system_require_warning,
11205                doc: /* Internal use only.
11206 If non-nil, on writing a file, `select-safe-coding-system-function' is
11207 called even if `coding-system-for-write' is non-nil.  The command
11208 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11209   coding_system_require_warning = 0;
11210
11211
11212   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11213                inhibit_iso_escape_detection,
11214                doc: /*
11215 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11216
11217 When Emacs reads text, it tries to detect how the text is encoded.
11218 This code detection is sensitive to escape sequences.  If Emacs sees
11219 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11220 of the ISO2022 encodings, and decodes text by the corresponding coding
11221 system (e.g. `iso-2022-7bit').
11222
11223 However, there may be a case that you want to read escape sequences in
11224 a file as is.  In such a case, you can set this variable to non-nil.
11225 Then the code detection will ignore any escape sequences, and no text is
11226 detected as encoded in some ISO-2022 encoding.  The result is that all
11227 escape sequences become visible in a buffer.
11228
11229 The default value is nil, and it is strongly recommended not to change
11230 it.  That is because many Emacs Lisp source files that contain
11231 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11232 in Emacs's distribution, and they won't be decoded correctly on
11233 reading if you suppress escape sequence detection.
11234
11235 The other way to read escape sequences in a file without decoding is
11236 to explicitly specify some coding system that doesn't use ISO-2022
11237 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11238   inhibit_iso_escape_detection = 0;
11239
11240   DEFVAR_BOOL ("inhibit-null-byte-detection",
11241                inhibit_null_byte_detection,
11242                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11243 By default, Emacs treats it as binary data, and does not attempt to
11244 decode it.  The effect is as if you specified `no-conversion' for
11245 reading that text.
11246
11247 Set this to non-nil when a regular text happens to include null bytes.
11248 Examples are Index nodes of Info files and null-byte delimited output
11249 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11250 decode text as usual.  */);
11251   inhibit_null_byte_detection = 0;
11252
11253   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11254                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11255 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11256   disable_ascii_optimization = 0;
11257
11258   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11259                doc: /* Char table for translating self-inserting characters.
11260 This is applied to the result of input methods, not their input.
11261 See also `keyboard-translate-table'.
11262
11263 Use of this variable for character code unification was rendered
11264 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11265 internal character representation.  */);
11266   Vtranslation_table_for_input = Qnil;
11267
11268   Lisp_Object args[coding_arg_undecided_max];
11269   memclear (args, sizeof args);
11270
11271   Lisp_Object plist[] =
11272     {
11273       QCname,
11274       args[coding_arg_name] = Qno_conversion,
11275       QCmnemonic,
11276       args[coding_arg_mnemonic] = make_number ('='),
11277       intern_c_string (":coding-type"),
11278       args[coding_arg_coding_type] = Qraw_text,
11279       QCascii_compatible_p,
11280       args[coding_arg_ascii_compatible_p] = Qt,
11281       QCdefault_char,
11282       args[coding_arg_default_char] = make_number (0),
11283       intern_c_string (":for-unibyte"),
11284       args[coding_arg_for_unibyte] = Qt,
11285       intern_c_string (":docstring"),
11286       (build_pure_c_string
11287        ("Do no conversion.\n"
11288         "\n"
11289         "When you visit a file with this coding, the file is read into a\n"
11290         "unibyte buffer as is, thus each byte of a file is treated as a\n"
11291         "character.")),
11292       intern_c_string (":eol-type"),
11293       args[coding_arg_eol_type] = Qunix,
11294     };
11295   args[coding_arg_plist] = CALLMANY (Flist, plist);
11296   Fdefine_coding_system_internal (coding_arg_max, args);
11297
11298   plist[1] = args[coding_arg_name] = Qundecided;
11299   plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11300   plist[5] = args[coding_arg_coding_type] = Qundecided;
11301   /* This is already set.
11302      plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11303   plist[8] = intern_c_string (":charset-list");
11304   plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11305   plist[11] = args[coding_arg_for_unibyte] = Qnil;
11306   plist[13] = build_pure_c_string ("No conversion on encoding, "
11307                                    "automatic conversion on decoding.");
11308   plist[15] = args[coding_arg_eol_type] = Qnil;
11309   args[coding_arg_plist] = CALLMANY (Flist, plist);
11310   args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11311   args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11312   Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11313
11314   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11315
11316   for (int i = 0; i < coding_category_max; i++)
11317     Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11318
11319 #if defined (DOS_NT)
11320   system_eol_type = Qdos;
11321 #else
11322   system_eol_type = Qunix;
11323 #endif
11324   staticpro (&system_eol_type);
11325 }
11326
11327 char *
11328 emacs_strerror (int error_number)
11329 {
11330   char *str;
11331
11332   synchronize_system_messages_locale ();
11333   str = strerror (error_number);
11334
11335   if (! NILP (Vlocale_coding_system))
11336     {
11337       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11338                                                       Vlocale_coding_system,
11339                                                       0);
11340       str = SSDATA (dec);
11341     }
11342
11343   return str;
11344 }
11345
11346 #endif /* emacs */