code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010, 2011
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (struct coding_system *coding,
 158                    struct coding_detection_info *detect_info)
 159 {
       /* Template: scan the source of CODING byte by byte.  Return 1 and
          merge FOUND into DETECT_INFO->found when the source is exhausted
          without meeting a non-conforming byte; otherwise add
          CATEGORY_MASK_XXX to DETECT_INFO->rejected and return 0.  */
 160   const unsigned char *src = coding->source;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
       /* The remaining declarations go here; ONE_MORE_BYTE also needs C
          and SRC_BASE.  */
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
           /* Remember the category only on positive evidence for XXX; a
              merely conforming byte must leave FOUND untouched.  */
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (struct coding_system *coding)
 206 {
       /* Template: decode the not-yet-consumed part of CODING's source
          into CODING->charbuf, then record the consumed and produced
          amounts in CODING.  C and CONSUMED_CHARS, which ONE_MORE_BYTE
          uses, are left out of this sketch.  */
 207   const unsigned char *src = coding->source + coding->consumed;
 208   const unsigned char *src_end = coding->source + coding->src_bytes;
 209   /* SRC_BASE remembers the start position in source in each loop.
 210      The loop will be exited when there's not enough source code, or
 211      when there's no room in CHARBUF for a decoded character.  */
 212   const unsigned char *src_base;
 213   /* A buffer to produce decoded characters.  */
 214   int *charbuf = coding->charbuf + coding->charbuf_used;
 215   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 216   int multibytep = coding->src_multibyte;
 217
 218   while (1)
 219     {
 220       src_base = src;
 221       if (charbuf >= charbuf_end)
 222         /* No more room to produce a decoded character.  */
 223         break;
 224       ONE_MORE_BYTE (c);
 225       /* Decode it. */
 226     }
 227
 228  no_more_source:
 229   if (src_base < src_end
 230       && coding->mode & CODING_MODE_LAST_BLOCK)
 231     /* If the source ends by partial bytes to construct a character,
 232        treat them as eight-bit raw data.  */
 233     while (src_base < src_end && charbuf < charbuf_end)
 234       *charbuf++ = *src_base++;
 235   /* Remember how many bytes and characters we consumed.  If the
 236      source is multibyte, the bytes and chars are not identical.  */
 237   coding->consumed = coding->consumed_char = src_base - coding->source;
 238   /* Remember how many characters we produced.  */
 239   coding->charbuf_used = charbuf - coding->charbuf;
 240 }
 241 #endif
 242
 243 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 244
 245   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 246   internal multibyte format by CODING.  The resulting byte sequence
 247   goes to a place pointed to by DESTINATION, the length of which
 248   should not exceed DST_BYTES.
 249
 250   These functions set the information of original and encoded texts in
 251   the members produced, produced_char, consumed, and consumed_char of
 252   the structure *CODING.  They also set the member result to one of
 253   CODING_RESULT_XXX indicating how the encoding finished.
 254
 255   DST_BYTES zero means that source area and destination area are
 256   overlapped, which means that we can produce an encoded text until it
 257   reaches the head of the not-yet-encoded source text.
 258
 259   Below is a template of these functions.  */
 260 #if 0
 261 static void
 262 encode_coding_XXX (struct coding_system *coding)
 263 {
       /* Template: encode the characters held in CODING->charbuf into
          CODING->destination, stopping early when fewer than
          _MAX_BYTES_PRODUCED_IN_LOOP_ bytes of room remain, then record
          the produced amounts in CODING.  */
 264   int multibytep = coding->dst_multibyte;
 265   int *charbuf = coding->charbuf;
       /* The end is taken from CODING: CHARBUF is a plain `int *' and has
          no members.  */
 266   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 267   unsigned char *dst = coding->destination + coding->produced;
 268   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 269   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 270   int produced_chars = 0;
 271
 272   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 273     {
 274       int c = *charbuf;
 275       /* Encode C into DST, and increment DST.  */
 276     }
 277  label_no_more_destination:
 278   /* How many chars and bytes we produced.  */
 279   coding->produced_char += produced_chars;
 280   coding->produced = dst - coding->destination;
 281 }
 282 #endif
 283
 284 \f
 285 /*** 1. Preamble ***/
 286
 287 #include <config.h>
 288 #include <stdio.h>
 289 #include <setjmp.h>
 290
 291 #include "lisp.h"
 292 #include "buffer.h"
 293 #include "character.h"
 294 #include "charset.h"
 295 #include "ccl.h"
 296 #include "composite.h"
 297 #include "coding.h"
 298 #include "window.h"
 299 #include "frame.h"
 300 #include "termhooks.h"
 301 /* Global Lisp objects used by the coding-system code.
     NOTE(review): judging by the names only, the Q* variables presumably
     hold interned symbols, the QC* ones keyword symbols, and
     Vcoding_system_hash_table the table of coding systems; their
     initialization is not in this part of the file -- confirm where
     they are set up.  */
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 305 Lisp_Object Qunix, Qdos;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317 Lisp_Object QCascii_compatible_p;
 318
 319 Lisp_Object Qcall_process, Qcall_process_region;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 /* If a symbol has this property, evaluate the value to define the
 327    symbol as a coding system.  */
 328 static Lisp_Object Qcoding_system_define_form;
 329
 330 /* Format of end-of-line decided by system.  This is Qunix on
 331    Unix and Mac, Qdos on DOS/Windows.
 332    This has an effect only for external encoding (i.e. for output to
 333    file and process), not for in-buffer or Lisp string encoding.  */
 334 static Lisp_Object system_eol_type;
 335
 336 #ifdef emacs
 337
 338 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 339
 340 /* Coding system emacs-mule and raw-text are for converting only
 341    end-of-line format.  */
 342 Lisp_Object Qemacs_mule, Qraw_text;
 343 Lisp_Object Qutf_8_emacs;
 344
 345 /* Coding-systems are handed between Emacs Lisp programs and C internal
 346    routines by the following three variables.  */
 347 /* Coding system to be used to encode text for terminal display when
 348    terminal coding system is nil.  */
 349 struct coding_system safe_terminal_coding;
 350
 351 #endif /* emacs */
 352
 353 Lisp_Object Qtranslation_table;
 354 Lisp_Object Qtranslation_table_id;
 355 Lisp_Object Qtranslation_table_for_decode;
 356 Lisp_Object Qtranslation_table_for_encode;
 357
 358 /* Two special coding systems.  */
 359 Lisp_Object Vsjis_coding_system;
 360 Lisp_Object Vbig5_coding_system;
 361
 362 /* ISO2022 section */
 363 /* CODING_ISO_INITIAL yields, as a C integer (via XINT), element REG
     of the `coding_attr_iso_initial' attribute found in the attributes
     that CODING_ID_ATTRS gives for CODING->id.  */
 364 #define CODING_ISO_INITIAL(coding, reg)                 \
 365   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 366                      coding_attr_iso_initial),          \
 367                reg)))
 368
 369 /* CODING_ISO_REQUEST yields CODING->safe_charsets[CHARSET_ID], or -1
     when CHARSET_ID is greater than CODING->max_charset_id or when the
     stored value is 255.  CHARSET_ID is evaluated more than once and is
     used unparenthesized as an array index, so pass a simple
     expression without side effects.  */
 370 #define CODING_ISO_REQUEST(coding, charset_id)          \
 371   (((charset_id) <= (coding)->max_charset_id            \
 372     ? ((coding)->safe_charsets[charset_id] != 255       \
 373        ? (coding)->safe_charsets[charset_id]            \
 374        : -1)                                            \
 375     : -1))
 376
 377 /* Accessors for the members of CODING->spec.iso_2022.  Each one
     expands to the member itself, except CODING_ISO_CMP_STATUS, which
     expands to the address of the `cmp_status' member, and
     CODING_ISO_INVOKED_CHARSET, which looks up the designation of the
     register currently invoked to PLANE.  */
 378 #define CODING_ISO_FLAGS(coding)        \
 379   ((coding)->spec.iso_2022.flags)
 380 #define CODING_ISO_DESIGNATION(coding, reg)     \
 381   ((coding)->spec.iso_2022.current_designation[reg])
 382 #define CODING_ISO_INVOCATION(coding, plane)    \
 383   ((coding)->spec.iso_2022.current_invocation[plane])
 384 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 385   ((coding)->spec.iso_2022.single_shifting)
 386 #define CODING_ISO_BOL(coding)  \
 387   ((coding)->spec.iso_2022.bol)
 388 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 389   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 390 #define CODING_ISO_CMP_STATUS(coding)   \
 391   (&(coding)->spec.iso_2022.cmp_status)
 392 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 393   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 394 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 395   ((coding)->spec.iso_2022.embedded_utf_8)
 396
 397 /* Control characters of ISO2022.  */
 398                         /* code */      /* function */
 399 #define ISO_CODE_LF     0x0A            /* line-feed */
 400 #define ISO_CODE_CR     0x0D            /* carriage-return */
 401 #define ISO_CODE_SO     0x0E            /* shift-out */
 402 #define ISO_CODE_SI     0x0F            /* shift-in */
 403 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 404 #define ISO_CODE_ESC    0x1B            /* escape */
 405 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 406 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 407 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 408
 409 /* Every 1-byte code of ISO2022 is classified into one of the
 410    following classes.  */
 411 enum iso_code_class_type
 412   {
 413     ISO_control_0,              /* Control codes in the range
 414                                    0x00..0x1F and 0x7F, except for the
 415                                    following 4 codes.  */
 416     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 417     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 418     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 419     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 420     ISO_control_1,              /* Control codes in the range
 421                                    0x80..0x9F, except for the
 422                                    following 3 codes.  */
 423     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 424     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 425     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 426     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 427     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 428     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 429     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 430   };
 431
 432 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
 433     `iso-flags' attribute of an iso2022 coding system.  */
 434
 435 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 436    instead of the correct short-form sequence (e.g. ESC $ A).  */
 437 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 438
 439 /* If set, reset graphic planes and registers at end-of-line to the
 440    initial state.  */
 441 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 442
 443 /* If set, reset graphic planes and registers before any control
 444    characters to the initial state.  */
 445 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 446
 447 /* If set, encode by 7-bit environment.  */
 448 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 449
 450 /* If set, use locking-shift function.  */
 451 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 452
 453 /* If set, use single-shift function.  Overwrite
 454    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 455 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 456
 457 /* If set, use designation escape sequence.  */
 458 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 459
 460 /* If set, produce revision number sequence.  */
 461 #define CODING_ISO_FLAG_REVISION        0x0080
 462
 463 /* If set, produce ISO6429's direction specifying sequence.  */
 464 #define CODING_ISO_FLAG_DIRECTION       0x0100
 465
 466 /* If set, assume designation states are reset at beginning of line on
 467    output.  */
 468 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 469
 470 /* If set, designation sequence should be placed at beginning of line
 471    on output.  */
 472 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 473
 474 /* If set, do not encode unsafe characters on output.  */
 475 #define CODING_ISO_FLAG_SAFE            0x0800
 476
 477 /* If set, extra latin codes (128..159) are accepted as a valid code
 478    on input.  */
 479 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 480 /* NOTE(review): the five flags below are not described in this file.
     Their names suggest composition handling, an EUC-TW specific
     shift, and a preference for a Roman / old-JIS charset variant --
     presumably; confirm against the code that tests them.  The values
     are not contiguous: 0x10000 is followed by 0x100000.  */
 481 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 482
 483 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 484
 485 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 486
 487 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 488
 489 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 490
 491 /* A character to be produced on output if encoding of the original
 492    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 493 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 494
 495 /* UTF-8 section */
     /* Expands to the `spec.utf_8_bom' member of CODING.  */
 496 #define CODING_UTF_8_BOM(coding)        \
 497   ((coding)->spec.utf_8_bom)
 498
 499 /* UTF-16 section */
     /* Each macro below expands to the like-named member (`bom',
        `endian', `surrogate') of CODING->spec.utf_16.  */
 500 #define CODING_UTF_16_BOM(coding)       \
 501   ((coding)->spec.utf_16.bom)
 502
 503 #define CODING_UTF_16_ENDIAN(coding)    \
 504   ((coding)->spec.utf_16.endian)
 505
 506 #define CODING_UTF_16_SURROGATE(coding) \
 507   ((coding)->spec.utf_16.surrogate)
 508
 509
 510 /* CCL section */
     /* Fetch the `coding_attr_ccl_decoder', `coding_attr_ccl_encoder'
        and `coding_attr_ccl_valids' attributes from the attributes that
        CODING_ID_ATTRS gives for CODING->id.  CODING_CCL_VALIDS passes
        its attribute through SDATA (presumably yielding the bytes of a
        Lisp string -- confirm).  The first two expansions are not
        wrapped in parentheses.  */
 511 #define CODING_CCL_DECODER(coding)      \
 512   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 513 #define CODING_CCL_ENCODER(coding)      \
 514   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 515 #define CODING_CCL_VALIDS(coding)                                          \
 516   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 517
 518 /* Index for each coding category in `coding_categories'.  The
     CATEGORY_MASK_XXX macros are defined as 1 shifted left by these
     values, and `coding_category_max' sizes the `coding_priorities'
     and `coding_categories' arrays, so it must stay the last
     enumerator.  */
 519
 520 enum coding_category
 521   {
 522     coding_category_iso_7,
 523     coding_category_iso_7_tight,
 524     coding_category_iso_8_1,
 525     coding_category_iso_8_2,
 526     coding_category_iso_7_else,
 527     coding_category_iso_8_else,
 528     coding_category_utf_8_auto,
 529     coding_category_utf_8_nosig,
 530     coding_category_utf_8_sig,
 531     coding_category_utf_16_auto,
 532     coding_category_utf_16_be,
 533     coding_category_utf_16_le,
 534     coding_category_utf_16_be_nosig,
 535     coding_category_utf_16_le_nosig,
 536     coding_category_charset,
 537     coding_category_sjis,
 538     coding_category_big5,
 539     coding_category_ccl,
 540     coding_category_emacs_mule,
 541     /* All above are targets of code detection.  */
 542     coding_category_raw_text,
 543     coding_category_undecided,
 544     coding_category_max
 545   };
 546
 547 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
     the single bit 1 << coding_category_XXX.  No mask is defined here
     for coding_category_undecided.  */
 548 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 549 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 550 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 551 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 552 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 553 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 554 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 555 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 556 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 557 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 558 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 559 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 560 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 561 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 562 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 563 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 564 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 565 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 566 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 567 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 568
 569 /* This value is returned if detect_coding_mask () finds nothing other
 570    than ASCII characters.  It is the union of every category mask
     except CATEGORY_MASK_RAW_TEXT.  */
 571 #define CATEGORY_MASK_ANY               \
 572   (CATEGORY_MASK_ISO_7                  \
 573    | CATEGORY_MASK_ISO_7_TIGHT          \
 574    | CATEGORY_MASK_ISO_8_1              \
 575    | CATEGORY_MASK_ISO_8_2              \
 576    | CATEGORY_MASK_ISO_7_ELSE           \
 577    | CATEGORY_MASK_ISO_8_ELSE           \
 578    | CATEGORY_MASK_UTF_8_AUTO           \
 579    | CATEGORY_MASK_UTF_8_NOSIG          \
 580    | CATEGORY_MASK_UTF_8_SIG            \
 581    | CATEGORY_MASK_UTF_16_AUTO          \
 582    | CATEGORY_MASK_UTF_16_BE            \
 583    | CATEGORY_MASK_UTF_16_LE            \
 584    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 585    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 586    | CATEGORY_MASK_CHARSET              \
 587    | CATEGORY_MASK_SJIS                 \
 588    | CATEGORY_MASK_BIG5                 \
 589    | CATEGORY_MASK_CCL                  \
 590    | CATEGORY_MASK_EMACS_MULE)
 591
 592 /* Unions of the single-category bits, grouped by family.  */
 593 #define CATEGORY_MASK_ISO_7BIT \
 594   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 595
 596 #define CATEGORY_MASK_ISO_8BIT \
 597   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 598
 599 #define CATEGORY_MASK_ISO_ELSE \
 600   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 601
     /* Same bits as CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_ELSE,
        i.e. every ISO category except ISO_8_1 and ISO_8_2.  */
 602 #define CATEGORY_MASK_ISO_ESCAPE        \
 603   (CATEGORY_MASK_ISO_7                  \
 604    | CATEGORY_MASK_ISO_7_TIGHT          \
 605    | CATEGORY_MASK_ISO_7_ELSE           \
 606    | CATEGORY_MASK_ISO_8_ELSE)
 607
     /* All six ISO categories.  */
 608 #define CATEGORY_MASK_ISO       \
 609   (  CATEGORY_MASK_ISO_7BIT     \
 610      | CATEGORY_MASK_ISO_8BIT   \
 611      | CATEGORY_MASK_ISO_ELSE)
 612
     /* All five UTF-16 categories.  */
 613 #define CATEGORY_MASK_UTF_16            \
 614   (CATEGORY_MASK_UTF_16_AUTO            \
 615    | CATEGORY_MASK_UTF_16_BE            \
 616    | CATEGORY_MASK_UTF_16_LE            \
 617    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 618    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 619
     /* All three UTF-8 categories.  */
 620 #define CATEGORY_MASK_UTF_8     \
 621   (CATEGORY_MASK_UTF_8_AUTO     \
 622    | CATEGORY_MASK_UTF_8_NOSIG  \
 623    | CATEGORY_MASK_UTF_8_SIG)
 624
 625 /* Table of coding categories (Lisp symbols).  This variable is for
 626    internal use only.  */
 627 static Lisp_Object Vcoding_category_table;
 628
 629 /* Table of coding-categories ordered by priority.  */
 630 static enum coding_category coding_priorities[coding_category_max];
 631
 632 /* Nth element is a coding context for the coding system bound to the
 633    Nth coding category.  */
 634 static struct coding_system coding_categories[coding_category_max];
 635
 636 /*** Commonly used macros and functions ***/
 637 /* Fallback definitions of `min' and `max', used only when no macro of
     that name is already defined.  Each expands to a conditional
     expression that evaluates the selected argument twice, so do not
     pass arguments with side effects.  */
 638 #ifndef min
 639 #define min(a, b) ((a) < (b) ? (a) : (b))
 640 #endif
 641 #ifndef max
 642 #define max(a, b) ((a) > (b) ? (a) : (b))
 643 #endif
 644 /* Set ATTRS to what CODING_ID_ATTRS gives for CODING->id, then set
     CHARSET_LIST to CODING_ATTR_CHARSET_LIST of those attributes.
     ATTRS and CHARSET_LIST must be assignable; ATTRS is assigned
     first because the second statement reads it.  */
 645 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 646   do {                                                  \
 647     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 648     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 649   } while (0)
 650
 651
 652 /* Safely get one byte from the source text pointed by SRC which ends
 653    at SRC_END, and set C to that byte.  If there are not enough bytes
 654    in the source, it jumps to `no_more_source'; when SRC_BASE < SRC at
        that point, CODING_RESULT_INSUFFICIENT_SRC is recorded first.  If
 655    multibytep is nonzero and the byte has its high bit set: a lead
        byte 0xC0 or 0xC1 is combined with the following byte into C
        (NOTE(review): that second byte is read without an SRC_END check
        -- confirm the source is always well-formed here); for any other
 656    multibyte character found at SRC, C is set to the negative value of
        the character code, SRC is advanced past the character by
        string_char, and CODING_RESULT_INVALID_SRC is recorded.
 657    CONSUMED_CHARS is incremented once per call.  The caller should
        declare and set these variables appropriately in advance:
 658         src, src_end, src_base, multibytep, consumed_chars, coding */
 659
 660 #define ONE_MORE_BYTE(c)                                \
 661   do {                                                  \
 662     if (src == src_end)                                 \
 663       {                                                 \
 664         if (src_base < src)                             \
 665           record_conversion_result                      \
 666             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 667         goto no_more_source;                            \
 668       }                                                 \
 669     c = *src++;                                         \
 670     if (multibytep && (c & 0x80))                       \
 671       {                                                 \
 672         if ((c & 0xFE) == 0xC0)                         \
 673           c = ((c & 1) << 6) | *src++;                  \
 674         else                                            \
 675           {                                             \
 676             src--;                                      \
 677             c = - string_char (src, &src, NULL);        \
 678             record_conversion_result                    \
 679               (coding, CODING_RESULT_INVALID_SRC);      \
 680           }                                             \
 681       }                                                 \
 682     consumed_chars++;                                   \
 683   } while (0)
 684
 685 /* Safely get two bytes from the source text pointed by SRC which ends
 686    at SRC_END, and set C1 and C2 to those bytes while skipping the
 687    heading multibyte characters.  If there are not enough bytes in the
 688    source, it jumps to `no_more_source'.  If multibytep is nonzero, a
        lead byte 0xC0 or 0xC1 is combined with the following byte; any
        other multibyte character met while reading C1 is skipped (SRC
        advances by BYTES_BY_CHAR_HEAD) and reading is retried, whereas
 689    such a character found for C2 sets C2 to -1 and leaves SRC just
        after its first byte.  Unlike ONE_MORE_BYTE, no conversion result
 690    is recorded and CONSUMED_CHARS is not touched.  The caller should
        declare and set these
 691    variables appropriately in advance:
 692         src, src_end, multibytep
 693    It is intended that this macro is used in detect_coding_utf_16.  */
 694
 695 #define TWO_MORE_BYTES(c1, c2)                          \
 696   do {                                                  \
 697     do {                                                \
 698       if (src == src_end)                               \
 699         goto no_more_source;                            \
 700       c1 = *src++;                                      \
 701       if (multibytep && (c1 & 0x80))                    \
 702         {                                               \
 703           if ((c1 & 0xFE) == 0xC0)                      \
 704             c1 = ((c1 & 1) << 6) | *src++;              \
 705           else                                          \
 706             {                                           \
 707               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 708               c1 = -1;                                  \
 709             }                                           \
 710         }                                               \
 711     } while (c1 < 0);                                   \
 712     if (src == src_end)                                 \
 713       goto no_more_source;                              \
 714     c2 = *src++;                                        \
 715     if (multibytep && (c2 & 0x80))                      \
 716       {                                                 \
 717         if ((c2 & 0xFE) == 0xC0)                        \
 718           c2 = ((c2 & 1) << 6) | *src++;                \
 719         else                                            \
 720           c2 = -1;                                      \
 721       }                                                 \
 722   } while (0)
 723
 724 /* Like ONE_MORE_BYTE, but without the SRC == SRC_END test: it never
     jumps to `no_more_source', so the caller must already know that a
     byte is available at SRC.  The multibyte handling is the same,
     including recording CODING_RESULT_INVALID_SRC and incrementing
     CONSUMED_CHARS.  The caller should declare and set in advance:
          src, multibytep, consumed_chars, coding */
 725 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 726   do {                                                  \
 727     c = *src++;                                         \
 728     if (multibytep && (c & 0x80))                       \
 729       {                                                 \
 730         if ((c & 0xFE) == 0xC0)                         \
 731           c = ((c & 1) << 6) | *src++;                  \
 732         else                                            \
 733           {                                             \
 734             src--;                                      \
 735             c = - string_char (src, &src, NULL);        \
 736             record_conversion_result                    \
 737               (coding, CODING_RESULT_INVALID_SRC);      \
 738           }                                             \
 739       }                                                 \
 740     consumed_chars++;                                   \
 741   } while (0)
 742
 743
 744 /* Store a byte C in the place pointed by DST and increment DST to the
 745    next free point, and increment PRODUCED_CHARS.  The caller should
 746    assure that C is 0..127, and declare and set the variables `dst'
 747    and `produced_chars' appropriately in advance.
 748 */
 749
 750
 751 #define EMIT_ONE_ASCII_BYTE(c)  \
 752   do {                          \
 753     produced_chars++;           \
 754     *dst++ = (c);               \
 755   } while (0)
 756
 757
 758 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 759
 760 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 761   do {                                  \
 762     produced_chars += 2;                \
 763     *dst++ = (c1), *dst++ = (c2);       \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 769    nonzero, store in an appropriate multibyte from.  The caller should
 770    declare and set the variables `dst' and `multibytep' appropriately
 771    in advance.  */
 772
 773 #define EMIT_ONE_BYTE(c)                \
 774   do {                                  \
 775     produced_chars++;                   \
 776     if (multibytep)                     \
 777       {                                 \
 778         int ch = (c);                   \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782       }                                 \
 783     else                                \
 784       *dst++ = (c);                     \
 785   } while (0)
 786
 787
 788 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 789
 790 #define EMIT_TWO_BYTES(c1, c2)          \
 791   do {                                  \
 792     produced_chars += 2;                \
 793     if (multibytep)                     \
 794       {                                 \
 795         int ch;                         \
 796                                         \
 797         ch = (c1);                      \
 798         if (ch >= 0x80)                 \
 799           ch = BYTE8_TO_CHAR (ch);      \
 800         CHAR_STRING_ADVANCE (ch, dst);  \
 801         ch = (c2);                      \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       {                                 \
 808         *dst++ = (c1);                  \
 809         *dst++ = (c2);                  \
 810       }                                 \
 811   } while (0)
 812
 813
/* Emit three bytes C1, C2 and C3 in that order: C1 through
   EMIT_ONE_BYTE, then C2 and C3 through EMIT_TWO_BYTES.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 819
 820
/* Emit four bytes C1..C4 in that order, as two EMIT_TWO_BYTES
   pairs.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 826
 827
/* Prototypes for static functions.  They are grouped in the order of
   the table of contents at the top of this file: per-format
   detect/decode/encode triples first, then the buffer-management and
   conversion-driver helpers.  */

/* Result bookkeeping.  */
static void record_conversion_result (struct coding_system *coding,
                                      enum coding_result_code result);

/* UTF-8.  */
static int detect_coding_utf_8 (struct coding_system *,
                                struct coding_detection_info *info);
static void decode_coding_utf_8 (struct coding_system *);
static int encode_coding_utf_8 (struct coding_system *);

/* UTF-16.  */
static int detect_coding_utf_16 (struct coding_system *,
                                 struct coding_detection_info *info);
static void decode_coding_utf_16 (struct coding_system *);
static int encode_coding_utf_16 (struct coding_system *);

/* ISO 2022.  */
static int detect_coding_iso_2022 (struct coding_system *,
                                   struct coding_detection_info *info);
static void decode_coding_iso_2022 (struct coding_system *);
static int encode_coding_iso_2022 (struct coding_system *);

/* emacs-mule (old Emacs' internal format).  */
static int detect_coding_emacs_mule (struct coding_system *,
                                     struct coding_detection_info *info);
static void decode_coding_emacs_mule (struct coding_system *);
static int encode_coding_emacs_mule (struct coding_system *);

/* Shift-JIS.  */
static int detect_coding_sjis (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_sjis (struct coding_system *);
static int encode_coding_sjis (struct coding_system *);

/* BIG5.  */
static int detect_coding_big5 (struct coding_system *,
                               struct coding_detection_info *info);
static void decode_coding_big5 (struct coding_system *);
static int encode_coding_big5 (struct coding_system *);

/* CCL.  */
static int detect_coding_ccl (struct coding_system *,
                              struct coding_detection_info *info);
static void decode_coding_ccl (struct coding_system *);
static int encode_coding_ccl (struct coding_system *);

/* Raw text (no detector is declared here).  */
static void decode_coding_raw_text (struct coding_system *);
static int encode_coding_raw_text (struct coding_system *);

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source (struct coding_system *);
static void coding_set_destination (struct coding_system *);
static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
static void coding_alloc_by_making_gap (struct coding_system *,
                                        EMACS_INT, EMACS_INT);
static unsigned char *alloc_destination (struct coding_system *,
                                         EMACS_INT, unsigned char *);

/* Remaining helpers; their definitions lie further down in this
   file.  */
static void setup_iso_safe_charsets (Lisp_Object);
static unsigned char *encode_designation_at_bol (struct coding_system *,
                                                 int *, int *,
                                                 unsigned char *);
static int detect_eol (const unsigned char *,
                       EMACS_INT, enum coding_category);
static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
static void decode_eol (struct coding_system *);
static Lisp_Object get_translation_table (Lisp_Object, int, int *);
static Lisp_Object get_translation (Lisp_Object, int *, int *);
static int produce_chars (struct coding_system *, Lisp_Object, int);
static INLINE void produce_charset (struct coding_system *, int *,
                                    EMACS_INT);
static void produce_annotation (struct coding_system *, EMACS_INT);
static int decode_coding (struct coding_system *);
static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *);
static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
                                              struct coding_system *,
                                              int *, EMACS_INT *);
static void consume_chars (struct coding_system *, Lisp_Object, int);
static int encode_coding (struct coding_system *);
static Lisp_Object make_conversion_work_buffer (int);
static Lisp_Object code_conversion_restore (Lisp_Object);
static INLINE int char_encodable_p (int, Lisp_Object);
static Lisp_Object make_subsidiaries (Lisp_Object);
 903
 904 static void
 905 record_conversion_result (struct coding_system *coding,
 906                           enum coding_result_code result)
 907 {
 908   coding->result = result;
 909   switch (result)
 910     {
 911     case CODING_RESULT_INSUFFICIENT_SRC:
 912       Vlast_code_conversion_error = Qinsufficient_source;
 913       break;
 914     case CODING_RESULT_INCONSISTENT_EOL:
 915       Vlast_code_conversion_error = Qinconsistent_eol;
 916       break;
 917     case CODING_RESULT_INVALID_SRC:
 918       Vlast_code_conversion_error = Qinvalid_source;
 919       break;
 920     case CODING_RESULT_INTERRUPT:
 921       Vlast_code_conversion_error = Qinterrupted;
 922       break;
 923     case CODING_RESULT_INSUFFICIENT_MEM:
 924       Vlast_code_conversion_error = Qinsufficient_memory;
 925       break;
 926     case CODING_RESULT_INSUFFICIENT_DST:
 927       /* Don't record this error in Vlast_code_conversion_error
 928          because it happens just temporarily and is resolved when the
 929          whole conversion is finished.  */
 930       break;
 931     case CODING_RESULT_SUCCESS:
 932       break;
 933     default:
 934       Vlast_code_conversion_error = intern ("Unknown error");
 935     }
 936 }
 937
 938 /* This wrapper macro is used to preserve validity of pointers into
 939    buffer text across calls to decode_char, which could cause
 940    relocation of buffers if it loads a charset map, because loading a
 941    charset map allocates large structures.  */
 942 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 943   do {                                                                       \
 944     charset_map_loaded = 0;                                                  \
 945     c = DECODE_CHAR (charset, code);                                         \
 946     if (charset_map_loaded)                                                  \
 947       {                                                                      \
 948         const unsigned char *orig = coding->source;                          \
 949         EMACS_INT offset;                                                    \
 950                                                                              \
 951         coding_set_source (coding);                                          \
 952         offset = coding->source - orig;                                      \
 953         src += offset;                                                       \
 954         src_base += offset;                                                  \
 955         src_end += offset;                                                   \
 956       }                                                                      \
 957   } while (0)
 958
 959
 960 /* If there are at least BYTES length of room at dst, allocate memory
 961    for coding->destination and update dst and dst_end.  We don't have
 962    to take care of coding->source which will be relocated.  It is
 963    handled by calling coding_set_source in encode_coding.  */
 964
 965 #define ASSURE_DESTINATION(bytes)                               \
 966   do {                                                          \
 967     if (dst + (bytes) >= dst_end)                               \
 968       {                                                         \
 969         int more_bytes = charbuf_end - charbuf + (bytes);       \
 970                                                                 \
 971         dst = alloc_destination (coding, more_bytes, dst);      \
 972         dst_end = coding->destination + coding->dst_bytes;      \
 973       }                                                         \
 974   } while (0)
 975
 976
 977 /* Store multibyte form of the character C in P, and advance P to the
 978    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 979    never calls MAYBE_UNIFY_CHAR.  */
 980
 981 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 982   do {                                          \
 983     if ((c) <= MAX_1_BYTE_CHAR)                 \
 984       *(p)++ = (c);                             \
 985     else if ((c) <= MAX_2_BYTE_CHAR)            \
 986       *(p)++ = (0xC0 | ((c) >> 6)),             \
 987         *(p)++ = (0x80 | ((c) & 0x3F));         \
 988     else if ((c) <= MAX_3_BYTE_CHAR)            \
 989       *(p)++ = (0xE0 | ((c) >> 12)),            \
 990         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 991         *(p)++ = (0x80 | ((c) & 0x3F));         \
 992     else if ((c) <= MAX_4_BYTE_CHAR)            \
 993       *(p)++ = (0xF0 | (c >> 18)),              \
 994         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 995         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 996         *(p)++ = (0x80 | (c & 0x3F));           \
 997     else if ((c) <= MAX_5_BYTE_CHAR)            \
 998       *(p)++ = 0xF8,                            \
 999         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
1000         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1001         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1002         *(p)++ = (0x80 | (c & 0x3F));           \
1003     else                                        \
1004       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1005   } while (0)
1006
1007
/* Yield the character code whose multibyte form starts at P, and
   advance P past that form.  This is like STRING_CHAR_ADVANCE, but it
   never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is chosen from the high bits of the first
   byte.  A two-byte form whose lead byte is below 0xC2 additionally
   gets 0x3FFF80 OR-ed into the result.  P is evaluated several times
   and must be a modifiable pointer without side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0x0F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1036
1037
1038 static void
1039 coding_set_source (struct coding_system *coding)
1040 {
1041   if (BUFFERP (coding->src_object))
1042     {
1043       struct buffer *buf = XBUFFER (coding->src_object);
1044
1045       if (coding->src_pos < 0)
1046         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1047       else
1048         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1049     }
1050   else if (STRINGP (coding->src_object))
1051     {
1052       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1053     }
1054   else
1055     /* Otherwise, the source is C string and is never relocated
1056        automatically.  Thus we don't have to update anything.  */
1057     ;
1058 }
1059
1060 static void
1061 coding_set_destination (struct coding_system *coding)
1062 {
1063   if (BUFFERP (coding->dst_object))
1064     {
1065       if (coding->src_pos < 0)
1066         {
1067           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1068           coding->dst_bytes = (GAP_END_ADDR
1069                                - (coding->src_bytes - coding->consumed)
1070                                - coding->destination);
1071         }
1072       else
1073         {
1074           /* We are sure that coding->dst_pos_byte is before the gap
1075              of the buffer. */
1076           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1077                                  + coding->dst_pos_byte - BEG_BYTE);
1078           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079                                - coding->destination);
1080         }
1081     }
1082   else
1083     /* Otherwise, the destination is C string and is never relocated
1084        automatically.  Thus we don't have to update anything.  */
1085     ;
1086 }
1087
1088
1089 static void
1090 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1091 {
1092   coding->destination = (unsigned char *) xrealloc (coding->destination,
1093                                                     coding->dst_bytes + bytes);
1094   coding->dst_bytes += bytes;
1095 }
1096
1097 static void
1098 coding_alloc_by_making_gap (struct coding_system *coding,
1099                             EMACS_INT gap_head_used, EMACS_INT bytes)
1100 {
1101   if (EQ (coding->src_object, coding->dst_object))
1102     {
1103       /* The gap may contain the produced data at the head and not-yet
1104          consumed data at the tail.  To preserve those data, we at
1105          first make the gap size to zero, then increase the gap
1106          size.  */
1107       EMACS_INT add = GAP_SIZE;
1108
1109       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1110       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1111       make_gap (bytes);
1112       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1113       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1114     }
1115   else
1116     {
1117       Lisp_Object this_buffer;
1118
1119       this_buffer = Fcurrent_buffer ();
1120       set_buffer_internal (XBUFFER (coding->dst_object));
1121       make_gap (bytes);
1122       set_buffer_internal (XBUFFER (this_buffer));
1123     }
1124 }
1125
1126
1127 static unsigned char *
1128 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1129                    unsigned char *dst)
1130 {
1131   EMACS_INT offset = dst - coding->destination;
1132
1133   if (BUFFERP (coding->dst_object))
1134     {
1135       struct buffer *buf = XBUFFER (coding->dst_object);
1136
1137       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1138     }
1139   else
1140     coding_alloc_by_realloc (coding, nbytes);
1141   coding_set_destination (coding);
1142   dst = coding->destination + offset;
1143   return dst;
1144 }
1145
1146 /** Macros for annotations.  */
1147
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1176
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the common three-element annotation header
   [ -LEN MASK NCHARS ] at BUF, advance BUF past it, and flag `coding'
   (which must be in scope) as annotated.  BUF must be a modifiable
   pointer expression; it is evaluated several times.  The macro
   expands to a single statement without a trailing semicolon, so it
   can be used as the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Append a composition annotation at BUF: the common header with
   length 5 and CODING_ANNOTATE_COMPOSITION_MASK, followed by NBYTES
   and METHOD.  Composition components, if any, are added by the
   caller.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Append a charset annotation at BUF: the common header with length 4
   and CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1201
1202 \f
1203 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1204
1205
1206
1207 \f
1208 /*** 3. UTF-8 ***/
1209
1210 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1211    Check if a text is encoded in UTF-8.  If it is, return 1, else
1212    return 0.  */
1213
/* Classify the octet C by its high bits.  */
/* Below 0x80: a character encoded in a single octet.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* Code of the byte order mark, and the three octets that encode it
   in UTF-8.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1225
/* Detector for the UTF-8 category.  Scan the source of CODING
   (starting after the first coding->head_ascii bytes) and check that
   every octet with the high bit set starts a well-formed sequence of
   a leading octet plus one to four trailing octets.  Only this
   lead/trail structure is checked here.

   On a malformed sequence, add CATEGORY_MASK_UTF_8 to
   DETECT_INFO->rejected and return 0.  When the source runs out, the
   result is decided at the `no_more_source' label, which has no
   explicit goto in this function; it is presumably reached from
   inside ONE_MORE_BYTE, whose definition precedes this excerpt.  */
static int
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;    /* Source starts with the octets EF BB BF.  */
  int found = 0;        /* At least one multi-octet sequence was seen.  */

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that an incomplete
         sequence at the end of input can be recognized below.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* A BOM only counts at the very beginning of the source.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop was left with `break': a malformed sequence was found.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* In the last block, a sequence that was started but not completed
     means the text is not UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character 0xFEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1308
1309
/* Decode UTF-8 bytes from the source of CODING, starting at offset
   coding->consumed, into character codes appended to coding->charbuf.
   Stop when the charbuf is full or the source is exhausted, then
   update coding->consumed, coding->consumed_char and
   coding->charbuf_used.

   Bytes that do not form an acceptable sequence are stored one at a
   time (ASCII as is, others through BYTE8_TO_CHAR) and counted in
   coding->errors.  The `no_more_source' label is also the normal
   exit of the loop; it is presumably the target of a goto inside
   ONE_MORE_BYTE, whose definition precedes this excerpt.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0, consumed_chars_base = 0;
  int multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  Lisp_Object attr, charset_list;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when eol_crlf is set, or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attr, charset_list);

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look for the BOM octets EF BB BF at the current position.
         If they are there, they stay consumed; in every other case
         SRC is put back to where it was.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found, the BOM setting is cleared here, presumably
     so that later calls on the same coding system do not look for a
     BOM again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before this point is fully decoded; remember it so
         that a partial sequence can be rolled back.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left.  If a byte was read ahead after a CR, back
             up SRC_BASE by one so that it is read again later.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* Use the read-ahead byte first, if there is one.  */
      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored negated; presumably it is the
             negated code of a multibyte character in the source, as
             in ONE_MORE_BYTE_NO_CHECK.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* For DOS-style EOL, read the byte after a CR ahead; the CR
             itself is stored unchanged here.  */
          if (eol_crlf && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Codes above MAX_CHAR and overlong forms
                             are both rejected.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Roll back to the start of the bad sequence and emit just its
         first byte; the following bytes are examined again in the
         next iterations.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Only what was completely decoded counts as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1462
1463
1464 static int
1465 encode_coding_utf_8 (struct coding_system *coding)
1466 {
1467   int multibytep = coding->dst_multibyte;
1468   int *charbuf = coding->charbuf;
1469   int *charbuf_end = charbuf + coding->charbuf_used;
1470   unsigned char *dst = coding->destination + coding->produced;
1471   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1472   int produced_chars = 0;
1473   int c;
1474
1475   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1476     {
1477       ASSURE_DESTINATION (3);
1478       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1479       CODING_UTF_8_BOM (coding) = utf_without_bom;
1480     }
1481
1482   if (multibytep)
1483     {
1484       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1485
1486       while (charbuf < charbuf_end)
1487         {
1488           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1489
1490           ASSURE_DESTINATION (safe_room);
1491           c = *charbuf++;
1492           if (CHAR_BYTE8_P (c))
1493             {
1494               c = CHAR_TO_BYTE8 (c);
1495               EMIT_ONE_BYTE (c);
1496             }
1497           else
1498             {
1499               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1500               for (p = str; p < pend; p++)
1501                 EMIT_ONE_BYTE (*p);
1502             }
1503         }
1504     }
1505   else
1506     {
1507       int safe_room = MAX_MULTIBYTE_LENGTH;
1508
1509       while (charbuf < charbuf_end)
1510         {
1511           ASSURE_DESTINATION (safe_room);
1512           c = *charbuf++;
1513           if (CHAR_BYTE8_P (c))
1514             *dst++ = CHAR_TO_BYTE8 (c);
1515           else
1516             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1517           produced_chars++;
1518         }
1519     }
1520   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1521   coding->produced_char += produced_chars;
1522   coding->produced = dst - coding->destination;
1523   return 0;
1524 }
1525
1526
1527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1528    Check if a text is encoded in one of UTF-16 based coding systems.
1529    If it is, return 1, else return 0.  */
1530
/* Nonzero if bits 10..15 of VAL are 110110, i.e. the low 16 bits of
   VAL lie in the high (leading) surrogate range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  ((((val) & 0xFC00) ^ 0xD800) == 0)

/* Nonzero if bits 10..15 of VAL are 110111, i.e. the low 16 bits of
   VAL lie in the low (trailing) surrogate range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  ((((val) & 0xFC00) ^ 0xDC00) == 0)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  */
#define UTF_16_INVALID_P(val) \
  ((((val) | 1) == 0xFFFF) || UTF_16_LOW_SURROGATE_P (val))
1541
1542
1543 static int
1544 detect_coding_utf_16 (struct coding_system *coding,
1545                       struct coding_detection_info *detect_info)
1546 {
1547   const unsigned char *src = coding->source;
1548   const unsigned char *src_end = coding->source + coding->src_bytes;
1549   int multibytep = coding->src_multibyte;
1550   int c1, c2;
1551
1552   detect_info->checked |= CATEGORY_MASK_UTF_16;
1553   if (coding->mode & CODING_MODE_LAST_BLOCK
1554       && (coding->src_chars & 1))
1555     {
1556       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1557       return 0;
1558     }
1559
1560   TWO_MORE_BYTES (c1, c2);
1561   if ((c1 == 0xFF) && (c2 == 0xFE))
1562     {
1563       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1564                              | CATEGORY_MASK_UTF_16_AUTO);
1565       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1566                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1567                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1568     }
1569   else if ((c1 == 0xFE) && (c2 == 0xFF))
1570     {
1571       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1572                              | CATEGORY_MASK_UTF_16_AUTO);
1573       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1574                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1575                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1576     }
1577   else if (c2 < 0)
1578     {
1579       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1580       return 0;
1581     }
1582   else
1583     {
1584       /* We check the dispersion of Eth and Oth bytes where E is even and
1585          O is odd.  If both are high, we assume binary data.*/
1586       unsigned char e[256], o[256];
1587       unsigned e_num = 1, o_num = 1;
1588
1589       memset (e, 0, 256);
1590       memset (o, 0, 256);
1591       e[c1] = 1;
1592       o[c2] = 1;
1593
1594       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1595                                 |CATEGORY_MASK_UTF_16_BE
1596                                 | CATEGORY_MASK_UTF_16_LE);
1597
1598       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1599              != CATEGORY_MASK_UTF_16)
1600         {
1601           TWO_MORE_BYTES (c1, c2);
1602           if (c2 < 0)
1603             break;
1604           if (! e[c1])
1605             {
1606               e[c1] = 1;
1607               e_num++;
1608               if (e_num >= 128)
1609                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1610             }
1611           if (! o[c2])
1612             {
1613               o[c2] = 1;
1614               o_num++;
1615               if (o_num >= 128)
1616                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1617             }
1618         }
1619       return 0;
1620     }
1621
1622  no_more_source:
1623   return 1;
1624 }
1625
1626 static void
1627 decode_coding_utf_16 (struct coding_system *coding)
1628 {
1629   const unsigned char *src = coding->source + coding->consumed;
1630   const unsigned char *src_end = coding->source + coding->src_bytes;
1631   const unsigned char *src_base;
1632   int *charbuf = coding->charbuf + coding->charbuf_used;
1633   /* We may produces at most 3 chars in one loop.  */
1634   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1635   int consumed_chars = 0, consumed_chars_base = 0;
1636   int multibytep = coding->src_multibyte;
1637   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1638   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1639   int surrogate = CODING_UTF_16_SURROGATE (coding);
1640   Lisp_Object attr, charset_list;
1641   int eol_crlf =
1642     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1643   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1644
1645   CODING_GET_INFO (coding, attr, charset_list);
1646
1647   if (bom == utf_with_bom)
1648     {
1649       int c, c1, c2;
1650
1651       src_base = src;
1652       ONE_MORE_BYTE (c1);
1653       ONE_MORE_BYTE (c2);
1654       c = (c1 << 8) | c2;
1655
1656       if (endian == utf_16_big_endian
1657           ? c != 0xFEFF : c != 0xFFFE)
1658         {
1659           /* The first two bytes are not BOM.  Treat them as bytes
1660              for a normal character.  */
1661           src = src_base;
1662           coding->errors++;
1663         }
1664       CODING_UTF_16_BOM (coding) = utf_without_bom;
1665     }
1666   else if (bom == utf_detect_bom)
1667     {
1668       /* We have already tried to detect BOM and failed in
1669          detect_coding.  */
1670       CODING_UTF_16_BOM (coding) = utf_without_bom;
1671     }
1672
1673   while (1)
1674     {
1675       int c, c1, c2;
1676
1677       src_base = src;
1678       consumed_chars_base = consumed_chars;
1679
1680       if (charbuf >= charbuf_end)
1681         {
1682           if (byte_after_cr1 >= 0)
1683             src_base -= 2;
1684           break;
1685         }
1686
1687       if (byte_after_cr1 >= 0)
1688         c1 = byte_after_cr1, byte_after_cr1 = -1;
1689       else
1690         ONE_MORE_BYTE (c1);
1691       if (c1 < 0)
1692         {
1693           *charbuf++ = -c1;
1694           continue;
1695         }
1696       if (byte_after_cr2 >= 0)
1697         c2 = byte_after_cr2, byte_after_cr2 = -1;
1698       else
1699         ONE_MORE_BYTE (c2);
1700       if (c2 < 0)
1701         {
1702           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1703           *charbuf++ = -c2;
1704           continue;
1705         }
1706       c = (endian == utf_16_big_endian
1707            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1708
1709       if (surrogate)
1710         {
1711           if (! UTF_16_LOW_SURROGATE_P (c))
1712             {
1713               if (endian == utf_16_big_endian)
1714                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1715               else
1716                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1717               *charbuf++ = c1;
1718               *charbuf++ = c2;
1719               coding->errors++;
1720               if (UTF_16_HIGH_SURROGATE_P (c))
1721                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1722               else
1723                 *charbuf++ = c;
1724             }
1725           else
1726             {
1727               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1728               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1729               *charbuf++ = 0x10000 + c;
1730             }
1731         }
1732       else
1733         {
1734           if (UTF_16_HIGH_SURROGATE_P (c))
1735             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1736           else
1737             {
1738               if (eol_crlf && c == '\r')
1739                 {
1740                   ONE_MORE_BYTE (byte_after_cr1);
1741                   ONE_MORE_BYTE (byte_after_cr2);
1742                 }
1743               *charbuf++ = c;
1744             }
1745         }
1746     }
1747
1748  no_more_source:
1749   coding->consumed_char += consumed_chars_base;
1750   coding->consumed = src_base - coding->source;
1751   coding->charbuf_used = charbuf - coding->charbuf;
1752 }
1753
1754 static int
1755 encode_coding_utf_16 (struct coding_system *coding)
1756 {
1757   int multibytep = coding->dst_multibyte;
1758   int *charbuf = coding->charbuf;
1759   int *charbuf_end = charbuf + coding->charbuf_used;
1760   unsigned char *dst = coding->destination + coding->produced;
1761   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1762   int safe_room = 8;
1763   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1764   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1765   int produced_chars = 0;
1766   Lisp_Object attrs, charset_list;
1767   int c;
1768
1769   CODING_GET_INFO (coding, attrs, charset_list);
1770
1771   if (bom != utf_without_bom)
1772     {
1773       ASSURE_DESTINATION (safe_room);
1774       if (big_endian)
1775         EMIT_TWO_BYTES (0xFE, 0xFF);
1776       else
1777         EMIT_TWO_BYTES (0xFF, 0xFE);
1778       CODING_UTF_16_BOM (coding) = utf_without_bom;
1779     }
1780
1781   while (charbuf < charbuf_end)
1782     {
1783       ASSURE_DESTINATION (safe_room);
1784       c = *charbuf++;
1785       if (c > MAX_UNICODE_CHAR)
1786         c = coding->default_char;
1787
1788       if (c < 0x10000)
1789         {
1790           if (big_endian)
1791             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1792           else
1793             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1794         }
1795       else
1796         {
1797           int c1, c2;
1798
1799           c -= 0x10000;
1800           c1 = (c >> 10) + 0xD800;
1801           c2 = (c & 0x3FF) + 0xDC00;
1802           if (big_endian)
1803             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1804           else
1805             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1806         }
1807     }
1808   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1809   coding->produced = dst - coding->destination;
1810   coding->produced_char += produced_chars;
1811   return 0;
1812 }
1813
1814 \f
1815 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1816
1817 /* Emacs' internal format for representation of multiple character
1818    sets is a kind of multi-byte encoding, i.e. characters are
1819    represented by variable-length sequences of one-byte codes.
1820
1821    ASCII characters and control characters (e.g. `tab', `newline') are
1822    represented by one-byte sequences which are their ASCII codes, in
1823    the range 0x00 through 0x7F.
1824
1825    8-bit characters of the range 0x80..0x9F are represented by
1826    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1827    code + 0x20).
1828
1829    8-bit characters of the range 0xA0..0xFF are represented by
1830    one-byte sequences which are their 8-bit code.
1831
1832    The other characters are represented by a sequence of `base
1833    leading-code', optional `extended leading-code', and one or two
1834    `position-code's.  The length of the sequence is determined by the
1835    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1836    whereas extended leading-code and position-code take the range 0xA0
1837    through 0xFF.  See `charset.h' for more details about leading-code
1838    and position-code.
1839
1840    --- CODE RANGE of Emacs' internal format ---
1841    character set        range
1842    -------------        -----
1843    ascii                0x00..0x7F
1844    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1845    eight-bit-graphic    0xA0..0xFF
1846    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1847    ---------------------------------------------
1848
1849    As this is the internal character representation, the format is
1850    usually not used externally (i.e. in a file or in a data sent to a
1851    process).  But, it is possible to have a text externally in this
1852    format (i.e. by encoding by the coding system `emacs-mule').
1853
1854    In that case, a sequence of one-byte codes has a slightly different
1855    form.
1856
1857    At first, all characters in eight-bit-control are represented by
1858    one-byte sequences which are their 8-bit code.
1859
1860    Next, character composition data are represented by the byte
1861    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1862    where,
1863         METHOD is 0xF2 plus one of composition method (enum
1864         composition_method),
1865
1866         BYTES is 0xA0 plus a byte length of this composition data,
1867
1868         CHARS is 0xA0 plus a number of characters composed by this
1869         data,
1870
1871         COMPONENTs are characters of multibyte form or composition
1872         rules encoded by two-byte of ASCII codes.
1873
1874    In addition, for backward compatibility, the following formats are
1875    also recognized as composition data on decoding.
1876
1877    0x80 MSEQ ...
1878    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1879
1880    Here,
1881         MSEQ is a multibyte form but in one of these special formats:
1882           ASCII: 0xA0 ASCII_CODE+0x80,
1883           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1884         RULE is a one byte code of the range 0xA0..0xF0 that
1885         represents a composition rule.
1886   */
1887
/* Table indexed by a byte value.  detect_coding_emacs_mule and
   emacs_mule_char treat the entry for a leading byte as the total
   length in bytes of the emacs-mule sequence that byte starts; the
   latter handles lengths 1 to 4 and aborts on anything else.  The
   table is not filled in here; presumably that happens during
   initialization elsewhere in the file.  */
1888 char emacs_mule_bytes[256];
1889
1890
1891 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1892    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1893    else return 0.  */
1894
1895 static int
1896 detect_coding_emacs_mule (struct coding_system *coding,
1897                           struct coding_detection_info *detect_info)
1898 {
1899   const unsigned char *src = coding->source, *src_base;
1900   const unsigned char *src_end = coding->source + coding->src_bytes;
1901   int multibytep = coding->src_multibyte;
1902   int consumed_chars = 0;
1903   int c;
1904   int found = 0;
1905
1906   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1907   /* A coding system of this category is always ASCII compatible.  */
1908   src += coding->head_ascii;
1909
1910   while (1)
1911     {
1912       src_base = src;
1913       ONE_MORE_BYTE (c);
1914       if (c < 0)
1915         continue;
1916       if (c == 0x80)
1917         {
1918           /* Perhaps the start of composite character.  We simply skip
1919              it because analyzing it is too heavy for detecting.  But,
1920              at least, we check that the composite character
1921              constitutes of more than 4 bytes.  */
1922           const unsigned char *src_base;
1923
1924         repeat:
1925           src_base = src;
1926           do
1927             {
1928               ONE_MORE_BYTE (c);
1929             }
1930           while (c >= 0xA0);
1931
1932           if (src - src_base <= 4)
1933             break;
1934           found = CATEGORY_MASK_EMACS_MULE;
1935           if (c == 0x80)
1936             goto repeat;
1937         }
1938
1939       if (c < 0x80)
1940         {
1941           if (c < 0x20
1942               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1943             break;
1944         }
1945       else
1946         {
1947           int more_bytes = emacs_mule_bytes[c] - 1;
1948
1949           while (more_bytes > 0)
1950             {
1951               ONE_MORE_BYTE (c);
1952               if (c < 0xA0)
1953                 {
1954                   src--;        /* Unread the last byte.  */
1955                   break;
1956                 }
1957               more_bytes--;
1958             }
1959           if (more_bytes != 0)
1960             break;
1961           found = CATEGORY_MASK_EMACS_MULE;
1962         }
1963     }
1964   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1965   return 0;
1966
1967  no_more_source:
1968   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1969     {
1970       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1971       return 0;
1972     }
1973   detect_info->found |= found;
1974   return 1;
1975 }
1976
1977
1978 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1979    character.  If CMP_STATUS indicates that we must expect MSEQ or
1980    RULE described above, decode it and return the negative value of
1981    the decoded character or rule.  If an invalid byte is found, return
1982    -1.  If SRC is too short, return -2.  */
1983
1984 int
1985 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1986                  int *nbytes, int *nchars, int *id,
1987                  struct composition_status *cmp_status)
1988 {
1989   const unsigned char *src_end = coding->source + coding->src_bytes;
1990   const unsigned char *src_base = src;
1991   int multibytep = coding->src_multibyte;
1992   int charset_id;
1993   unsigned code;
1994   int c;
1995   int consumed_chars = 0;
1996   int mseq_found = 0;
1997
1998   ONE_MORE_BYTE (c);
1999   if (c < 0)
2000     {
2001       c = -c;
2002       charset_id = emacs_mule_charset[0];
2003     }
2004   else
2005     {
2006       if (c >= 0xA0)
2007         {
2008           if (cmp_status->state != COMPOSING_NO
2009               && cmp_status->old_form)
2010             {
2011               if (cmp_status->state == COMPOSING_CHAR)
2012                 {
2013                   if (c == 0xA0)
2014                     {
2015                       ONE_MORE_BYTE (c);
2016                       c -= 0x80;
2017                       if (c < 0)
2018                         goto invalid_code;
2019                     }
2020                   else
2021                     c -= 0x20;
2022                   mseq_found = 1;
2023                 }
2024               else
2025                 {
2026                   *nbytes = src - src_base;
2027                   *nchars = consumed_chars;
2028                   return -c;
2029                 }
2030             }
2031           else
2032             goto invalid_code;
2033         }
2034
2035       switch (emacs_mule_bytes[c])
2036         {
2037         case 2:
2038           if ((charset_id = emacs_mule_charset[c]) < 0)
2039             goto invalid_code;
2040           ONE_MORE_BYTE (c);
2041           if (c < 0xA0)
2042             goto invalid_code;
2043           code = c & 0x7F;
2044           break;
2045
2046         case 3:
2047           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2048               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2049             {
2050               ONE_MORE_BYTE (c);
2051               if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
2052                 goto invalid_code;
2053               ONE_MORE_BYTE (c);
2054               if (c < 0xA0)
2055                 goto invalid_code;
2056               code = c & 0x7F;
2057             }
2058           else
2059             {
2060               if ((charset_id = emacs_mule_charset[c]) < 0)
2061                 goto invalid_code;
2062               ONE_MORE_BYTE (c);
2063               if (c < 0xA0)
2064                 goto invalid_code;
2065               code = (c & 0x7F) << 8;
2066               ONE_MORE_BYTE (c);
2067               if (c < 0xA0)
2068                 goto invalid_code;
2069               code |= c & 0x7F;
2070             }
2071           break;
2072
2073         case 4:
2074           ONE_MORE_BYTE (c);
2075           if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
2076             goto invalid_code;
2077           ONE_MORE_BYTE (c);
2078           if (c < 0xA0)
2079             goto invalid_code;
2080           code = (c & 0x7F) << 8;
2081           ONE_MORE_BYTE (c);
2082           if (c < 0xA0)
2083             goto invalid_code;
2084           code |= c & 0x7F;
2085           break;
2086
2087         case 1:
2088           code = c;
2089           charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2090           break;
2091
2092         default:
2093           abort ();
2094         }
2095       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2096                           CHARSET_FROM_ID (charset_id), code, c);
2097       if (c < 0)
2098         goto invalid_code;
2099     }
2100   *nbytes = src - src_base;
2101   *nchars = consumed_chars;
2102   if (id)
2103     *id = charset_id;
2104   return (mseq_found ? -c : c);
2105
2106  no_more_source:
2107   return -2;
2108
2109  invalid_code:
2110   return -1;
2111 }
2112
2113
2114 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2115
2116 /* Handle these composition sequences ('|': the end of header elements,
2117    BYTES and CHARS >= 0xA0):
2118
2119    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2120    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2121    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2122
2123    and these old forms:
2124
2125    (4) relative composition: 0x80 | MSEQ ... MSEQ
2126    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2127
2128    When the starter 0x80 and the following header elements are found,
2129    this annotation header is produced.
2130
2131         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2132
2133    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2134    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2135
2136    Then, upon reading the following elements, these codes are produced
2137    until the composition end is found:
2138
2139    (1) CHAR ... CHAR
2140    (2) ALT ... ALT CHAR ... CHAR
2141    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2142    (4) CHAR ... CHAR
2143    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2144
2145    When the composition end is found, LENGTH and NCHARS in the
2146    annotation header are updated as below:
2147
2148    (1) LENGTH: unchanged, NCHARS: unchanged
2149    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2150    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2151    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2152    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2153
2154    If an error is found while composing, the annotation header is
2155    changed to the original composition header (plus filler -1s) as
2156    below:
2157
2158    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2159    (5)          [ 0x80 0xFF -1 -1 -1 ]
2160
2161    and the sequence [ -2 DECODED-RULE ] is changed to the original
2162    byte sequence as below:
2163         o the original byte sequence is B: [ B -1 ]
2164         o the original byte sequence is B1 B2: [ B1 B2 ]
2165
2166    Most of the routines are implemented by macros because many
2167    variables and labels in the caller decode_coding_emacs_mule must be
2168    accessible, and they are usually called just once (thus they don't
2169    increase the size of the compiled object).  */
2170
/* Decode the composition rule held in C, one byte of an Emacs 20
   style composition sequence, and store it in RULE.  C itself is
   reduced by 0xA0; unless the result is in the range 0..80, control
   jumps to the label `invalid_code'.  The quotient and the remainder
   of the division by 9 give the two reference points, and a value of
   4 is replaced by 10 in each of them.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = (c / 9 == 4) ? 10 : c / 9;                   \
    nref = (c % 9 == 4) ? 10 : c % 9;                   \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2187
2188
/* Decode a composition rule of the Emacs 21 style format and store it
   in RULE.  The rule occupies two bytes: the one already in C and the
   next one, which is read into C with ONE_MORE_BYTE.  Each byte less
   0x20 is one reference point; unless both are in the range 0..80,
   control jumps to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20, nref;                          \
                                                        \
    if (! (gref >= 0 && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (nref >= 0 && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2206
2207
/* Start of Emacs 21 style format.  On entry C holds the method byte,
   which is 0xF2 plus the composition method.  The next two bytes are
   read with ONE_MORE_BYTE: BYTES, which is 0xA0 plus the byte length
   of this composition information, and CHARS, which is 0xA0 plus the
   number of characters composed.  Control jumps to the label
   `invalid_code' if either byte is unacceptable.  Otherwise
   CMP_STATUS is set up for the new composition and the annotation
   header is added with ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (nbytes != 4 && method == COMPOSITION_RELATIVE)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->method = method;                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2239
2240
/* Start of Emacs 20 style format for relative composition.  Mark
   CMP_STATUS as an old-form relative composition with no characters
   or components yet, and add an annotation header whose NCHARS and
   NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2252
2253
/* Start of Emacs 20 style format for rule-base composition.  Mark
   CMP_STATUS as an old-form composition with rules, with no
   characters or components yet, and add an annotation header whose
   NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2265
2266
/* Read the byte that follows the composition starter and begin the
   kind of composition it announces:

     0xF2 + a composition method    Emacs 21 style format
     0xA0..0xBF                     Emacs 20 style relative composition
     0xFF                           Emacs 20 style rule-base composition

   Any other byte makes control jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *current_src = src;                     \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (COMPOSITION_RELATIVE <= c - 0xF2                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c >= 0xA0 && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* C is itself the first component, so move SRC back    \
           in order to read it again.  */                       \
        src = current_src;                                      \
      }                                                         \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2290
/* Finish the current composition.  The annotation header starts
   cmp_status->length elements before CHARBUF.  For an old-form
   composition its third element is set to the number of characters
   seen; for a new-form composition whose method is above
   COMPOSITION_RELATIVE its first element is set to the third element
   less cmp_status->length.  The state then becomes COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *header = charbuf - cmp_status->length;                 \
                                                                \
    if (cmp_status->old_form)                                   \
      header[2] = cmp_status->nchars;                           \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      header[0] = header[2] - cmp_status->length;               \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2301
2302
2303 static int
2304 emacs_mule_finish_composition (int *charbuf,
2305                                struct composition_status *cmp_status)
2306 {
2307   int idx = - cmp_status->length;
2308   int new_chars;
2309
2310   if (cmp_status->old_form && cmp_status->nchars > 0)
2311     {
2312       charbuf[idx + 2] = cmp_status->nchars;
2313       new_chars = 0;
2314       if (cmp_status->method == COMPOSITION_WITH_RULE
2315           && cmp_status->state == COMPOSING_CHAR)
2316         {
2317           /* The last rule was invalid.  */
2318           int rule = charbuf[-1] + 0xA0;
2319
2320           charbuf[-2] = BYTE8_TO_CHAR (rule);
2321           charbuf[-1] = -1;
2322           new_chars = 1;
2323         }
2324     }
2325   else
2326     {
2327       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2328
2329       if (cmp_status->method == COMPOSITION_WITH_RULE)
2330         {
2331           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2332           charbuf[idx++] = -3;
2333           charbuf[idx++] = 0;
2334           new_chars = 1;
2335         }
2336       else
2337         {
2338           int nchars = charbuf[idx + 1] + 0xA0;
2339           int nbytes = charbuf[idx + 2] + 0xA0;
2340
2341           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2342           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2343           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2344           charbuf[idx++] = -1;
2345           new_chars = 4;
2346         }
2347     }
2348   cmp_status->state = COMPOSING_NO;
2349   return new_chars;
2350 }
2351
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and add the value that function
   returns to `char_offset'.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state != COMPOSING_NO)                              \
      {                                                                 \
        int added = emacs_mule_finish_composition (charbuf,             \
                                                   cmp_status);         \
                                                                        \
        char_offset = char_offset + added;                              \
      }                                                                 \
  } while (0)
2357
2358
/* Decode the source bytes of CODING, from offset CODING->consumed up
   to CODING->src_bytes, as emacs-mule text, appending the resulting
   characters and annotations to CODING->charbuf.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  A composition still in progress at that point is
   either finished off (when this is the last block) or saved in the
   composition status' carryover array so that the next call can
   resume it.  */
2359 static void
2360 decode_coding_emacs_mule (struct coding_system *coding)
2361 {
2362   const unsigned char *src = coding->source + coding->consumed;
2363   const unsigned char *src_end = coding->source + coding->src_bytes;
2364   const unsigned char *src_base;
2365   int *charbuf = coding->charbuf + coding->charbuf_used;
2366   /* We may produce two annotations (charset and composition) in one
2367      loop and one more charset annotation at the end.  */
2368   int *charbuf_end
2369     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2370   int consumed_chars = 0, consumed_chars_base;
2371   int multibytep = coding->src_multibyte;
2372   Lisp_Object attrs, charset_list;
2373   int char_offset = coding->produced_char;
2374   int last_offset = char_offset;
2375   int last_id = charset_ascii;
2376   int eol_crlf =
2377     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2378   int byte_after_cr = -1;
2379   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2380
2381   CODING_GET_INFO (coding, attrs, charset_list);
2382
       /* A composition was left unfinished by the previous call: put
          the elements saved in the carryover array back at the head of
          this call's output (they are saved at the end of this
          function when more source is expected).  */
2383   if (cmp_status->state != COMPOSING_NO)
2384     {
2385       int i;
2386
2387       for (i = 0; i < cmp_status->length; i++)
2388         *charbuf++ = cmp_status->carryover[i];
2389       coding->annotated = 1;
2390     }
2391
2392   while (1)
2393     {
2394       int c, id;
2395
           /* Remember where this iteration started; the tail of the
              function reports consumption only up to SRC_BASE.  */
2396       src_base = src;
2397       consumed_chars_base = consumed_chars;
2398
           /* Out of room in CHARBUF.  A byte that was already read
              ahead after a CR is handed back by moving SRC_BASE one
              byte backwards.  */
2399       if (charbuf >= charbuf_end)
2400         {
2401           if (byte_after_cr >= 0)
2402             src_base--;
2403           break;
2404         }
2405
2406       if (byte_after_cr >= 0)
2407         c = byte_after_cr, byte_after_cr = -1;
2408       else
2409         ONE_MORE_BYTE (c);
2410
           /* NOTE(review): a negative C presumably comes from
              ONE_MORE_BYTE (defined elsewhere) for a source element
              that is not a plain byte; it is stored negated.  0x80
              begins a composition sequence.  Either way any
              composition in progress is abandoned first.  */
2411       if (c < 0 || c == 0x80)
2412         {
2413           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2414           if (c < 0)
2415             {
2416               *charbuf++ = -c;
2417               char_offset++;
2418             }
2419           else
2420             DECODE_EMACS_MULE_COMPOSITION_START ();
2421           continue;
2422         }
2423
2424       if (c < 0x80)
2425         {
               /* With DOS EOL conversion enabled, fetch the byte that
                  follows a CR right away; it is consumed at the top of
                  the next iteration.  */
2426           if (eol_crlf && c == '\r')
2427             ONE_MORE_BYTE (byte_after_cr);
2428           id = charset_ascii;
               /* An ASCII byte ends an old-form composition; in the
                  component states of the new form it is counted
                  against the remaining components.  */
2429           if (cmp_status->state != COMPOSING_NO)
2430             {
2431               if (cmp_status->old_form)
2432                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2433               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2434                 cmp_status->ncomps--;
2435             }
2436         }
2437       else
2438         {
2439           int nchars, nbytes;
2440           /* emacs_mule_char can load a charset map from a file, which
2441              allocates a large structure and might cause buffer text
2442              to be relocated as result.  Thus, we need to remember the
2443              original pointer to buffer text, and fix up all related
2444              pointers after the call.  */
2445           const unsigned char *orig = coding->source;
2446           EMACS_INT offset;
2447
2448           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2449                                cmp_status);
2450           offset = coding->source - orig;
2451           if (offset)
2452             {
2453               src += offset;
2454               src_base += offset;
2455               src_end += offset;
2456             }
               /* -1 means an invalid sequence; -2 stops decoding here,
                  leaving the sequence unconsumed (NOTE(review):
                  presumably because the source ended in the middle of
                  it -- confirm in emacs_mule_char).  */
2457           if (c < 0)
2458             {
2459               if (c == -1)
2460                 goto invalid_code;
2461               if (c == -2)
2462                 break;
2463             }
2464           src = src_base + nbytes;
2465           consumed_chars = consumed_chars_base + nchars;
2466           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2467             cmp_status->ncomps -= nchars;
2468         }
2469
2470       /* Now if C >= 0, we found a normally encoded character, if C <
2471          0, we found an old-style composition component character or
2472          rule.  */
2473
2474       if (cmp_status->state == COMPOSING_NO)
2475         {
               /* The charset changed: close the run of the previous
                  charset with an annotation unless that was ASCII.  */
2476           if (last_id != id)
2477             {
2478               if (last_id != charset_ascii)
2479                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2480                                   last_id);
2481               last_id = id;
2482               last_offset = char_offset;
2483             }
2484           *charbuf++ = c;
2485           char_offset++;
2486         }
2487       else if (cmp_status->state == COMPOSING_CHAR)
2488         {
2489           if (cmp_status->old_form)
2490             {
                   /* A normal character (C >= 0) ends the old-form
                      composition; a component (C < 0) is stored
                      negated and counted.  */
2491               if (c >= 0)
2492                 {
2493                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2494                   *charbuf++ = c;
2495                   char_offset++;
2496                 }
2497               else
2498                 {
2499                   *charbuf++ = -c;
2500                   cmp_status->nchars++;
2501                   cmp_status->length++;
2502                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2503                     EMACS_MULE_COMPOSITION_END ();
2504                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2505                     cmp_status->state = COMPOSING_RULE;
2506                 }
2507             }
2508           else
2509             {
                   /* New form: count down the characters still
                      expected; the composition ends when none is
                      left.  */
2510               *charbuf++ = c;
2511               cmp_status->length++;
2512               cmp_status->nchars--;
2513               if (cmp_status->nchars == 0)
2514                 EMACS_MULE_COMPOSITION_END ();
2515             }
2516         }
2517       else if (cmp_status->state == COMPOSING_RULE)
2518         {
2519           int rule;
2520
               /* A rule is expected here.  A normal character instead
                  ends the composition; otherwise the decoded rule is
                  stored as the pair -2, RULE.  */
2521           if (c >= 0)
2522             {
2523               EMACS_MULE_COMPOSITION_END ();
2524               *charbuf++ = c;
2525               char_offset++;
2526             }
2527           else
2528             {
2529               c = -c;
2530               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2531               if (rule < 0)
2532                 goto invalid_code;
2533               *charbuf++ = -2;
2534               *charbuf++ = rule;
2535               cmp_status->length += 2;
2536               cmp_status->state = COMPOSING_CHAR;
2537             }
2538         }
2539       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2540         {
               /* NCOMPS was already decremented above for this
                  element; a negative count abandons the
                  composition.  */
2541           *charbuf++ = c;
2542           cmp_status->length++;
2543           if (cmp_status->ncomps == 0)
2544             cmp_status->state = COMPOSING_CHAR;
2545           else if (cmp_status->ncomps > 0)
2546             {
2547               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2548                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2549             }
2550           else
2551             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2552         }
2553       else                      /* COMPOSING_COMPONENT_RULE */
2554         {
2555           int rule;
2556
2557           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2558           if (rule < 0)
2559             goto invalid_code;
2560           *charbuf++ = -2;
2561           *charbuf++ = rule;
2562           cmp_status->length += 2;
2563           cmp_status->ncomps--;
2564           if (cmp_status->ncomps > 0)
2565             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2566           else
2567             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2568         }
2569       continue;
2570
         /* Drop any composition in progress, rewind to the first byte
            of the offending sequence and pass that single byte through
            (as itself if ASCII, otherwise as a raw 8-bit character),
            counting one decoding error.  */
2571     invalid_code:
2572       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2573       src = src_base;
2574       consumed_chars = consumed_chars_base;
2575       ONE_MORE_BYTE (c);
2576       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2577       char_offset++;
2578       coding->errors++;
2579     }
2580
      /* NOTE(review): no `goto' to this label appears in the function
         itself; presumably the byte-fetching macros jump here when
         the source runs out.  The loop's `break's also end up here.  */
2581  no_more_source:
2582   if (cmp_status->state != COMPOSING_NO)
2583     {
2584       if (coding->mode & CODING_MODE_LAST_BLOCK)
2585         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586       else
2587         {
2588           int i;
2589
               /* More source will follow: take the partial composition
                  back out of the output and keep it for the next
                  call.  */
2590           charbuf -= cmp_status->length;
2591           for (i = 0; i < cmp_status->length; i++)
2592             cmp_status->carryover[i] = charbuf[i];
2593         }
2594     }
       /* Close the last run of a non-ASCII charset, then report how far
          we got: only input before SRC_BASE counts as consumed.  */
2595   if (last_id != charset_ascii)
2596     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2597   coding->consumed_char += consumed_chars_base;
2598   coding->consumed = src_base - coding->source;
2599   coding->charbuf_used = charbuf - coding->charbuf;
2600 }
2601
2602
/* Store in CODES[0] and CODES[1] the leading byte(s) for the charset
   whose emacs-mule id is ID.

   An ID below 0xA0 is itself the only leading byte, and CODES[1] is
   set to 0.  Otherwise CODES[0] is a prefix byte selected by the
   range of ID and CODES[1] is ID:

     0xA0..0xDF -> 0x9A     0xE0..0xEF -> 0x9B
     0xF0..0xF4 -> 0x9C     0xF5 and up -> 0x9D

   ID is evaluated more than once, so it must have no side effects.
   Both arguments are parenthesized in the expansion, and there is no
   semicolon after `while (0)', so the macro behaves as one statement
   (e.g. as the body of an `if' that has an `else').  */

#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2616
2617
2618 static int
2619 encode_coding_emacs_mule (struct coding_system *coding)
2620 {
2621   int multibytep = coding->dst_multibyte;
2622   int *charbuf = coding->charbuf;
2623   int *charbuf_end = charbuf + coding->charbuf_used;
2624   unsigned char *dst = coding->destination + coding->produced;
2625   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2626   int safe_room = 8;
2627   int produced_chars = 0;
2628   Lisp_Object attrs, charset_list;
2629   int c;
2630   int preferred_charset_id = -1;
2631
2632   CODING_GET_INFO (coding, attrs, charset_list);
2633   if (! EQ (charset_list, Vemacs_mule_charset_list))
2634     {
2635       CODING_ATTR_CHARSET_LIST (attrs)
2636         = charset_list = Vemacs_mule_charset_list;
2637     }
2638
2639   while (charbuf < charbuf_end)
2640     {
2641       ASSURE_DESTINATION (safe_room);
2642       c = *charbuf++;
2643
2644       if (c < 0)
2645         {
2646           /* Handle an annotation.  */
2647           switch (*charbuf)
2648             {
2649             case CODING_ANNOTATE_COMPOSITION_MASK:
2650               /* Not yet implemented.  */
2651               break;
2652             case CODING_ANNOTATE_CHARSET_MASK:
2653               preferred_charset_id = charbuf[3];
2654               if (preferred_charset_id >= 0
2655                   && NILP (Fmemq (make_number (preferred_charset_id),
2656                                   charset_list)))
2657                 preferred_charset_id = -1;
2658               break;
2659             default:
2660               abort ();
2661             }
2662           charbuf += -c - 1;
2663           continue;
2664         }
2665
2666       if (ASCII_CHAR_P (c))
2667         EMIT_ONE_ASCII_BYTE (c);
2668       else if (CHAR_BYTE8_P (c))
2669         {
2670           c = CHAR_TO_BYTE8 (c);
2671           EMIT_ONE_BYTE (c);
2672         }
2673       else
2674         {
2675           struct charset *charset;
2676           unsigned code;
2677           int dimension;
2678           int emacs_mule_id;
2679           unsigned char leading_codes[2];
2680
2681           if (preferred_charset_id >= 0)
2682             {
2683               charset = CHARSET_FROM_ID (preferred_charset_id);
2684               if (CHAR_CHARSET_P (c, charset))
2685                 code = ENCODE_CHAR (charset, c);
2686               else
2687                 charset = char_charset (c, charset_list, &code);
2688             }
2689           else
2690             charset = char_charset (c, charset_list, &code);
2691           if (! charset)
2692             {
2693               c = coding->default_char;
2694               if (ASCII_CHAR_P (c))
2695                 {
2696                   EMIT_ONE_ASCII_BYTE (c);
2697                   continue;
2698                 }
2699               charset = char_charset (c, charset_list, &code);
2700             }
2701           dimension = CHARSET_DIMENSION (charset);
2702           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2703           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2704           EMIT_ONE_BYTE (leading_codes[0]);
2705           if (leading_codes[1])
2706             EMIT_ONE_BYTE (leading_codes[1]);
2707           if (dimension == 1)
2708             EMIT_ONE_BYTE (code | 0x80);
2709           else
2710             {
2711               code |= 0x8080;
2712               EMIT_ONE_BYTE (code >> 8);
2713               EMIT_ONE_BYTE (code & 0xFF);
2714             }
2715         }
2716     }
2717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2718   coding->produced_char += produced_chars;
2719   coding->produced = dst - coding->destination;
2720   return 0;
2721 }
2722
2723 \f
2724 /*** 7. ISO2022 handlers ***/
2725
2726 /* The following note describes the coding system ISO2022 briefly.
2727    Since the intention of this note is to help understand the
2728    functions in this file, some parts are NOT ACCURATE or are OVERLY
2729    SIMPLIFIED.  For thorough understanding, please refer to the
2730    original document of ISO2022.  This is equivalent to the standard
2731    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2732
2733    ISO2022 provides many mechanisms to encode several character sets
2734    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2735    is encoded using bytes less than 128.  This may make the encoded
2736    text a little bit longer, but the text passes more easily through
2737    several types of gateway, some of which strip off the MSB (Most
2738    Significant Bit).
2739
2740    There are two kinds of character sets: control character sets and
2741    graphic character sets.  The former contain control characters such
2742    as `newline' and `escape' to provide control functions (control
2743    functions are also provided by escape sequences).  The latter
2744    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2745    two control character sets and many graphic character sets.
2746
2747    Graphic character sets are classified into one of the following
2748    four classes, according to the number of bytes (DIMENSION) and
2749    number of characters in one dimension (CHARS) of the set:
2750    - DIMENSION1_CHARS94
2751    - DIMENSION1_CHARS96
2752    - DIMENSION2_CHARS94
2753    - DIMENSION2_CHARS96
2754
2755    In addition, each character set is assigned an identification tag,
2756    unique for each set, called the "final character" (denoted as <F>
2757    hereafter).  The <F> of each character set is decided by ECMA(*)
2758    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2759    (0x30..0x3F are for private use only).
2760
2761    Note (*): ECMA = European Computer Manufacturers Association
2762
2763    Here are examples of graphic character sets [NAME(<F>)]:
2764         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2765         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2766         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2767         o DIMENSION2_CHARS96 -- none for the moment
2768
2769    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2770         C0 [0x00..0x1F] -- control character plane 0
2771         GL [0x20..0x7F] -- graphic character plane 0
2772         C1 [0x80..0x9F] -- control character plane 1
2773         GR [0xA0..0xFF] -- graphic character plane 1
2774
2775    A control character set is directly designated and invoked to C0 or
2776    C1 by an escape sequence.  The most common case is that:
2777    - ISO646's  control character set is designated/invoked to C0, and
2778    - ISO6429's control character set is designated/invoked to C1,
2779    and usually these designations/invocations are omitted in encoded
2780    text.  In a 7-bit environment, only C0 can be used, and a control
2781    character for C1 is encoded by an appropriate escape sequence to
2782    fit into the environment.  All control characters for C1 are
2783    defined to have corresponding escape sequences.
2784
2785    A graphic character set is at first designated to one of four
2786    graphic registers (G0 through G3), then these graphic registers are
2787    invoked to GL or GR.  These designations and invocations can be
2788    done independently.  The most common case is that G0 is invoked to
2789    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2790    these invocations and designations are omitted in encoded text.
2791    In a 7-bit environment, only GL can be used.
2792
2793    When a graphic character set of CHARS94 is invoked to GL, codes
2794    0x20 and 0x7F of the GL area work as control characters SPACE and
2795    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2796    be used.
2797
2798    There are two ways of invocation: locking-shift and single-shift.
2799    With locking-shift, the invocation lasts until the next different
2800    invocation, whereas with single-shift, the invocation affects the
2801    following character only and doesn't affect the locking-shift
2802    state.  Invocations are done by the following control characters or
2803    escape sequences:
2804
2805    ----------------------------------------------------------------------
2806    abbrev  function                  cntrl escape seq   description
2807    ----------------------------------------------------------------------
2808    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2809    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2810    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2811    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2812    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2813    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2814    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2815    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2816    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2817    ----------------------------------------------------------------------
2818    (*) These are not used by any known coding system.
2819
2820    Control characters for these functions are defined by macros
2821    ISO_CODE_XXX in `coding.h'.
2822
2823    Designations are done by the following escape sequences:
2824    ----------------------------------------------------------------------
2825    escape sequence      description
2826    ----------------------------------------------------------------------
2827    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2828    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2829    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2830    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2831    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2832    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2833    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2834    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2835    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2836    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2837    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2838    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2839    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2840    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2841    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2842    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2843    ----------------------------------------------------------------------
2844
2845    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2846    of dimension 1, chars 94, and final character <F>, etc...
2847
2848    Note (*): Although these designations are not allowed in ISO2022,
2849    Emacs accepts them on decoding, and produces them on encoding
2850    CHARS96 character sets in a coding system which is characterized as
2851    7-bit environment, non-locking-shift, and non-single-shift.
2852
2853    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2854    '(' must be omitted.  We refer to this as "short-form" hereafter.
2855
2856    Now you may notice that there are a lot of ways of encoding the
2857    same multilingual text in ISO2022.  Actually, there exist many
2858    coding systems such as Compound Text (used in X11's inter client
2859    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2860    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2861    localized platforms), and all of these are variants of ISO2022.
2862
2863    In addition to the above, Emacs handles two more kinds of escape
2864    sequences: ISO6429's direction specification and Emacs' private
2865    sequence for specifying character composition.
2866
2867    ISO6429's direction specification takes the following form:
2868         o CSI ']'      -- end of the current direction
2869         o CSI '0' ']'  -- end of the current direction
2870         o CSI '1' ']'  -- start of left-to-right text
2871         o CSI '2' ']'  -- start of right-to-left text
2872    The control character CSI (0x9B: control sequence introducer) is
2873    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2874
2875    Character composition specification takes the following form:
2876         o ESC '0' -- start relative composition
2877         o ESC '1' -- end composition
2878         o ESC '2' -- start rule-base composition (*)
2879         o ESC '3' -- start relative composition with alternate chars  (**)
2880         o ESC '4' -- start rule-base composition with alternate chars  (**)
2881   Since these are not standard escape sequences of any ISO standard,
2882   the use of them with these meanings is restricted to Emacs only.
2883
2884   (*) This form is used only in Emacs 20.7 and older versions,
2885   but newer versions can safely decode it.
2886   (**) This form is used only in Emacs 21.1 and newer versions,
2887   and older versions can't decode it.
2888
2889   Here's a list of example usages of these composition escape
2890   sequences (categorized by `enum composition_method').
2891
2892   COMPOSITION_RELATIVE:
2893         ESC 0 CHAR [ CHAR ] ESC 1
2894   COMPOSITION_WITH_RULE:
2895         ESC 2 CHAR [ RULE CHAR ] ESC 1
2896   COMPOSITION_WITH_ALTCHARS:
2897         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2898   COMPOSITION_WITH_RULE_ALTCHARS:
2899         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2900
/* Table of 256 `enum iso_code_class_type' values -- presumably one
   per possible byte value, giving that byte's class for the ISO2022
   handlers (NOTE(review): confirm against the place where the table
   is filled in).  It is not `static', so it has external linkage.  */
2901 enum iso_code_class_type iso_code_class[256];
2902
/* Nonzero if charset ID is marked as safe in CODING, i.e. ID lies
   within 0..CODING->max_charset_id and its byte in
   CODING->safe_charsets is not 255.

   A negative ID (the value callers use for "no such charset") yields
   zero instead of indexing before the start of the table.  ID is
   evaluated more than once, so it must have no side effects.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2906
2907
/* Nonzero if CODING_ISO_INITIAL yields a non-negative value for
   register 1 of the coding system stored at index CATEGORY of
   coding_categories (presumably: that coding system designates some
   charset to G1 initially -- NOTE(review): confirm against the
   definition of CODING_ISO_INITIAL).  */

#define SHIFT_OUT_OK(category)  \
  (0 <= CODING_ISO_INITIAL ((coding_categories + (category)), 1))
2910
2911 static void
2912 setup_iso_safe_charsets (Lisp_Object attrs)
2913 {
2914   Lisp_Object charset_list, safe_charsets;
2915   Lisp_Object request;
2916   Lisp_Object reg_usage;
2917   Lisp_Object tail;
2918   int reg94, reg96;
2919   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2920   int max_charset_id;
2921
2922   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2923   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2924       && ! EQ (charset_list, Viso_2022_charset_list))
2925     {
2926       CODING_ATTR_CHARSET_LIST (attrs)
2927         = charset_list = Viso_2022_charset_list;
2928       ASET (attrs, coding_attr_safe_charsets, Qnil);
2929     }
2930
2931   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2932     return;
2933
2934   max_charset_id = 0;
2935   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2936     {
2937       int id = XINT (XCAR (tail));
2938       if (max_charset_id < id)
2939         max_charset_id = id;
2940     }
2941
2942   safe_charsets = make_uninit_string (max_charset_id + 1);
2943   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2944   request = AREF (attrs, coding_attr_iso_request);
2945   reg_usage = AREF (attrs, coding_attr_iso_usage);
2946   reg94 = XINT (XCAR (reg_usage));
2947   reg96 = XINT (XCDR (reg_usage));
2948
2949   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2950     {
2951       Lisp_Object id;
2952       Lisp_Object reg;
2953       struct charset *charset;
2954
2955       id = XCAR (tail);
2956       charset = CHARSET_FROM_ID (XINT (id));
2957       reg = Fcdr (Fassq (id, request));
2958       if (! NILP (reg))
2959         SSET (safe_charsets, XINT (id), XINT (reg));
2960       else if (charset->iso_chars_96)
2961         {
2962           if (reg96 < 4)
2963             SSET (safe_charsets, XINT (id), reg96);
2964         }
2965       else
2966         {
2967           if (reg94 < 4)
2968             SSET (safe_charsets, XINT (id), reg94);
2969         }
2970     }
2971   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2972 }
2973
2974
2975 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2976    Check if a text is encoded in one of ISO-2022 based coding systems.
2977    If it is, return 1, else return 0.  */
2978
2979 static int
2980 detect_coding_iso_2022 (struct coding_system *coding,
2981                         struct coding_detection_info *detect_info)
2982 {
2983   const unsigned char *src = coding->source, *src_base = src;
2984   const unsigned char *src_end = coding->source + coding->src_bytes;
2985   int multibytep = coding->src_multibyte;
2986   int single_shifting = 0;
2987   int id;
2988   int c, c1;
2989   int consumed_chars = 0;
2990   int i;
2991   int rejected = 0;
2992   int found = 0;
2993   int composition_count = -1;
2994
2995   detect_info->checked |= CATEGORY_MASK_ISO;
2996
2997   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2998     {
2999       struct coding_system *this = &(coding_categories[i]);
3000       Lisp_Object attrs, val;
3001
3002       if (this->id < 0)
3003         continue;
3004       attrs = CODING_ID_ATTRS (this->id);
3005       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3006           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3007         setup_iso_safe_charsets (attrs);
3008       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3009       this->max_charset_id = SCHARS (val) - 1;
3010       this->safe_charsets = SDATA (val);
3011     }
3012
3013   /* A coding system of this category is always ASCII compatible.  */
3014   src += coding->head_ascii;
3015
3016   while (rejected != CATEGORY_MASK_ISO)
3017     {
3018       src_base = src;
3019       ONE_MORE_BYTE (c);
3020       switch (c)
3021         {
3022         case ISO_CODE_ESC:
3023           if (inhibit_iso_escape_detection)
3024             break;
3025           single_shifting = 0;
3026           ONE_MORE_BYTE (c);
3027           if (c >= '(' && c <= '/')
3028             {
3029               /* Designation sequence for a charset of dimension 1.  */
3030               ONE_MORE_BYTE (c1);
3031               if (c1 < ' ' || c1 >= 0x80
3032                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3033                 /* Invalid designation sequence.  Just ignore.  */
3034                 break;
3035             }
3036           else if (c == '$')
3037             {
3038               /* Designation sequence for a charset of dimension 2.  */
3039               ONE_MORE_BYTE (c);
3040               if (c >= '@' && c <= 'B')
3041                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3042                 id = iso_charset_table[1][0][c];
3043               else if (c >= '(' && c <= '/')
3044                 {
3045                   ONE_MORE_BYTE (c1);
3046                   if (c1 < ' ' || c1 >= 0x80
3047                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3048                     /* Invalid designation sequence.  Just ignore.  */
3049                     break;
3050                 }
3051               else
3052                 /* Invalid designation sequence.  Just ignore it.  */
3053                 break;
3054             }
3055           else if (c == 'N' || c == 'O')
3056             {
3057               /* ESC <Fe> for SS2 or SS3.  */
3058               single_shifting = 1;
3059               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3060               break;
3061             }
3062           else if (c == '1')
3063             {
3064               /* End of composition.  */
3065               if (composition_count < 0
3066                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3067                 /* Invalid */
3068                 break;
3069               composition_count = -1;
3070               found |= CATEGORY_MASK_ISO;
3071             }
3072           else if (c >= '0' && c <= '4')
3073             {
3074               /* ESC <Fp> for start/end composition.  */
3075               composition_count = 0;
3076               break;
3077             }
3078           else
3079             {
3080               /* Invalid escape sequence.  Just ignore it.  */
3081               break;
3082             }
3083
3084           /* We found a valid designation sequence for CHARSET.  */
3085           rejected |= CATEGORY_MASK_ISO_8BIT;
3086           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3087                               id))
3088             found |= CATEGORY_MASK_ISO_7;
3089           else
3090             rejected |= CATEGORY_MASK_ISO_7;
3091           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3092                               id))
3093             found |= CATEGORY_MASK_ISO_7_TIGHT;
3094           else
3095             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3096           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3097                               id))
3098             found |= CATEGORY_MASK_ISO_7_ELSE;
3099           else
3100             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3101           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3102                               id))
3103             found |= CATEGORY_MASK_ISO_8_ELSE;
3104           else
3105             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3106           break;
3107
3108         case ISO_CODE_SO:
3109         case ISO_CODE_SI:
3110           /* Locking shift out/in.  */
3111           if (inhibit_iso_escape_detection)
3112             break;
3113           single_shifting = 0;
3114           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3115           break;
3116
3117         case ISO_CODE_CSI:
3118           /* Control sequence introducer.  */
3119           single_shifting = 0;
3120           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3121           found |= CATEGORY_MASK_ISO_8_ELSE;
3122           goto check_extra_latin;
3123
3124         case ISO_CODE_SS2:
3125         case ISO_CODE_SS3:
3126           /* Single shift.   */
3127           if (inhibit_iso_escape_detection)
3128             break;
3129           single_shifting = 0;
3130           rejected |= CATEGORY_MASK_ISO_7BIT;
3131           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3132               & CODING_ISO_FLAG_SINGLE_SHIFT)
3133             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3134           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3135               & CODING_ISO_FLAG_SINGLE_SHIFT)
3136             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3137           if (single_shifting)
3138             break;
3139           goto check_extra_latin;
3140
3141         default:
3142           if (c < 0)
3143             continue;
3144           if (c < 0x80)
3145             {
3146               if (composition_count >= 0)
3147                 composition_count++;
3148               single_shifting = 0;
3149               break;
3150             }
3151           if (c >= 0xA0)
3152             {
3153               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3154               found |= CATEGORY_MASK_ISO_8_1;
3155               /* Check the length of succeeding codes of the range
3156                  0xA0..0FF.  If the byte length is even, we include
3157                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3158                  only when we are not single shifting.  */
3159               if (! single_shifting
3160                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3161                 {
3162                   int i = 1;
3163                   while (src < src_end)
3164                     {
3165                       src_base = src;
3166                       ONE_MORE_BYTE (c);
3167                       if (c < 0xA0)
3168                         {
3169                           src = src_base;
3170                           break;
3171                         }
3172                       i++;
3173                     }
3174
3175                   if (i & 1 && src < src_end)
3176                     {
3177                       rejected |= CATEGORY_MASK_ISO_8_2;
3178                       if (composition_count >= 0)
3179                         composition_count += i;
3180                     }
3181                   else
3182                     {
3183                       found |= CATEGORY_MASK_ISO_8_2;
3184                       if (composition_count >= 0)
3185                         composition_count += i / 2;
3186                     }
3187                 }
3188               break;
3189             }
3190         check_extra_latin:
3191           single_shifting = 0;
3192           if (! VECTORP (Vlatin_extra_code_table)
3193               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3194             {
3195               rejected = CATEGORY_MASK_ISO;
3196               break;
3197             }
3198           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3199               & CODING_ISO_FLAG_LATIN_EXTRA)
3200             found |= CATEGORY_MASK_ISO_8_1;
3201           else
3202             rejected |= CATEGORY_MASK_ISO_8_1;
3203           rejected |= CATEGORY_MASK_ISO_8_2;
3204         }
3205     }
3206   detect_info->rejected |= CATEGORY_MASK_ISO;
3207   return 0;
3208
3209  no_more_source:
3210   detect_info->rejected |= rejected;
3211   detect_info->found |= (found & ~rejected);
3212   return 1;
3213 }
3214
3215
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   When FINAL is outside '0'..127, when ISO_CHARSET_TABLE has no
   charset for the triple, or when the charset is not safe for CODING,
   the register is marked with -2 and CHARS_96 is set to -1 so that the
   caller keeps the escape sequence.  CHARS_96 is also set to -1 when
   ASCII is designated to a register that held such an invalid (-2)
   designation, because that sequence has to be kept as well.

   CHARS_96 must be an lvalue; `coding' comes from the enclosing
   scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int designated = -1;                                                \
                                                                        \
    if ((final) >= '0' && (final) < 128)                                \
      designated = ISO_CHARSET_TABLE (dim, chars_96, final);            \
    if (designated < 0 || ! SAFE_CHARSET_P (coding, designated))        \
      {                                                                 \
        /* Unknown or unsafe charset: flag the register.  */            \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int previous = CODING_ISO_DESIGNATION (coding, reg);            \
                                                                        \
        /* Optionally fold JISX0201-Roman into ASCII and the 1978       \
           version of JISX0208 into the current one.  */                \
        if (designated == charset_jisx0201_roman)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              designated = charset_ascii;                               \
          }                                                             \
        else if (designated == charset_jisx0208_1978                    \
                 && (CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_USE_OLDJIS))                     \
          designated = charset_jisx0208;                                \
        CODING_ISO_DESIGNATION (coding, reg) = designated;              \
        /* ASCII replacing an invalid designation: keep this            \
           designation sequence too.  */                                \
        if (previous == -2 && designated == charset_ascii)              \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3248
3249
3250 /* Handle these composition sequences (ALT: alternate char):
3251
3252    (1) relative composition: ESC 0 CHAR ... ESC 1
3253    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3254    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3255    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3256
3257    When the start sequence (ESC 0/2/3/4) is found, this annotation
3258    header is produced.
3259
3260         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3261
3262    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3263    produced until the end sequence (ESC 1) is found:
3264
3265    (1) CHAR ... CHAR
3266    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3267    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3268    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3269
3270    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3271    annotation header are updated as below:
3272
3273    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3274    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3276    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3277
3278    If an error is found while composing, the annotation header is
3279    changed to:
3280
3281         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3282
3283    and the sequence [ -2 DECODED-RULE ] is changed to the original
3284    byte sequence as below:
3285         o the original byte sequence is B: [ B -1 ]
3286         o the original byte sequence is B1 B2: [ B1 B2 ]
3287    and the sequence [ -1 -1 ] is changed to the original byte
3288    sequence:
3289         [ ESC '0' ]
3290 */
3291
/* Decode a composition rule from C1 (a byte the enclosing code has
   already read into `c1') and, for the new format, one more byte
   fetched from the source with ONE_MORE_BYTE.  Set RULE to the encoded
   composition rule and NBYTES to the length of the composition rule in
   bytes.  If the rule is invalid, set RULE to some negative value
   (NBYTES is left untouched when C1 itself is below 32).

   An encoded rule packs two reference points as GREF * 12 + NREF,
   which is the layout ENCODE_COMPOSITION_RULE takes apart again, so
   each reference point must lie in 0..11.  In the new format both
   come straight from the input, hence they are range-checked here:
   an out-of-range value (including a negative byte delivered by
   ONE_MORE_BYTE) would otherwise be folded into a different,
   valid-looking rule.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      break;                                                            \
    if (rule < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
        nbytes = 1;                                                     \
      }                                                                 \
    else                        /* new format (after ver.21) */         \
      {                                                                 \
        int gref = (rule) - 81;                                         \
        int nref;                                                       \
                                                                        \
        ONE_MORE_BYTE (nref);                                           \
        nref -= 32;                                                     \
        if (gref < 12 && nref >= 0 && nref < 12)                        \
          /* 0x100 distinguishes it from the old format.  */            \
          rule = COMPOSITION_ENCODE_RULE (gref, nref) + 0x100;          \
        else                                                            \
          rule = -1;            /* reference point out of range */      \
        nbytes = 2;                                                     \
      }                                                                 \
  } while (0)
3322
/* Turn the encoded composition rule RULE back into source bytes.
   The result is stored in charbuf[idx] and charbuf[idx + 1], and the
   number of bytes produced is added to new_chars; `charbuf', `idx' and
   `new_chars' come from the enclosing scope.

   A rule below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx] and charbuf[idx + 1] is set to -1.  Otherwise the rule
   is in the new two-byte format and both slots receive a byte.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int packed = (rule) % 0x100;                                \
    int gref = packed / 12;                                     \
    int nref = packed % 12;                                     \
                                                                \
    if ((rule) >= 0x100)        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
    else                        /* old format */                \
      {                                                         \
        /* The old format numbers reference point 10 as 4.  */  \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
  } while (0)
3342
3343 /* Finish the current composition as invalid.  */
3344
3345 static int finish_composition (int *, struct composition_status *);
3346
3347 static int
3348 finish_composition (int *charbuf, struct composition_status *cmp_status)
3349 {
3350   int idx = - cmp_status->length;
3351   int new_chars;
3352
3353   /* Recover the original ESC sequence */
3354   charbuf[idx++] = ISO_CODE_ESC;
3355   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3356                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3357                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3358                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3359                     : '4');
3360   charbuf[idx++] = -2;
3361   charbuf[idx++] = 0;
3362   charbuf[idx++] = -1;
3363   new_chars = cmp_status->nchars;
3364   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3365     for (; idx < 0; idx++)
3366       {
3367         int elt = charbuf[idx];
3368
3369         if (elt == -2)
3370           {
3371             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3372             idx++;
3373           }
3374         else if (elt == -1)
3375           {
3376             charbuf[idx++] = ISO_CODE_ESC;
3377             charbuf[idx] = '0';
3378             new_chars += 2;
3379           }
3380       }
3381   cmp_status->state = COMPOSING_NO;
3382   return new_chars;
3383 }
3384
/* If characters are under composition (cmp_status->state is not
   COMPOSING_NO), finish the composition as invalid and advance
   char_offset by the value finish_composition returns.  `cmp_status',
   `charbuf' and `char_offset' come from the enclosing scope.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3391
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 seen at the point where the alternate characters of an
   ESC 3 or ESC 4 composition may end does not start a new
   composition: it stores the [ -1 -1 ] separator and switches to
   reading the composed characters.

   In every other case any composition still in progress is finished
   as invalid, and a new annotation header is produced with
   ADD_COMPOSITION_DATA (zero NCHARS, zero NBYTES, and the method);
   cmp_status->length restarts at MAX_ANNOTATION_LENGTH.

   `cmp_status', `charbuf' and `coding' come from the enclosing
   scope.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) != '0'                                                        \
        || ! ((cmp_status->method == COMPOSITION_WITH_ALTCHARS             \
               && cmp_status->state == COMPOSING_COMPONENT_CHAR)           \
              || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS     \
                  && cmp_status->state == COMPOSING_COMPONENT_RULE)))      \
      {                                                                    \
        /* A fresh composition begins here.  */                            \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 begin with alternate (component) chars.  */     \
        cmp_status->state                                                  \
          = ((c1) > '2' ? COMPOSING_COMPONENT_CHAR : COMPOSING_CHAR);      \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->nchars = 0;                                            \
        cmp_status->ncomps = 0;                                            \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* End of the alternate chars; composed chars follow.  */          \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3432
3433
/* Handle composition end sequence ESC 1.

   The composition is rejected (finished as invalid, then control
   jumps to `invalid_code') when no composed character has been read,
   or when the state does not fit the method: a rulebase composition
   must not end while a character is expected, and any other method
   must end while a character is expected.

   Otherwise the annotation header at charbuf[-cmp_status->length] is
   completed: its negated length grows by the number of alternate
   chars plus 2 (altchar method) or by three times that number
   (alt&rule method), and its third slot receives the number of
   composed characters, which is also added to char_offset.

   `cmp_status', `charbuf' and `char_offset' come from the enclosing
   scope.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *cmp_header;                                                    \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || (cmp_status->state == COMPOSING_CHAR                         \
            ? cmp_status->method == COMPOSITION_WITH_RULE               \
            : cmp_status->method != COMPOSITION_WITH_RULE))             \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)           \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    else if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3453
/* Store a composition rule RULE in charbuf as the pair [ -2 RULE ],
   and update cmp_status: its length grows by 2 and its state steps
   back by one.  `charbuf' and `cmp_status' come from the enclosing
   scope.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    --cmp_status->state;                \
  } while (0)
3463
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: the length grows by one, and C is counted in nchars
   when the state is COMPOSING_CHAR and in ncomps otherwise.  When a
   rule has to follow the character (rulebase method, or alt&rule
   method while reading component chars) the state advances by one.
   `charbuf' and `cmp_status' come from the enclosing scope.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS            \
        ? cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
        : cmp_status->method == COMPOSITION_WITH_RULE)                  \
      cmp_status->state++;                                              \
  } while (0)
3480
3481
3482 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3483
3484 static void
3485 decode_coding_iso_2022 (struct coding_system *coding)
3486 {
3487   const unsigned char *src = coding->source + coding->consumed;
3488   const unsigned char *src_end = coding->source + coding->src_bytes;
3489   const unsigned char *src_base;
3490   int *charbuf = coding->charbuf + coding->charbuf_used;
3491   /* We may produce two annotations (charset and composition) in one
3492      loop and one more charset annotation at the end.  */
3493   int *charbuf_end
3494     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3495   int consumed_chars = 0, consumed_chars_base;
3496   int multibytep = coding->src_multibyte;
3497   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3498   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3499   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3500   int charset_id_2, charset_id_3;
3501   struct charset *charset;
3502   int c;
3503   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3504   Lisp_Object attrs, charset_list;
3505   int char_offset = coding->produced_char;
3506   int last_offset = char_offset;
3507   int last_id = charset_ascii;
3508   int eol_crlf =
3509     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3510   int byte_after_cr = -1;
3511   int i;
3512
3513   CODING_GET_INFO (coding, attrs, charset_list);
3514   setup_iso_safe_charsets (attrs);
3515   /* Charset list may have been changed.  */
3516   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3517   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3518
3519   if (cmp_status->state != COMPOSING_NO)
3520     {
3521       for (i = 0; i < cmp_status->length; i++)
3522         *charbuf++ = cmp_status->carryover[i];
3523       coding->annotated = 1;
3524     }
3525
3526   while (1)
3527     {
3528       int c1, c2, c3;
3529
3530       src_base = src;
3531       consumed_chars_base = consumed_chars;
3532
3533       if (charbuf >= charbuf_end)
3534         {
3535           if (byte_after_cr >= 0)
3536             src_base--;
3537           break;
3538         }
3539
3540       if (byte_after_cr >= 0)
3541         c1 = byte_after_cr, byte_after_cr = -1;
3542       else
3543         ONE_MORE_BYTE (c1);
3544       if (c1 < 0)
3545         goto invalid_code;
3546
3547       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3548         {
3549           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3550           char_offset++;
3551           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3552           continue;
3553         }
3554
3555       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3556         {
3557           if (c1 == ISO_CODE_ESC)
3558             {
3559               if (src + 1 >= src_end)
3560                 goto no_more_source;
3561               *charbuf++ = ISO_CODE_ESC;
3562               char_offset++;
3563               if (src[0] == '%' && src[1] == '@')
3564                 {
3565                   src += 2;
3566                   consumed_chars += 2;
3567                   char_offset += 2;
3568                   /* We are sure charbuf can contain two more chars. */
3569                   *charbuf++ = '%';
3570                   *charbuf++ = '@';
3571                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3572                 }
3573             }
3574           else
3575             {
3576               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3577               char_offset++;
3578             }
3579           continue;
3580         }
3581
3582       if ((cmp_status->state == COMPOSING_RULE
3583            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3584           && c1 != ISO_CODE_ESC)
3585         {
3586           int rule, nbytes;
3587
3588           DECODE_COMPOSITION_RULE (rule, nbytes);
3589           if (rule < 0)
3590             goto invalid_code;
3591           STORE_COMPOSITION_RULE (rule);
3592           continue;
3593         }
3594
3595       /* We produce at most one character.  */
3596       switch (iso_code_class [c1])
3597         {
3598         case ISO_0x20_or_0x7F:
3599           if (charset_id_0 < 0
3600               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3601             /* This is SPACE or DEL.  */
3602             charset = CHARSET_FROM_ID (charset_ascii);
3603           else
3604             charset = CHARSET_FROM_ID (charset_id_0);
3605           break;
3606
3607         case ISO_graphic_plane_0:
3608           if (charset_id_0 < 0)
3609             charset = CHARSET_FROM_ID (charset_ascii);
3610           else
3611             charset = CHARSET_FROM_ID (charset_id_0);
3612           break;
3613
3614         case ISO_0xA0_or_0xFF:
3615           if (charset_id_1 < 0
3616               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3617               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3618             goto invalid_code;
3619           /* This is a graphic character, we fall down ... */
3620
3621         case ISO_graphic_plane_1:
3622           if (charset_id_1 < 0)
3623             goto invalid_code;
3624           charset = CHARSET_FROM_ID (charset_id_1);
3625           break;
3626
3627         case ISO_control_0:
3628           if (eol_crlf && c1 == '\r')
3629             ONE_MORE_BYTE (byte_after_cr);
3630           MAYBE_FINISH_COMPOSITION ();
3631           charset = CHARSET_FROM_ID (charset_ascii);
3632           break;
3633
3634         case ISO_control_1:
3635           goto invalid_code;
3636
3637         case ISO_shift_out:
3638           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3639               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3640             goto invalid_code;
3641           CODING_ISO_INVOCATION (coding, 0) = 1;
3642           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3643           continue;
3644
3645         case ISO_shift_in:
3646           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3647             goto invalid_code;
3648           CODING_ISO_INVOCATION (coding, 0) = 0;
3649           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3650           continue;
3651
3652         case ISO_single_shift_2_7:
3653           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3654             goto invalid_code;
3655         case ISO_single_shift_2:
3656           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3657             goto invalid_code;
3658           /* SS2 is handled as an escape sequence of ESC 'N' */
3659           c1 = 'N';
3660           goto label_escape_sequence;
3661
3662         case ISO_single_shift_3:
3663           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3664             goto invalid_code;
3665           /* SS2 is handled as an escape sequence of ESC 'O' */
3666           c1 = 'O';
3667           goto label_escape_sequence;
3668
3669         case ISO_control_sequence_introducer:
3670           /* CSI is handled as an escape sequence of ESC '[' ...  */
3671           c1 = '[';
3672           goto label_escape_sequence;
3673
3674         case ISO_escape:
3675           ONE_MORE_BYTE (c1);
3676         label_escape_sequence:
3677           /* Escape sequences handled here are invocation,
3678              designation, direction specification, and character
3679              composition specification.  */
3680           switch (c1)
3681             {
3682             case '&':           /* revision of following character set */
3683               ONE_MORE_BYTE (c1);
3684               if (!(c1 >= '@' && c1 <= '~'))
3685                 goto invalid_code;
3686               ONE_MORE_BYTE (c1);
3687               if (c1 != ISO_CODE_ESC)
3688                 goto invalid_code;
3689               ONE_MORE_BYTE (c1);
3690               goto label_escape_sequence;
3691
3692             case '$':           /* designation of 2-byte character set */
3693               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3694                 goto invalid_code;
3695               {
3696                 int reg, chars96;
3697
3698                 ONE_MORE_BYTE (c1);
3699                 if (c1 >= '@' && c1 <= 'B')
3700                   {     /* designation of JISX0208.1978, GB2312.1980,
3701                            or JISX0208.1980 */
3702                     reg = 0, chars96 = 0;
3703                   }
3704                 else if (c1 >= 0x28 && c1 <= 0x2B)
3705                   { /* designation of DIMENSION2_CHARS94 character set */
3706                     reg = c1 - 0x28, chars96 = 0;
3707                     ONE_MORE_BYTE (c1);
3708                   }
3709                 else if (c1 >= 0x2C && c1 <= 0x2F)
3710                   { /* designation of DIMENSION2_CHARS96 character set */
3711                     reg = c1 - 0x2C, chars96 = 1;
3712                     ONE_MORE_BYTE (c1);
3713                   }
3714                 else
3715                   goto invalid_code;
3716                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3717                 /* We must update these variables now.  */
3718                 if (reg == 0)
3719                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3720                 else if (reg == 1)
3721                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3722                 if (chars96 < 0)
3723                   goto invalid_code;
3724               }
3725               continue;
3726
3727             case 'n':           /* invocation of locking-shift-2 */
3728               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3729                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3730                 goto invalid_code;
3731               CODING_ISO_INVOCATION (coding, 0) = 2;
3732               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3733               continue;
3734
3735             case 'o':           /* invocation of locking-shift-3 */
3736               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3737                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3738                 goto invalid_code;
3739               CODING_ISO_INVOCATION (coding, 0) = 3;
3740               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3741               continue;
3742
3743             case 'N':           /* invocation of single-shift-2 */
3744               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3745                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3746                 goto invalid_code;
3747               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3748               if (charset_id_2 < 0)
3749                 charset = CHARSET_FROM_ID (charset_ascii);
3750               else
3751                 charset = CHARSET_FROM_ID (charset_id_2);
3752               ONE_MORE_BYTE (c1);
3753               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3754                 goto invalid_code;
3755               break;
3756
3757             case 'O':           /* invocation of single-shift-3 */
3758               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3759                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3760                 goto invalid_code;
3761               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3762               if (charset_id_3 < 0)
3763                 charset = CHARSET_FROM_ID (charset_ascii);
3764               else
3765                 charset = CHARSET_FROM_ID (charset_id_3);
3766               ONE_MORE_BYTE (c1);
3767               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3768                 goto invalid_code;
3769               break;
3770
3771             case '0': case '2': case '3': case '4': /* start composition */
3772               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3773                 goto invalid_code;
3774               if (last_id != charset_ascii)
3775                 {
3776                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3777                   last_id = charset_ascii;
3778                   last_offset = char_offset;
3779                 }
3780               DECODE_COMPOSITION_START (c1);
3781               continue;
3782
3783             case '1':           /* end composition */
3784               if (cmp_status->state == COMPOSING_NO)
3785                 goto invalid_code;
3786               DECODE_COMPOSITION_END ();
3787               continue;
3788
3789             case '[':           /* specification of direction */
3790               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3791                 goto invalid_code;
3792               /* For the moment, nested direction is not supported.
3793                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3794                  left-to-right, and nonzero means right-to-left.  */
3795               ONE_MORE_BYTE (c1);
3796               switch (c1)
3797                 {
3798                 case ']':       /* end of the current direction */
3799                   coding->mode &= ~CODING_MODE_DIRECTION;
3800
3801                 case '0':       /* end of the current direction */
3802                 case '1':       /* start of left-to-right direction */
3803                   ONE_MORE_BYTE (c1);
3804                   if (c1 == ']')
3805                     coding->mode &= ~CODING_MODE_DIRECTION;
3806                   else
3807                     goto invalid_code;
3808                   break;
3809
3810                 case '2':       /* start of right-to-left direction */
3811                   ONE_MORE_BYTE (c1);
3812                   if (c1 == ']')
3813                     coding->mode |= CODING_MODE_DIRECTION;
3814                   else
3815                     goto invalid_code;
3816                   break;
3817
3818                 default:
3819                   goto invalid_code;
3820                 }
3821               continue;
3822
3823             case '%':
3824               ONE_MORE_BYTE (c1);
3825               if (c1 == '/')
3826                 {
3827                   /* CTEXT extended segment:
3828                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3829                      We keep these bytes as is for the moment.
3830                      They may be decoded by post-read-conversion.  */
3831                   int dim, M, L;
3832                   int size;
3833
3834                   ONE_MORE_BYTE (dim);
3835                   if (dim < '0' || dim > '4')
3836                     goto invalid_code;
3837                   ONE_MORE_BYTE (M);
3838                   if (M < 128)
3839                     goto invalid_code;
3840                   ONE_MORE_BYTE (L);
3841                   if (L < 128)
3842                     goto invalid_code;
3843                   size = ((M - 128) * 128) + (L - 128);
3844                   if (charbuf + 6 > charbuf_end)
3845                     goto break_loop;
3846                   *charbuf++ = ISO_CODE_ESC;
3847                   *charbuf++ = '%';
3848                   *charbuf++ = '/';
3849                   *charbuf++ = dim;
3850                   *charbuf++ = BYTE8_TO_CHAR (M);
3851                   *charbuf++ = BYTE8_TO_CHAR (L);
3852                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3853                 }
3854               else if (c1 == 'G')
3855                 {
3856                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3857                      ESC % G --UTF-8-BYTES-- ESC % @
3858                      We keep these bytes as is for the moment.
3859                      They may be decoded by post-read-conversion.  */
3860                   if (charbuf + 3 > charbuf_end)
3861                     goto break_loop;
3862                   *charbuf++ = ISO_CODE_ESC;
3863                   *charbuf++ = '%';
3864                   *charbuf++ = 'G';
3865                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3866                 }
3867               else
3868                 goto invalid_code;
3869               continue;
3870               break;
3871
3872             default:
3873               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3874                 goto invalid_code;
3875               {
3876                 int reg, chars96;
3877
3878                 if (c1 >= 0x28 && c1 <= 0x2B)
3879                   { /* designation of DIMENSION1_CHARS94 character set */
3880                     reg = c1 - 0x28, chars96 = 0;
3881                     ONE_MORE_BYTE (c1);
3882                   }
3883                 else if (c1 >= 0x2C && c1 <= 0x2F)
3884                   { /* designation of DIMENSION1_CHARS96 character set */
3885                     reg = c1 - 0x2C, chars96 = 1;
3886                     ONE_MORE_BYTE (c1);
3887                   }
3888                 else
3889                   goto invalid_code;
3890                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3891                 /* We must update these variables now.  */
3892                 if (reg == 0)
3893                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3894                 else if (reg == 1)
3895                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3896                 if (chars96 < 0)
3897                   goto invalid_code;
3898               }
3899               continue;
3900             }
3901         }
3902
3903       if (cmp_status->state == COMPOSING_NO
3904           && charset->id != charset_ascii
3905           && last_id != charset->id)
3906         {
3907           if (last_id != charset_ascii)
3908             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3909           last_id = charset->id;
3910           last_offset = char_offset;
3911         }
3912
3913       /* Now we know CHARSET and 1st position code C1 of a character.
3914          Produce a decoded character while getting 2nd and 3rd
3915          position codes C2, C3 if necessary.  */
3916       if (CHARSET_DIMENSION (charset) > 1)
3917         {
3918           ONE_MORE_BYTE (c2);
3919           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3920               || ((c1 & 0x80) != (c2 & 0x80)))
3921             /* C2 is not in a valid range.  */
3922             goto invalid_code;
3923           if (CHARSET_DIMENSION (charset) == 2)
3924             c1 = (c1 << 8) | c2;
3925           else
3926             {
3927               ONE_MORE_BYTE (c3);
3928               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3929                   || ((c1 & 0x80) != (c3 & 0x80)))
3930                 /* C3 is not in a valid range.  */
3931                 goto invalid_code;
3932               c1 = (c1 << 16) | (c2 << 8) | c2;
3933             }
3934         }
3935       c1 &= 0x7F7F7F;
3936       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3937       if (c < 0)
3938         {
3939           MAYBE_FINISH_COMPOSITION ();
3940           for (; src_base < src; src_base++, char_offset++)
3941             {
3942               if (ASCII_BYTE_P (*src_base))
3943                 *charbuf++ = *src_base;
3944               else
3945                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3946             }
3947         }
3948       else if (cmp_status->state == COMPOSING_NO)
3949         {
3950           *charbuf++ = c;
3951           char_offset++;
3952         }
3953       else if ((cmp_status->state == COMPOSING_CHAR
3954                 ? cmp_status->nchars
3955                 : cmp_status->ncomps)
3956                >= MAX_COMPOSITION_COMPONENTS)
3957         {
3958           /* Too long composition.  */
3959           MAYBE_FINISH_COMPOSITION ();
3960           *charbuf++ = c;
3961           char_offset++;
3962         }
3963       else
3964         STORE_COMPOSITION_CHAR (c);
3965       continue;
3966
3967     invalid_code:
3968       MAYBE_FINISH_COMPOSITION ();
3969       src = src_base;
3970       consumed_chars = consumed_chars_base;
3971       ONE_MORE_BYTE (c);
3972       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3973       char_offset++;
3974       coding->errors++;
3975       continue;
3976
3977     break_loop:
3978       break;
3979     }
3980
3981  no_more_source:
3982   if (cmp_status->state != COMPOSING_NO)
3983     {
3984       if (coding->mode & CODING_MODE_LAST_BLOCK)
3985         MAYBE_FINISH_COMPOSITION ();
3986       else
3987         {
3988           charbuf -= cmp_status->length;
3989           for (i = 0; i < cmp_status->length; i++)
3990             cmp_status->carryover[i] = charbuf[i];
3991         }
3992     }
3993   else if (last_id != charset_ascii)
3994     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3995   coding->consumed_char += consumed_chars_base;
3996   coding->consumed = src_base - coding->source;
3997   coding->charbuf_used = charbuf - coding->charbuf;
3998 }
3999
4000
4001 /* ISO2022 encoding stuff.  */
4002
4003 /*
4004    It is not enough to say just "ISO2022" when encoding; we have to
4005    specify more details.  In Emacs, each coding system of the ISO2022
4006    variant has the following specifications:
4007         1. Initial designation to G0 thru G3.
4008         2. Allows short-form designation?
4009         3. ASCII should be designated to G0 before control characters?
4010         4. ASCII should be designated to G0 at end of line?
4011         5. 7-bit environment or 8-bit environment?
4012         6. Use locking-shift?
4013         7. Use Single-shift?
4014    And the following two are only for Japanese:
4015         8. Use ASCII in place of JIS0201-1976-Roman?
4016         9. Use JISX0208-1983 in place of JISX0208-1978?
4017    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4018    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4019    details.
4020 */
4021
4022 /* Produce codes (escape sequence) for designating CHARSET to graphic
4023    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4024    '@', 'A', or 'B' and the coding system CODING allows, produce
4025    designation sequence of short-form.

        The sequence is built in this order:
          - if CODING has CODING_ISO_FLAG_REVISION set and the ISO
            revision of CHARSET is non-negative, ESC '&' ('@' + revision);
          - ESC;
          - for a CHARSET of dimension 1, one intermediate byte picked by
            REG from "()*+" (94-char set) or ",-./" (96-char set);
          - for any other dimension, '$' followed by the intermediate
            byte; that byte is left out only for a 94-char set designated
            to REG 0 whose final byte is '@'..'B' while
            CODING_ISO_FLAG_LONG_FORM is off (the short form);
          - the final byte of CHARSET.

        REG indexes four-character tables, so it must be in 0..3.
        Afterwards the designation of REG recorded in CODING is the id of
        CHARSET.  The bytes are written through the EMIT_* macros, which
        are defined elsewhere; presumably they need DST and the
        byte-counting locals of the expansion site -- confirm there.  */
4026
4027 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4028   do {                                                                  \
4029     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4030     const char *intermediate_char_94 = "()*+";                          \
4031     const char *intermediate_char_96 = ",-./";                          \
4032     int revision = -1;                                                  \
4033     int c;                                                              \
4034                                                                         \
         /* Use the revision of CHARSET only if CODING wants it.  */        \
4035     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4036       revision = CHARSET_ISO_REVISION (charset);                        \
4037                                                                         \
4038     if (revision >= 0)                                                  \
4039       {                                                                 \
4040         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4041         EMIT_ONE_BYTE ('@' + revision);                                 \
4042       }                                                                 \
         /* The designation sequence proper starts here.  */                \
4043     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4044     if (CHARSET_DIMENSION (charset) == 1)                               \
4045       {                                                                 \
4046         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4047           c = intermediate_char_94[reg];                                \
4048         else                                                            \
4049           c = intermediate_char_96[reg];                                \
4050         EMIT_ONE_ASCII_BYTE (c);                                        \
4051       }                                                                 \
4052     else                                                                \
4053       {                                                                 \
4054         EMIT_ONE_ASCII_BYTE ('$');                                      \
4055         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4056           {                                                             \
                 /* Short form: no intermediate byte when none of these  \
                    conditions holds.  */                                \
4057             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4058                 || reg != 0                                             \
4059                 || final_char < '@' || final_char > 'B')                \
4060               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4061           }                                                             \
4062         else                                                            \
4063           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4064       }                                                                 \
4065     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4066                                                                         \
         /* Remember what REG now holds.  */                                \
4067     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4068   } while (0)
4069
4070
4071 /* The following two macros produce codes (control character or escape
4072    sequence) for ISO2022 single-shift functions (single-shift-2 and
4073    single-shift-3).

        When the variable `coding' of the expansion site has
        CODING_ISO_FLAG_SEVEN_BITS set, the two-byte form is produced
        (ESC 'N' for single-shift-2, ESC 'O' for single-shift-3);
        otherwise the single byte ISO_CODE_SS2 or ISO_CODE_SS3 is
        produced.  Both macros then set CODING_ISO_SINGLE_SHIFTING
        (coding) to 1; the ENCODE_ISO_CHARACTER_DIMENSION1/2 macros test
        that flag when producing the next character and clear it.  */
4074
4075 #define ENCODE_SINGLE_SHIFT_2                                           \
4076   do {                                                                  \
4077     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4078       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4079     else                                                                \
4080       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4081     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4082   } while (0)
4083
4084
4085 #define ENCODE_SINGLE_SHIFT_3                                           \
4086   do {                                                                  \
4087     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4088       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4089     else                                                                \
4090       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4091     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4092   } while (0)
4093
4094
4095 /* The following four macros produce codes (control character or
4096    escape sequence) for ISO2022 locking-shift functions (shift-in,
4097    shift-out, locking-shift-2, and locking-shift-3).

        Shift-in produces ISO_CODE_SI, shift-out produces ISO_CODE_SO and
        locking-shift-2 produces ESC 'n'.  Each macro also records, in
        CODING_ISO_INVOCATION (coding, 0), the number of the graphic
        register that is now invoked to graphic plane 0 (0 for shift-in,
        1 for shift-out, 2 for locking-shift-2).  They use the variable
        `coding' of the expansion site.  */
4098
4099 #define ENCODE_SHIFT_IN                                 \
4100   do {                                                  \
4101     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4102     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4103   } while (0)
4104
4105
4106 #define ENCODE_SHIFT_OUT                                \
4107   do {                                                  \
4108     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4109     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4110   } while (0)
4111
4112
4113 #define ENCODE_LOCKING_SHIFT_2                          \
4114   do {                                                  \
4115     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4116     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4117   } while (0)
4118
4119
4120 #define ENCODE_LOCKING_SHIFT_3                          \
4121   do {                                                  \
         /* Locking-shift-3 is ESC 'o'; ESC 'n' would be     \
            locking-shift-2 and invoke the wrong register.   \
            Then record that graphic register 3 is invoked   \
            to graphic plane 0.  */                          \
4122     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4123     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4124   } while (0)
4125
4126
4127 /* Produce codes for a DIMENSION1 character whose character set is
4128    CHARSET and whose position-code is C1.  Designation and invocation
4129    sequences are also produced in advance if necessary.

        CHARSET must be a modifiable lvalue: when the variable `coding'
        of the expansion site has CODING_ISO_FLAG_USE_ROMAN set and
        CHARSET is ASCII, CHARSET is replaced by the charset whose id is
        charset_jisx0201_roman.

        The body is a loop.  The byte is produced, and the loop left,
        when a single-shift is pending (the pending flag is cleared) or
        when CHARSET is the charset invoked to graphic plane 0 (byte
        produced with the high bit cleared) or plane 1 (high bit set).
        Otherwise encode_invocation_designation is called, updating the
        variables `dst' and `produced_chars' of the expansion site, and
        the tests are tried again.  C1 is evaluated once per byte
        produced.  */
4130
4131 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4132   do {                                                                  \
4133     int id = CHARSET_ID (charset);                                      \
4134                                                                         \
4135     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4136         && id == charset_ascii)                                         \
4137       {                                                                 \
4138         id = charset_jisx0201_roman;                                    \
4139         charset = CHARSET_FROM_ID (id);                                 \
4140       }                                                                 \
4141                                                                         \
4142     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4143       {                                                                 \
4144         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4145           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4146         else                                                            \
4147           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4148         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4149         break;                                                          \
4150       }                                                                 \
4151     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4152       {                                                                 \
4153         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4154         break;                                                          \
4155       }                                                                 \
4156     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4157       {                                                                 \
4158         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4159         break;                                                          \
4160       }                                                                 \
4161     else                                                                \
4162       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4163          must invoke it, or, at first, designate it to some graphic     \
4164          register.  Then repeat the loop to actually produce the        \
4165          character.  */                                                 \
4166       dst = encode_invocation_designation (charset, coding, dst,        \
4167                                            &produced_chars);            \
4168   } while (1)
4169
4170
4171 /* Produce codes for a DIMENSION2 character whose character set is
4172    CHARSET and whose position-codes are C1 and C2.  Designation and
4173    invocation codes are also produced in advance if necessary.

        CHARSET must be a modifiable lvalue: when the variable `coding'
        of the expansion site has CODING_ISO_FLAG_USE_OLDJIS set and
        CHARSET is charset_jisx0208, CHARSET is replaced by the charset
        whose id is charset_jisx0208_1978.

        The body is a loop.  The two bytes are produced, and the loop
        left, when a single-shift is pending (the pending flag is
        cleared) or when CHARSET is the charset invoked to graphic plane
        0 (both bytes with the high bit cleared) or plane 1 (both bytes
        with the high bit set).  Otherwise encode_invocation_designation
        is called, updating the variables `dst' and `produced_chars' of
        the expansion site, and the tests are tried again.  */
4174
4175 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4176   do {                                                                  \
4177     int id = CHARSET_ID (charset);                                      \
4178                                                                         \
4179     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4180         && id == charset_jisx0208)                                      \
4181       {                                                                 \
4182         id = charset_jisx0208_1978;                                     \
4183         charset = CHARSET_FROM_ID (id);                                 \
4184       }                                                                 \
4185                                                                         \
4186     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4187       {                                                                 \
4188         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4189           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4190         else                                                            \
4191           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4192         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4193         break;                                                          \
4194       }                                                                 \
4195     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4196       {                                                                 \
4197         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4198         break;                                                          \
4199       }                                                                 \
4200     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4201       {                                                                 \
4202         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4203         break;                                                          \
4204       }                                                                 \
4205     else                                                                \
4206       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4207          must invoke it, or, at first, designate it to some graphic     \
4208          register.  Then repeat the loop to actually produce the        \
4209          character.  */                                                 \
4210       dst = encode_invocation_designation (charset, coding, dst,        \
4211                                            &produced_chars);            \
4212   } while (1)
4213
4214
4215 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
       /* Produce codes for character C of CHARSET.  The code point of C     \
          in CHARSET is obtained with ENCODE_CHAR.  A CHARSET of dimension   \
          1 is produced as one byte; any other CHARSET is produced as two    \
          bytes, the high one being `code >> 8' and the low one              \
          `code & 0xFF'.  CHARSET must be a modifiable lvalue because the    \
          DIMENSION1/DIMENSION2 macros may assign to it.  */                 \
4216   do {                                                                     \
4217     int code = ENCODE_CHAR ((charset), (c));                               \
4218                                                                            \
4219     if (CHARSET_DIMENSION (charset) == 1)                                  \
4220       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4221     else                                                                   \
4222       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4223   } while (0)
4224
4225
4226 /* Produce designation and invocation codes at a place pointed by DST
4227    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4228    Return new DST.

        First, if CHARSET is not designated to any of the four graphic
        registers, it is designated to the register it requests through
        CODING_ISO_REQUEST, or to register 0 when it requests none.
        Second, if that register is invoked to neither graphic plane 0
        nor graphic plane 1, an invocation is produced: shift-in for
        register 0, shift-out for register 1, and for registers 2 and 3
        a single-shift when CODING has CODING_ISO_FLAG_SINGLE_SHIFT set,
        a locking-shift otherwise.

        *P_NCHARS is read on entry as the running count of produced
        characters and is written back with the updated count on
        return.  */
4229
4230 unsigned char *
4231 encode_invocation_designation (struct charset *charset,
4232                                struct coding_system *coding,
4233                                unsigned char *dst, int *p_nchars)
4234 {
       /* MULTIBYTEP and PRODUCED_CHARS are not named again except for the
          final store; presumably the EMIT_* macros used by the ENCODE_*
          macros below refer to them -- confirm at their definitions.  */
4235   int multibytep = coding->dst_multibyte;
4236   int produced_chars = *p_nchars;
4237   int reg;                      /* graphic register number */
4238   int id = CHARSET_ID (charset);
4239
4240   /* At first, check designations.  */
4241   for (reg = 0; reg < 4; reg++)
4242     if (id == CODING_ISO_DESIGNATION (coding, reg))
4243       break;
4244
4245   if (reg >= 4)
4246     {
4247       /* CHARSET is not yet designated to any graphic registers.  */
4248       /* At first check the requested designation.  */
4249       reg = CODING_ISO_REQUEST (coding, id);
4250       if (reg < 0)
4251         /* Since CHARSET requests no special designation, designate it
4252            to graphic register 0.  */
4253         reg = 0;
4254
4255       ENCODE_DESIGNATION (charset, reg, coding);
4256     }
4257
4258   if (CODING_ISO_INVOCATION (coding, 0) != reg
4259       && CODING_ISO_INVOCATION (coding, 1) != reg)
4260     {
4261       /* Since the graphic register REG is not invoked to any graphic
4262          planes, invoke it to graphic plane 0.  */
4263       switch (reg)
4264         {
4265         case 0:                 /* graphic register 0 */
4266           ENCODE_SHIFT_IN;
4267           break;
4268
4269         case 1:                 /* graphic register 1 */
4270           ENCODE_SHIFT_OUT;
4271           break;
4272
4273         case 2:                 /* graphic register 2 */
4274           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4275             ENCODE_SINGLE_SHIFT_2;
4276           else
4277             ENCODE_LOCKING_SHIFT_2;
4278           break;
4279
4280         case 3:                 /* graphic register 3 */
4281           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4282             ENCODE_SINGLE_SHIFT_3;
4283           else
4284             ENCODE_LOCKING_SHIFT_3;
4285           break;
4286         }
4287     }
4288
       /* Hand the updated character count back to the caller.  */
4289   *p_nchars = produced_chars;
4290   return dst;
4291 }
4292
4293 /* The following three macros produce codes for indicating direction
4294    of text.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER produces the control sequence
        introducer: ESC '[' when the variable `coding' of the expansion
        site has CODING_ISO_FLAG_SEVEN_BITS set, the single byte
        ISO_CODE_CSI otherwise.  The flag is tested as a bit of the flag
        word, like everywhere else in this file; an equality test would
        succeed only if no other flag were set.

        ENCODE_DIRECTION_R2L produces the introducer followed by "2]"
        (start of right-to-left text) and ENCODE_DIRECTION_L2R the
        introducer followed by "0]" (end of the current direction).
        The introducer macro takes no argument, so it is invoked
        without one.  */
4295 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
4296   do {                                                                  \
4297     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4298       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
4299     else                                                                \
4300       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
4301   } while (0)
4302
4303
4304 #define ENCODE_DIRECTION_R2L()                  \
4305   do {                                          \
4306     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4307     EMIT_TWO_ASCII_BYTES ('2', ']');            \
4308   } while (0)
4309
4310
4311 #define ENCODE_DIRECTION_L2R()                  \
4312   do {                                          \
4313     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
4314     EMIT_TWO_ASCII_BYTES ('0', ']');            \
4315   } while (0)
4316
4317
4318 /* Produce codes for designation and invocation to reset the graphic
4319    planes and registers to initial state.

        A shift-in is produced when the register invoked to graphic
        plane 0 is not register 0.  Then, for each of the four graphic
        registers whose initial charset (CODING_ISO_INITIAL) is
        non-negative and differs from the charset currently designated to
        it, the designation of the initial charset is produced.  Uses the
        variable `coding' of the expansion site.  */
4320 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4321   do {                                                                  \
4322     int reg;                                                            \
4323     struct charset *charset;                                            \
4324                                                                         \
4325     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4326       ENCODE_SHIFT_IN;                                                  \
4327     for (reg = 0; reg < 4; reg++)                                       \
4328       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4329           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4330               != CODING_ISO_INITIAL (coding, reg)))                     \
4331         {                                                               \
4332           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4333           ENCODE_DESIGNATION (charset, reg, coding);                    \
4334         }                                                               \
4335   } while (0)
4336
4337
4338 /* Produce designation sequences of charsets in the line started from
4339    CHARBUF to a place pointed by DST, and return updated DST.
4340
4341    If the current block ends before any end-of-line, we may fail to
4342    find all the necessary designations.

        The characters in [CHARBUF, CHARBUF_END) are scanned up to the
        first newline; the scan also stops once a charset has been found
        for all four graphic registers.  For each register, the first
        character whose charset requests that register (through
        CODING_ISO_REQUEST) decides the charset.  A designation sequence
        is produced only for the registers found this way whose current
        designation differs from the charset found.  Characters that
        belong to no charset of the coding system's charset list are
        ignored.  */
4343
4344 static unsigned char *
4345 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4346                            int *charbuf_end, unsigned char *dst)
4347 {
4348   struct charset *charset;
4349   /* Table of charsets to be designated to each graphic register.  */
4350   int r[4];
4351   int c, found = 0, reg;
       /* PRODUCED_CHARS and MULTIBYTEP are not named below; presumably the
          EMIT_* macros inside ENCODE_DESIGNATION refer to them -- confirm
          at their definitions.  The count kept here is not returned.  */
4352   int produced_chars = 0;
4353   int multibytep = coding->dst_multibyte;
4354   Lisp_Object attrs;
4355   Lisp_Object charset_list;
4356
4357   attrs = CODING_ID_ATTRS (coding->id);
4358   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4359   if (EQ (charset_list, Qiso_2022))
4360     charset_list = Viso_2022_charset_list;
4361
4362   for (reg = 0; reg < 4; reg++)
4363     r[reg] = -1;
4364
       /* Never read at or beyond CHARBUF_END: a block that holds no
          newline must end the scan instead of overrunning the buffer.  */
4365   while (found < 4 && charbuf < charbuf_end)
4366     {
4367       int id;
4368
4369       c = *charbuf++;
4370       if (c == '\n')
4371         break;
4372       charset = char_charset (c, charset_list, NULL);
           if (! charset)
             /* No charset of the list contains C, so C cannot request
                any graphic register.  */
             continue;
4373       id = CHARSET_ID (charset);
4374       reg = CODING_ISO_REQUEST (coding, id);
4375       if (reg >= 0 && r[reg] < 0)
4376         {
4377           found++;
4378           r[reg] = id;
4379         }
4380     }
4381
4382   if (found)
4383     {
4384       for (reg = 0; reg < 4; reg++)
4385         if (r[reg] >= 0
4386             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4387           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4388     }
4389
4390   return dst;
4391 }
4392
4393 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the elements of CODING->charbuf (CODING->charbuf_used of
        them) as ISO2022 bytes stored from CODING->destination +
        CODING->produced.  A negative element starts an annotation; a
        charset annotation selects a preferred charset for the characters
        that follow, a composition annotation is skipped.  Control
        characters, ASCII characters, raw-byte characters and all other
        characters are handled by the four branches of the main loop.
        On the last block, the reset sequence is produced when CODING has
        CODING_ISO_FLAG_RESET_AT_EOL set.

        On return the conversion result is recorded as
        CODING_RESULT_SUCCESS, the beginning-of-line state is saved with
        CODING_ISO_BOL, and CODING->produced_char and CODING->produced
        are updated.  Always return 0.  */
4394
4395 static int
4396 encode_coding_iso_2022 (struct coding_system *coding)
4397 {
4398   int multibytep = coding->dst_multibyte;
4399   int *charbuf = coding->charbuf;
4400   int *charbuf_end = charbuf + coding->charbuf_used;
4401   unsigned char *dst = coding->destination + coding->produced;
4402   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each element;
          presumably the room, in bytes, that one element may need --
          confirm at the definition of ASSURE_DESTINATION.  */
4403   int safe_room = 16;
       /* Nonzero when designations must be produced before the next
          character because it starts a line.  */
4404   int bol_designation
4405     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4406        && CODING_ISO_BOL (coding));
4407   int produced_chars = 0;
4408   Lisp_Object attrs, eol_type, charset_list;
4409   int ascii_compatible;
4410   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4411   int preferred_charset_id = -1;
4412
4413   CODING_GET_INFO (coding, attrs, charset_list);
4414   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4415   if (VECTORP (eol_type))
4416     eol_type = Qunix;
4417
4418   setup_iso_safe_charsets (attrs);
4419   /* Charset list may have been changed.  */
4420   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4421   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4422
       /* ASCII characters may be emitted as they are only when the coding
          system is marked ASCII compatible and uses neither designation
          nor locking-shift.  */
4423   ascii_compatible
4424     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4425        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4426                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4427
4428   while (charbuf < charbuf_end)
4429     {
4430       ASSURE_DESTINATION (safe_room);
4431
4432       if (bol_designation)
4433         {
4434           unsigned char *dst_prev = dst;
4435
4436           /* We have to produce designation sequences if any now.  */
4437           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4438           bol_designation = 0;
4439           /* We are sure that designation sequences are all ASCII bytes.  */
4440           produced_chars += dst - dst_prev;
4441         }
4442
4443       c = *charbuf++;
4444
4445       if (c < 0)
4446         {
4447           /* Handle an annotation.  */
               /* CHARBUF now points just after C; the element there tells
                  the kind of the annotation.  */
4448           switch (*charbuf)
4449             {
4450             case CODING_ANNOTATE_COMPOSITION_MASK:
4451               /* Not yet implemented.  */
4452               break;
4453             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Ignore the preferred charset unless its id is a
                      member of the charset list.  */
4454               preferred_charset_id = charbuf[2];
4455               if (preferred_charset_id >= 0
4456                   && NILP (Fmemq (make_number (preferred_charset_id),
4457                                   charset_list)))
4458                 preferred_charset_id = -1;
4459               break;
4460             default:
4461               abort ();
4462             }
               /* Skip the rest of the annotation: it occupies -C elements
                  in all and C itself is already consumed.  */
4463           charbuf += -c - 1;
4464           continue;
4465         }
4466
4467       /* Now encode the character C.  */
4468       if (c < 0x20 || c == 0x7F)
4469         {
4470           if (c == '\n'
4471               || (c == '\r' && EQ (eol_type, Qmac)))
4472             {
4473               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4474                 ENCODE_RESET_PLANE_AND_REGISTER ();
4475               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4476                 {
4477                   int i;
4478
                       /* Only the recorded designations are reset here;
                          nothing is produced.  */
4479                   for (i = 0; i < 4; i++)
4480                     CODING_ISO_DESIGNATION (coding, i)
4481                       = CODING_ISO_INITIAL (coding, i);
4482                 }
4483               bol_designation
4484                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4485             }
4486           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4487             ENCODE_RESET_PLANE_AND_REGISTER ();
4488           EMIT_ONE_ASCII_BYTE (c);
4489         }
4490       else if (ASCII_CHAR_P (c))
4491         {
4492           if (ascii_compatible)
4493             EMIT_ONE_ASCII_BYTE (c);
4494           else
4495             {
4496               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4497               ENCODE_ISO_CHARACTER (charset, c);
4498             }
4499         }
4500       else if (CHAR_BYTE8_P (c))
4501         {
               /* A raw-byte character is produced as the byte itself.  */
4502           c = CHAR_TO_BYTE8 (c);
4503           EMIT_ONE_BYTE (c);
4504         }
4505       else
4506         {
4507           struct charset *charset;
4508
               /* Use the preferred charset when it contains C, otherwise
                  look C up in the charset list.  */
4509           if (preferred_charset_id >= 0)
4510             {
4511               charset = CHARSET_FROM_ID (preferred_charset_id);
4512               if (! CHAR_CHARSET_P (c, charset))
4513                 charset = char_charset (c, charset_list, NULL);
4514             }
4515           else
4516             charset = char_charset (c, charset_list, NULL);
4517           if (!charset)
4518             {
                   /* C is not encodable: substitute
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding
                      mode, the default character of CODING otherwise.  */
4519               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4520                 {
4521                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4522                   charset = CHARSET_FROM_ID (charset_ascii);
4523                 }
4524               else
4525                 {
4526                   c = coding->default_char;
4527                   charset = char_charset (c, charset_list, NULL);
4528                 }
4529             }
4530           ENCODE_ISO_CHARACTER (charset, c);
4531         }
4532     }
4533
4534   if (coding->mode & CODING_MODE_LAST_BLOCK
4535       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4536     {
4537       ASSURE_DESTINATION (safe_room);
4538       ENCODE_RESET_PLANE_AND_REGISTER ();
4539     }
4540   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4541   CODING_ISO_BOL (coding) = bol_designation;
4542   coding->produced_char += produced_chars;
4543   coding->produced = dst - coding->destination;
4544   return 0;
4545 }
4546
4547 \f
4548 /*** 8,9. SJIS and BIG5 handlers ***/
4549
4550 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4551    quite widely.  So, for the moment, Emacs supports them in the bare
4552    C code.  But, in the future, they may be supported only by CCL.  */
4553
4554 /* SJIS is a coding system encoding three character sets: ASCII, right
4555    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4556    as is.  A character of charset katakana-jisx0201 is encoded by
4557    "position-code + 0x80".  A character of charset japanese-jisx0208
4558    is encoded in two bytes, but its two position-codes are divided and
4559    shifted so that they fit in the ranges below.
4560
4561    --- CODE RANGE of SJIS ---
4562    (character set)      (range)
4563    ASCII                0x00 .. 0x7F
4564    KATAKANA-JISX0201    0xA0 .. 0xDF
4565    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4566             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4567    -------------------------------
4568
4569 */
4570
4571 /* BIG5 is a coding system encoding two character sets: ASCII and
4572    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4573    character set and is encoded in two bytes.
4574
4575    --- CODE RANGE of BIG5 ---
4576    (character set)      (range)
4577    ASCII                0x00 .. 0x7F
4578    Big5 (1st byte)      0xA1 .. 0xFE
4579         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4580    --------------------------
4581
4582   */
4583
4584 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4585    Check if a text is encoded in SJIS.  Return 1 when the source is
4586    exhausted without contradicting SJIS, after merging any SJIS
        evidence into DETECT_INFO->found.  Otherwise (an invalid byte, or
        a character cut short in the last block) set CATEGORY_MASK_SJIS
        in DETECT_INFO->rejected and return 0.  */
4587
4588 static int
4589 detect_coding_sjis (struct coding_system *coding,
4590                     struct coding_detection_info *detect_info)
4591 {
4592   const unsigned char *src = coding->source, *src_base;
4593   const unsigned char *src_end = coding->source + coding->src_bytes;
4594   int multibytep = coding->src_multibyte;
4595   int consumed_chars = 0;
4596   int found = 0;
4597   int c;
4598   Lisp_Object attrs, charset_list;
4599   int max_first_byte_of_2_byte_code;
4600
4601   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list longer than three entries widens the accepted
          lead-byte range from 0xE0..0xEF to 0xE0..0xFC.  */
4602   max_first_byte_of_2_byte_code
4603     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4604
4605   detect_info->checked |= CATEGORY_MASK_SJIS;
4606   /* A coding system of this category is always ASCII compatible.  */
4607   src += coding->head_ascii;
4608
4609   while (1)
4610     {
           /* Remember where the current character starts.  */
4611       src_base = src;
4612       ONE_MORE_BYTE (c);
           /* Values below 0x80 are neither evidence nor a contradiction.  */
4613       if (c < 0x80)
4614         continue;
4615       if ((c >= 0x81 && c <= 0x9F)
4616           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4617         {
               /* Lead byte of a two-byte code: the trail byte must lie
                  in 0x40..0xFC and must not be 0x7F.  */
4618           ONE_MORE_BYTE (c);
4619           if (c < 0x40 || c == 0x7F || c > 0xFC)
4620             break;
4621           found = CATEGORY_MASK_SJIS;
4622         }
4623       else if (c >= 0xA0 && c < 0xE0)
             /* A single byte in 0xA0..0xDF also counts as evidence.  */
4624         found = CATEGORY_MASK_SJIS;
4625       else
4626         break;
4627     }
       /* Reached only via `break': a byte that cannot appear in SJIS.  */
4628   detect_info->rejected |= CATEGORY_MASK_SJIS;
4629   return 0;
4630
       /* No `goto' in this function names this label; presumably
          ONE_MORE_BYTE (defined outside this chunk) jumps here when SRC
          reaches SRC_END -- TODO confirm.  */
4631  no_more_source:
       /* SRC_BASE < SRC means the source ended in the middle of a
          character; in the last block that rejects SJIS.  */
4632   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4633     {
4634       detect_info->rejected |= CATEGORY_MASK_SJIS;
4635       return 0;
4636     }
4637   detect_info->found |= found;
4638   return 1;
4639 }
4640
4641 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4642    Check if a text is encoded in BIG5.  Return 1 when the source is
4643    exhausted without contradicting BIG5, after merging any BIG5
        evidence into DETECT_INFO->found; otherwise return 0.  */
4644
4645 static int
4646 detect_coding_big5 (struct coding_system *coding,
4647                     struct coding_detection_info *detect_info)
4648 {
4649   const unsigned char *src = coding->source, *src_base;
4650   const unsigned char *src_end = coding->source + coding->src_bytes;
4651   int multibytep = coding->src_multibyte;
4652   int consumed_chars = 0;
4653   int found = 0;
4654   int c;
4655
4656   detect_info->checked |= CATEGORY_MASK_BIG5;
4657   /* A coding system of this category is always ASCII compatible.  */
4658   src += coding->head_ascii;
4659
4660   while (1)
4661     {
           /* Remember where the current character starts.  */
4662       src_base = src;
4663       ONE_MORE_BYTE (c);
4664       if (c < 0x80)
4665         continue;
4666       if (c >= 0xA1)
4667         {
               /* Lead byte of a two-byte code; check the trail byte.  */
4668           ONE_MORE_BYTE (c);
               /* NOTE(review): unlike the other failure paths of this
                  function, this one returns 0 without setting
                  CATEGORY_MASK_BIG5 in DETECT_INFO->rejected, and a
                  trail byte of 0xFF is not refused here -- confirm
                  whether both are intended.  */
4669           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4670             return 0;
4671           found = CATEGORY_MASK_BIG5;
4672         }
4673       else
             /* 0x80..0xA0 cannot start a BIG5 character.  */
4674         break;
4675     }
4676   detect_info->rejected |= CATEGORY_MASK_BIG5;
4677   return 0;
4678
       /* No `goto' in this function names this label; presumably
          ONE_MORE_BYTE (defined outside this chunk) jumps here when SRC
          reaches SRC_END -- TODO confirm.  */
4679  no_more_source:
       /* SRC_BASE < SRC means the source ended in the middle of a
          character; in the last block that rejects BIG5.  */
4680   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4681     {
4682       detect_info->rejected |= CATEGORY_MASK_BIG5;
4683       return 0;
4684     }
4685   detect_info->found |= found;
4686   return 1;
4687 }
4688
4689 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4690    Decode SJIS text from CODING->source into CODING->charbuf.  The
        first three entries of the charset list are used as the roman,
        kana and kanji charsets; an optional fourth entry is used for
        lead bytes 0xF0..0xFC.  A byte sequence that cannot be decoded
        is stored one byte at a time and counted in CODING->errors.  */
4691
4692 static void
4693 decode_coding_sjis (struct coding_system *coding)
4694 {
4695   const unsigned char *src = coding->source + coding->consumed;
4696   const unsigned char *src_end = coding->source + coding->src_bytes;
4697   const unsigned char *src_base;
4698   int *charbuf = coding->charbuf + coding->charbuf_used;
4699   /* We may produce one charset annotation in one loop and one more at
4700      the end.  */
4701   int *charbuf_end
4702     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4703   int consumed_chars = 0, consumed_chars_base;
4704   int multibytep = coding->src_multibyte;
4705   struct charset *charset_roman, *charset_kanji, *charset_kana;
4706   struct charset *charset_kanji2;
4707   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the run of non-ASCII charset currently open.  */
4708   int char_offset = coding->produced_char;
4709   int last_offset = char_offset;
4710   int last_id = charset_ascii;
4711   int eol_crlf =
4712     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Holds the byte read ahead after a '\r', or -1 when none is
          pending.  */
4713   int byte_after_cr = -1;
4714
4715   CODING_GET_INFO (coding, attrs, charset_list);
4716
       /* Pick the charsets by position in the list; the fourth is
          optional.  */
4717   val = charset_list;
4718   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4719   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4720   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4721   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4722
4723   while (1)
4724     {
4725       int c, c1;
4726       struct charset *charset;
4727
           /* Record the start of this character so that a failed or
              incomplete decode can be rewound.  */
4728       src_base = src;
4729       consumed_chars_base = consumed_chars;
4730
4731       if (charbuf >= charbuf_end)
4732         {
               /* The byte read ahead after '\r' has not been decoded
                  yet; step back so it is not reported as consumed.  */
4733           if (byte_after_cr >= 0)
4734             src_base--;
4735           break;
4736         }
4737
4738       if (byte_after_cr >= 0)
4739         c = byte_after_cr, byte_after_cr = -1;
4740       else
4741         ONE_MORE_BYTE (c);
4742       if (c < 0)
4743         goto invalid_code;
4744       if (c < 0x80)
4745         {
               /* With DOS end-of-line conversion, read the byte that
                  follows '\r' ahead of time.  */
4746           if (eol_crlf && c == '\r')
4747             ONE_MORE_BYTE (byte_after_cr);
4748           charset = charset_roman;
4749         }
4750       else if (c == 0x80 || c == 0xA0)
4751         goto invalid_code;
4752       else if (c >= 0xA1 && c <= 0xDF)
4753         {
4754           /* SJIS -> JISX0201-Kana */
4755           c &= 0x7F;
4756           charset = charset_kana;
4757         }
4758       else if (c <= 0xEF)
4759         {
4760           /* SJIS -> JISX0208 */
               /* Lead bytes reaching here are 0x81..0x9F or 0xE0..0xEF.  */
4761           ONE_MORE_BYTE (c1);
4762           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4763             goto invalid_code;
4764           c = (c << 8) | c1;
4765           SJIS_TO_JIS (c);
4766           charset = charset_kanji;
4767         }
4768       else if (c <= 0xFC && charset_kanji2)
4769         {
4770           /* SJIS -> JISX0213-2 */
               /* Lead bytes 0xF0..0xFC, only when a fourth charset was
                  given.  */
4771           ONE_MORE_BYTE (c1);
4772           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4773             goto invalid_code;
4774           c = (c << 8) | c1;
4775           SJIS_TO_JIS2 (c);
4776           charset = charset_kanji2;
4777         }
4778       else
4779         goto invalid_code;
           /* On a switch to a different non-ASCII charset, close the
              previous non-ASCII run with a charset annotation and start
              a new run at the current character.  */
4780       if (charset->id != charset_ascii
4781           && last_id != charset->id)
4782         {
4783           if (last_id != charset_ascii)
4784             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4785           last_id = charset->id;
4786           last_offset = char_offset;
4787         }
4788       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4789       *charbuf++ = c;
4790       char_offset++;
4791       continue;
4792
4793     invalid_code:
           /* Rewind to the start of the character, take a single byte,
              and store it as a raw-byte character (or the negated value
              when ONE_MORE_BYTE yielded a negative one).  */
4794       src = src_base;
4795       consumed_chars = consumed_chars_base;
4796       ONE_MORE_BYTE (c);
4797       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4798       char_offset++;
4799       coding->errors++;
4800     }
4801
       /* Also reached by falling out of the loop above.  No `goto' in
          this function names the label; presumably ONE_MORE_BYTE jumps
          here when the source is exhausted -- TODO confirm.  */
4802  no_more_source:
       /* Close the last open charset run, then report progress up to
          the start of the last character that was not completed.  */
4803   if (last_id != charset_ascii)
4804     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4805   coding->consumed_char += consumed_chars_base;
4806   coding->consumed = src_base - coding->source;
4807   coding->charbuf_used = charbuf - coding->charbuf;
4808 }
4809
     /* Decode BIG5 text from CODING->source into CODING->charbuf.  The
        first two entries of the charset list are used as the roman and
        Big5 charsets.  A byte sequence that cannot be decoded is stored
        one byte at a time and counted in CODING->errors.  */
4810 static void
4811 decode_coding_big5 (struct coding_system *coding)
4812 {
4813   const unsigned char *src = coding->source + coding->consumed;
4814   const unsigned char *src_end = coding->source + coding->src_bytes;
4815   const unsigned char *src_base;
4816   int *charbuf = coding->charbuf + coding->charbuf_used;
4817   /* We may produce one charset annotation in one loop and one more at
4818      the end.  */
4819   int *charbuf_end
4820     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4821   int consumed_chars = 0, consumed_chars_base;
4822   int multibytep = coding->src_multibyte;
4823   struct charset *charset_roman, *charset_big5;
4824   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the run of non-ASCII charset currently open.  */
4825   int char_offset = coding->produced_char;
4826   int last_offset = char_offset;
4827   int last_id = charset_ascii;
4828   int eol_crlf =
4829     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Holds the byte read ahead after a '\r', or -1 when none is
          pending.  */
4830   int byte_after_cr = -1;
4831
4832   CODING_GET_INFO (coding, attrs, charset_list);
4833   val = charset_list;
4834   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4835   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4836
4837   while (1)
4838     {
4839       int c, c1;
4840       struct charset *charset;
4841
           /* Record the start of this character so that a failed or
              incomplete decode can be rewound.  */
4842       src_base = src;
4843       consumed_chars_base = consumed_chars;
4844
4845       if (charbuf >= charbuf_end)
4846         {
               /* The byte read ahead after '\r' has not been decoded
                  yet; step back so it is not reported as consumed.  */
4847           if (byte_after_cr >= 0)
4848             src_base--;
4849           break;
4850         }
4851
4852       if (byte_after_cr >= 0)
4853         c = byte_after_cr, byte_after_cr = -1;
4854       else
4855         ONE_MORE_BYTE (c);
4856
4857       if (c < 0)
4858         goto invalid_code;
4859       if (c < 0x80)
4860         {
               /* With DOS end-of-line conversion, read the byte that
                  follows '\r' ahead of time.  */
4861           if (eol_crlf && c == '\r')
4862             ONE_MORE_BYTE (byte_after_cr);
4863           charset = charset_roman;
4864         }
4865       else
4866         {
4867           /* BIG5 -> Big5 */
               /* Lead byte 0xA1..0xFE, trail byte 0x40..0x7E or
                  0xA1..0xFE; the two are combined into a 16-bit code.  */
4868           if (c < 0xA1 || c > 0xFE)
4869             goto invalid_code;
4870           ONE_MORE_BYTE (c1);
4871           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4872             goto invalid_code;
4873           c = c << 8 | c1;
4874           charset = charset_big5;
4875         }
           /* On a switch to a different non-ASCII charset, close the
              previous non-ASCII run with a charset annotation and start
              a new run at the current character.  */
4876       if (charset->id != charset_ascii
4877           && last_id != charset->id)
4878         {
4879           if (last_id != charset_ascii)
4880             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4881           last_id = charset->id;
4882           last_offset = char_offset;
4883         }
4884       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4885       *charbuf++ = c;
4886       char_offset++;
4887       continue;
4888
4889     invalid_code:
           /* Rewind to the start of the character, take a single byte,
              and store it as a raw-byte character (or the negated value
              when ONE_MORE_BYTE yielded a negative one).  */
4890       src = src_base;
4891       consumed_chars = consumed_chars_base;
4892       ONE_MORE_BYTE (c);
4893       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4894       char_offset++;
4895       coding->errors++;
4896     }
4897
       /* Also reached by falling out of the loop above.  No `goto' in
          this function names the label; presumably ONE_MORE_BYTE jumps
          here when the source is exhausted -- TODO confirm.  */
4898  no_more_source:
       /* Close the last open charset run, then report progress up to
          the start of the last character that was not completed.  */
4899   if (last_id != charset_ascii)
4900     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4901   coding->consumed_char += consumed_chars_base;
4902   coding->consumed = src_base - coding->source;
4903   coding->charbuf_used = charbuf - coding->charbuf;
4904 }
4905
4906 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4907    Encode the characters in CODING->charbuf as SJIS.  The first three
4908    entries of the charset list are taken as the roman, kana and kanji
4909    charsets, with an optional fourth (second kanji) charset.  A
4910    character of the kanji charset is emitted as two bytes, one of the
4911    kana charset as its code with bit 0x80 set; see the body for the
4912    second kanji charset and for the remaining cases.  Returns 0.  */
4913
4914 static int
4915 encode_coding_sjis (struct coding_system *coding)
4916 {
4917   int multibytep = coding->dst_multibyte;
4918   int *charbuf = coding->charbuf;
4919   int *charbuf_end = charbuf + coding->charbuf_used;
4920   unsigned char *dst = coding->destination + coding->produced;
4921   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4922   int safe_room = 4;
4923   int produced_chars = 0;
4924   Lisp_Object attrs, charset_list, val;
4925   int ascii_compatible;
4926   struct charset *charset_roman, *charset_kanji, *charset_kana;
4927   struct charset *charset_kanji2;
4928   int c;
4929
4930   CODING_GET_INFO (coding, attrs, charset_list);
       /* Pick the charsets by position in the list; the fourth is
          optional.  CHARSET_ROMAN is assigned here but not read again
          in this function.  */
4931   val = charset_list;
4932   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4933   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4934   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4935   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4936
4937   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4938
4939   while (charbuf < charbuf_end)
4940     {
4941       ASSURE_DESTINATION (safe_room);
4942       c = *charbuf++;
4943       /* Now encode the character C.  */
4944       if (ASCII_CHAR_P (c) && ascii_compatible)
4945         EMIT_ONE_ASCII_BYTE (c);
4946       else if (CHAR_BYTE8_P (c))
4947         {
               /* A raw-byte character is written back as its byte.  */
4948           c = CHAR_TO_BYTE8 (c);
4949           EMIT_ONE_BYTE (c);
4950         }
4951       else
4952         {
4953           unsigned code;
4954           struct charset *charset = char_charset (c, charset_list, &code);
4955
4956           if (!charset)
4957             {
                   /* C belongs to none of the listed charsets.  */
4958               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4959                 {
4960                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4961                   charset = CHARSET_FROM_ID (charset_ascii);
4962                 }
4963               else
4964                 {
                       /* NOTE(review): if the default character is not
                          in the charset list either, CHARSET would
                          still be null when passed to
                          CHARSET_INVALID_CODE below -- confirm that
                          this cannot happen.  */
4965                   c = coding->default_char;
4966                   charset = char_charset (c, charset_list, &code);
4967                 }
4968             }
4969           if (code == CHARSET_INVALID_CODE (charset))
4970             abort ();
4971           if (charset == charset_kanji)
4972             {
4973               int c1, c2;
4974               JIS_TO_SJIS (code);
4975               c1 = code >> 8, c2 = code & 0xFF;
4976               EMIT_TWO_BYTES (c1, c2);
4977             }
4978           else if (charset == charset_kana)
4979             EMIT_ONE_BYTE (code | 0x80);
4980           else if (charset_kanji2 && charset == charset_kanji2)
4981             {
4982               int c1, c2;
4983
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F or at least 0x6E are converted
                      with JIS_TO_SJIS2; any other code is emitted as
                      its low seven bits.  */
4984               c1 = code >> 8;
4985               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4986                   || c1 == 0x28
4987                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4988                 {
4989                   JIS_TO_SJIS2 (code);
4990                   c1 = code >> 8, c2 = code & 0xFF;
4991                   EMIT_TWO_BYTES (c1, c2);
4992                 }
4993               else
4994                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4995             }
4996           else
                 /* Any other charset: emit the low seven bits of CODE.  */
4997             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4998         }
4999     }
5000   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5001   coding->produced_char += produced_chars;
5002   coding->produced = dst - coding->destination;
5003   return 0;
5004 }
5005
     /* Encode the characters in CODING->charbuf as BIG5.  The first two
        entries of the charset list are taken as the roman and Big5
        charsets.  A character of the Big5 charset is emitted as the two
        bytes of its code; see the body for the remaining cases.
        Returns 0.  */
5006 static int
5007 encode_coding_big5 (struct coding_system *coding)
5008 {
5009   int multibytep = coding->dst_multibyte;
5010   int *charbuf = coding->charbuf;
5011   int *charbuf_end = charbuf + coding->charbuf_used;
5012   unsigned char *dst = coding->destination + coding->produced;
5013   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
5014   int safe_room = 4;
5015   int produced_chars = 0;
5016   Lisp_Object attrs, charset_list, val;
5017   int ascii_compatible;
5018   struct charset *charset_roman, *charset_big5;
5019   int c;
5020
5021   CODING_GET_INFO (coding, attrs, charset_list);
       /* Pick the charsets by position in the list.  CHARSET_ROMAN is
          assigned here but not read again in this function.  */
5022   val = charset_list;
5023   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5024   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5025   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5026
5027   while (charbuf < charbuf_end)
5028     {
5029       ASSURE_DESTINATION (safe_room);
5030       c = *charbuf++;
5031       /* Now encode the character C.  */
5032       if (ASCII_CHAR_P (c) && ascii_compatible)
5033         EMIT_ONE_ASCII_BYTE (c);
5034       else if (CHAR_BYTE8_P (c))
5035         {
               /* A raw-byte character is written back as its byte.  */
5036           c = CHAR_TO_BYTE8 (c);
5037           EMIT_ONE_BYTE (c);
5038         }
5039       else
5040         {
5041           unsigned code;
5042           struct charset *charset = char_charset (c, charset_list, &code);
5043
5044           if (! charset)
5045             {
                   /* C belongs to none of the listed charsets.  */
5046               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5047                 {
5048                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5049                   charset = CHARSET_FROM_ID (charset_ascii);
5050                 }
5051               else
5052                 {
                       /* NOTE(review): if the default character is not
                          in the charset list either, CHARSET would
                          still be null when passed to
                          CHARSET_INVALID_CODE below -- confirm that
                          this cannot happen.  */
5053                   c = coding->default_char;
5054                   charset = char_charset (c, charset_list, &code);
5055                 }
5056             }
5057           if (code == CHARSET_INVALID_CODE (charset))
5058             abort ();
5059           if (charset == charset_big5)
5060             {
5061               int c1, c2;
5062
                   /* The high and low bytes of CODE are the two output
                      bytes.  */
5063               c1 = code >> 8, c2 = code & 0xFF;
5064               EMIT_TWO_BYTES (c1, c2);
5065             }
5066           else
                 /* Any other charset: emit the low seven bits of CODE.  */
5067             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5068         }
5069     }
5070   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5071   coding->produced_char += produced_chars;
5072   coding->produced = dst - coding->destination;
5073   return 0;
5074 }
5075
5076 \f
5077 /*** 10. CCL handlers ***/
5078
5079 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5080    Check if a text is encoded in a coding system of which
5081    encoder/decoder are written in CCL program.  Return 1 when the
5082    source is exhausted without meeting an invalid byte, after merging
        any evidence into DETECT_INFO->found; otherwise set
        CATEGORY_MASK_CCL in DETECT_INFO->rejected and return 0.  */
5083
5084 static int
5085 detect_coding_ccl (struct coding_system *coding,
5086                    struct coding_detection_info *detect_info)
5087 {
5088   const unsigned char *src = coding->source, *src_base;
5089   const unsigned char *src_end = coding->source + coding->src_bytes;
5090   int multibytep = coding->src_multibyte;
5091   int consumed_chars = 0;
5092   int found = 0;
5093   unsigned char *valids;
5094   int head_ascii = coding->head_ascii;
5095   Lisp_Object attrs;
5096
5097   detect_info->checked |= CATEGORY_MASK_CCL;
5098
       /* From here on CODING refers to the coding system registered for
          the CCL category, not to the argument; the source pointers and
          HEAD_ASCII above were taken from the argument.  */
5099   coding = &coding_categories[coding_category_ccl];
5100   valids = CODING_CCL_VALIDS (coding);
5101   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII part only for an ASCII-compatible
          coding system.  */
5102   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5103     src += head_ascii;
5104
5105   while (1)
5106     {
5107       int c;
5108
5109       src_base = src;
5110       ONE_MORE_BYTE (c);
           /* A zero entry in VALIDS marks a byte the coding system
              cannot contain; an entry above 1 counts as evidence.  */
5111       if (c < 0 || ! valids[c])
5112         break;
5113       if ((valids[c] > 1))
5114         found = CATEGORY_MASK_CCL;
5115     }
5116   detect_info->rejected |= CATEGORY_MASK_CCL;
5117   return 0;
5118
       /* No `goto' in this function names this label; presumably
          ONE_MORE_BYTE (defined outside this chunk) jumps here when SRC
          reaches SRC_END -- TODO confirm.  */
5119  no_more_source:
5120   detect_info->found |= found;
5121   return 1;
5122 }
5123
     /* Decode CODING's source by running its CCL program.  The source
        is staged in chunks of at most 1024 elements, handed to
        ccl_driver, and the output is appended to CODING->charbuf.  The
        final CCL status is translated into a conversion result.  */
5124 static void
5125 decode_coding_ccl (struct coding_system *coding)
5126 {
5127   const unsigned char *src = coding->source + coding->consumed;
5128   const unsigned char *src_end = coding->source + coding->src_bytes;
5129   int *charbuf = coding->charbuf + coding->charbuf_used;
5130   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5131   int consumed_chars = 0;
5132   int multibytep = coding->src_multibyte;
5133   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* SOURCE_BYTEIDX[i] is the byte offset from SRC of the i-th
          staged character; it has one extra slot for the offset just
          past the last one.  It is filled only for multibyte sources.  */
5134   int source_charbuf[1024];
5135   int source_byteidx[1025];
5136   Lisp_Object attrs, charset_list;
5137
5138   CODING_GET_INFO (coding, attrs, charset_list);
5139
5140   while (1)
5141     {
5142       const unsigned char *p = src;
5143       int i = 0;
5144
           /* Stage the next chunk: whole characters for a multibyte
              source, single bytes otherwise.  */
5145       if (multibytep)
5146         {
5147           while (i < 1024 && p < src_end)
5148             {
5149               source_byteidx[i] = p - src;
5150               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5151             }
5152           source_byteidx[i] = p - src;
5153         }
5154       else
5155         while (i < 1024 && p < src_end)
5156           source_charbuf[i++] = *p++;
5157
           /* Tell the CCL program when this chunk ends the last block.  */
5158       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5159         ccl->last_block = 1;
5160       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5161                   charset_list);
5162       charbuf += ccl->produced;
           /* Advance SRC by the bytes that correspond to the elements
              the program consumed.  */
5163       if (multibytep)
5164         src += source_byteidx[ccl->consumed];
5165       else
5166         src += ccl->consumed;
5167       consumed_chars += ccl->consumed;
           /* Go around again only if more source remains and the
              program stopped because it ran out of staged input.  */
5168       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5169         break;
5170     }
5171
5172   switch (ccl->status)
5173     {
5174     case CCL_STAT_SUSPEND_BY_SRC:
5175       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5176       break;
5177     case CCL_STAT_SUSPEND_BY_DST:
5178       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5179       break;
5180     case CCL_STAT_QUIT:
5181     case CCL_STAT_INVALID_CMD:
5182       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5183       break;
5184     default:
5185       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5186       break;
5187     }
5188   coding->consumed_char += consumed_chars;
5189   coding->consumed = src - coding->source;
5190   coding->charbuf_used = charbuf - coding->charbuf;
5191 }
5192
     /* Encode the characters in CODING->charbuf by running CODING's CCL
        program.  Each ccl_driver call writes at most 1024 values into a
        local buffer, and the low eight bits of each value are written
        to the destination.  The final CCL status is translated into a
        conversion result.  Returns 0.  */
5193 static int
5194 encode_coding_ccl (struct coding_system *coding)
5195 {
5196   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5197   int multibytep = coding->dst_multibyte;
5198   int *charbuf = coding->charbuf;
5199   int *charbuf_end = charbuf + coding->charbuf_used;
5200   unsigned char *dst = coding->destination + coding->produced;
5201   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5202   int destination_charbuf[1024];
5203   int i, produced_chars = 0;
5204   Lisp_Object attrs, charset_list;
5205
5206   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5207   if (coding->consumed_char == coding->src_chars
5208       && coding->mode & CODING_MODE_LAST_BLOCK)
5209     ccl->last_block = 1;
5210
5211   while (charbuf < charbuf_end)
5212     {
5213       ccl_driver (ccl, charbuf, destination_charbuf,
5214                   charbuf_end - charbuf, 1024, charset_list);
5215       if (multibytep)
5216         {
               /* Twice the produced count is requested here --
                  presumably because EMIT_ONE_BYTE may write two bytes
                  per value into a multibyte destination; TODO confirm.  */
5217           ASSURE_DESTINATION (ccl->produced * 2);
5218           for (i = 0; i < ccl->produced; i++)
5219             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5220         }
5221       else
5222         {
5223           ASSURE_DESTINATION (ccl->produced);
5224           for (i = 0; i < ccl->produced; i++)
5225             *dst++ = destination_charbuf[i] & 0xFF;
5226           produced_chars += ccl->produced;
5227         }
5228       charbuf += ccl->consumed;
           /* Stop early when the program quit or hit an invalid
              command.  */
5229       if (ccl->status == CCL_STAT_QUIT
5230           || ccl->status == CCL_STAT_INVALID_CMD)
5231         break;
5232     }
5233
5234   switch (ccl->status)
5235     {
5236     case CCL_STAT_SUSPEND_BY_SRC:
5237       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5238       break;
5239     case CCL_STAT_SUSPEND_BY_DST:
5240       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5241       break;
5242     case CCL_STAT_QUIT:
5243     case CCL_STAT_INVALID_CMD:
5244       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5245       break;
5246     default:
5247       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5248       break;
5249     }
5250
5251   coding->produced_char += produced_chars;
5252   coding->produced = dst - coding->destination;
5253   return 0;
5254 }
5255
5256
5257 \f
5258 /*** 10, 11. no-conversion handlers ***/
5259
5260 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        No bytes are converted here: the function sets
        CODING->chars_at_source and reports the whole source as
        consumed.  With DOS end-of-line conversion enabled, a trailing
        '\r' is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is
        recorded (presumably so that it can be examined together with
        the first byte of the next block).  */
5261
5262 static void
5263 decode_coding_raw_text (struct coding_system *coding)
5264 {
5265   int eol_crlf =
5266     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5267
5268   coding->chars_at_source = 1;
5269   coding->consumed_char = coding->src_chars;
5270   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source, index SRC_BYTES - 1 would lie before the start
          of the buffer.  */
5271   if (eol_crlf && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5272     {
5273       coding->consumed_char--;
5274       coding->consumed--;
5275       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5276     }
5277   else
5278     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5279 }
5280
     /* Copy the contents of CODING->charbuf to the destination without
        charset conversion.  The four combinations of multibyte or
        unibyte destination and source are handled separately below.
        Returns 0.  */
5281 static int
5282 encode_coding_raw_text (struct coding_system *coding)
5283 {
5284   int multibytep = coding->dst_multibyte;
5285   int *charbuf = coding->charbuf;
5286   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5287   unsigned char *dst = coding->destination + coding->produced;
5288   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5289   int produced_chars = 0;
5290   int c;
5291
5292   if (multibytep)
5293     {
5294       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5295
5296       if (coding->src_multibyte)
             /* Multibyte destination, multibyte source: ASCII and
                raw-byte characters are emitted as one byte; any other
                character is expanded with CHAR_STRING_ADVANCE and each
                resulting byte is emitted separately.  */
5297         while (charbuf < charbuf_end)
5298           {
5299             ASSURE_DESTINATION (safe_room);
5300             c = *charbuf++;
5301             if (ASCII_CHAR_P (c))
5302               EMIT_ONE_ASCII_BYTE (c);
5303             else if (CHAR_BYTE8_P (c))
5304               {
5305                 c = CHAR_TO_BYTE8 (c);
5306                 EMIT_ONE_BYTE (c);
5307               }
5308             else
5309               {
5310                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5311
5312                 CHAR_STRING_ADVANCE (c, p1);
5313                 while (p0 < p1)
5314                   {
5315                     EMIT_ONE_BYTE (*p0);
5316                     p0++;
5317                   }
5318               }
5319           }
5320       else
             /* Multibyte destination, unibyte source: every element is
                emitted as one byte.  */
5321         while (charbuf < charbuf_end)
5322           {
5323             ASSURE_DESTINATION (safe_room);
5324             c = *charbuf++;
5325             EMIT_ONE_BYTE (c);
5326           }
5327     }
5328   else
5329     {
5330       if (coding->src_multibyte)
5331         {
               /* Unibyte destination, multibyte source: ASCII and
                  raw-byte characters are stored as one byte; any other
                  character is stored by CHAR_STRING_ADVANCE.  */
5332           int safe_room = MAX_MULTIBYTE_LENGTH;
5333
5334           while (charbuf < charbuf_end)
5335             {
5336               ASSURE_DESTINATION (safe_room);
5337               c = *charbuf++;
5338               if (ASCII_CHAR_P (c))
5339                 *dst++ = c;
5340               else if (CHAR_BYTE8_P (c))
5341                 *dst++ = CHAR_TO_BYTE8 (c);
5342               else
5343                 CHAR_STRING_ADVANCE (c, dst);
5344             }
5345         }
5346       else
5347         {
               /* Unibyte destination, unibyte source: request room for
                  everything at once, then copy element by element.  */
5348           ASSURE_DESTINATION (charbuf_end - charbuf);
5349           while (charbuf < charbuf_end && dst < dst_end)
5350             *dst++ = *charbuf++;
5351         }
           /* For a unibyte destination the count is the number of bytes
              written by this call.  */
5352       produced_chars = dst - (coding->destination + coding->produced);
5353     }
5354   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5355   coding->produced_char += produced_chars;
5356   coding->produced = dst - coding->destination;
5357   return 0;
5358 }
5359
5360 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5361    Check if a text is encoded in a charset-based coding system.  If it
5362    is, return 1, else return 0.  On return 1 any evidence is merged
        into DETECT_INFO->found; on return 0 CATEGORY_MASK_CHARSET is
        set in DETECT_INFO->rejected.  */
5363
5364 static int
5365 detect_coding_charset (struct coding_system *coding,
5366                        struct coding_detection_info *detect_info)
5367 {
5368   const unsigned char *src = coding->source, *src_base;
5369   const unsigned char *src_end = coding->source + coding->src_bytes;
5370   int multibytep = coding->src_multibyte;
5371   int consumed_chars = 0;
5372   Lisp_Object attrs, valids, name;
5373   int found = 0;
5374   int head_ascii = coding->head_ascii;
5375   int check_latin_extra = 0;
5376
5377   detect_info->checked |= CATEGORY_MASK_CHARSET;
5378
       /* From here on CODING refers to the coding system registered for
          the charset category, not to the argument; the source pointers
          and HEAD_ASCII above were taken from the argument.  */
5379   coding = &coding_categories[coding_category_charset];
5380   attrs = CODING_ID_ATTRS (coding->id);
5381   valids = AREF (attrs, coding_attr_charset_valids);
5382   name = CODING_ID_NAME (coding->id);
       /* Coding systems named iso-8859-* or iso-latin-* get the extra
          check on bytes 0x80..0x9F below.  */
5383   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5384                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5385       || strncmp (SSDATA (SYMBOL_NAME (name)),
5386                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5387     check_latin_extra = 1;
5388
5389   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5390     src += head_ascii;
5391
5392   while (1)
5393     {
5394       int c;
5395       Lisp_Object val;
5396       struct charset *charset;
5397       int dim, idx;
5398
5399       src_base = src;
5400       ONE_MORE_BYTE (c);
5401       if (c < 0)
5402         continue;
           /* VALIDS maps a first byte to nil (invalid), a charset id,
              or a list of candidate charset ids.  */
5403       val = AREF (valids, c);
5404       if (NILP (val))
5405         break;
5406       if (c >= 0x80)
5407         {
               /* For the Latin coding systems, a byte in 0x80..0x9F is
                  accepted only if Vlatin_extra_code_table lists it.  */
5408           if (c < 0xA0
5409               && check_latin_extra
5410               && (!VECTORP (Vlatin_extra_code_table)
5411                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5412             break;
5413           found = CATEGORY_MASK_CHARSET;
5414         }
5415       if (INTEGERP (val))
5416         {
               /* A single charset: each following byte must fall inside
                  the range of its dimension.  */
5417           charset = CHARSET_FROM_ID (XFASTINT (val));
5418           dim = CHARSET_DIMENSION (charset);
5419           for (idx = 1; idx < dim; idx++)
5420             {
5421               if (src == src_end)
5422                 goto too_short;
5423               ONE_MORE_BYTE (c);
                   /* code_space is indexed with a stride of 4 per
                      dimension, as in the list branch below; a stride
                      of 2 read the wrong entries for charsets of more
                      than two dimensions.  */
5424               if (c < charset->code_space[(dim - 1 - idx) * 4]
5425                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5426                 break;
5427             }
5428           if (idx < dim)
5429             break;
5430         }
5431       else
5432         {
               /* A list of candidate charsets: accept as soon as one of
                  them accounts for all of its bytes.  */
5433           idx = 1;
5434           for (; CONSP (val); val = XCDR (val))
5435             {
5436               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5437               dim = CHARSET_DIMENSION (charset);
5438               while (idx < dim)
5439                 {
5440                   if (src == src_end)
5441                     goto too_short;
5442                   ONE_MORE_BYTE (c);
5443                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5444                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5445                     break;
5446                   idx++;
5447                 }
5448               if (idx == dim)
5449                 {
                       /* Matched: clear VAL so the check after the loop
                          does not reject.  */
5450                   val = Qnil;
5451                   break;
5452                 }
5453             }
               /* VAL is still a cons only if the match above never
                  cleared it.  */
5454           if (CONSP (val))
5455             break;
5456         }
5457     }
       /* Reached by `break' above or by an explicit jump when the
          source ends inside a multi-byte character.  */
5458  too_short:
5459   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5460   return 0;
5461
       /* No `goto' in this function names this label; presumably
          ONE_MORE_BYTE (defined outside this chunk) jumps here when SRC
          reaches SRC_END -- TODO confirm.  */
5462  no_more_source:
5463   detect_info->found |= found;
5464   return 1;
5465 }
5466
/* Decode the bytes of CODING->source, starting at CODING->consumed,
   according to the charset-based coding system CODING, appending the
   decoded characters to CODING->charbuf.  Runs of characters of one
   non-ASCII charset are recorded with ADD_CHARSET_DATA.  A byte
   sequence that can't be decoded is stored one byte at a time and
   counted in CODING->errors.  On exit, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  int char_offset = coding->produced_char;
  int last_offset = char_offset;
  int last_id = charset_ascii;
  /* Nonzero if a byte following CR must be read ahead (dos-style eol
     with eol conversion enabled).  */
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR, or -1 if there is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  /* NOTE(review): ONE_MORE_BYTE presumably jumps to `no_more_source'
     when the source is exhausted (the label has no explicit goto in
     this function) -- confirm against the macro definition.  */
  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this character starts so that the state can be
         rolled back to here.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The pending read-ahead byte is not consumed yet.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VAL is nil, a charset ID, or a list of charset IDs that may
         start with the byte C.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          /* Accumulate the remaining bytes of the code point, most
             significant byte first.  */
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this charset decoded CODE.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          /* The charset changed: close the annotation of the previous
             non-ASCII run and start a new run here.  */
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Roll back to the start of the sequence and emit just its
         first byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the annotation of the last non-ASCII run, if any.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5594
5595 static int
5596 encode_coding_charset (struct coding_system *coding)
5597 {
5598   int multibytep = coding->dst_multibyte;
5599   int *charbuf = coding->charbuf;
5600   int *charbuf_end = charbuf + coding->charbuf_used;
5601   unsigned char *dst = coding->destination + coding->produced;
5602   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5603   int safe_room = MAX_MULTIBYTE_LENGTH;
5604   int produced_chars = 0;
5605   Lisp_Object attrs, charset_list;
5606   int ascii_compatible;
5607   int c;
5608
5609   CODING_GET_INFO (coding, attrs, charset_list);
5610   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5611
5612   while (charbuf < charbuf_end)
5613     {
5614       struct charset *charset;
5615       unsigned code;
5616
5617       ASSURE_DESTINATION (safe_room);
5618       c = *charbuf++;
5619       if (ascii_compatible && ASCII_CHAR_P (c))
5620         EMIT_ONE_ASCII_BYTE (c);
5621       else if (CHAR_BYTE8_P (c))
5622         {
5623           c = CHAR_TO_BYTE8 (c);
5624           EMIT_ONE_BYTE (c);
5625         }
5626       else
5627         {
5628           charset = char_charset (c, charset_list, &code);
5629           if (charset)
5630             {
5631               if (CHARSET_DIMENSION (charset) == 1)
5632                 EMIT_ONE_BYTE (code);
5633               else if (CHARSET_DIMENSION (charset) == 2)
5634                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5635               else if (CHARSET_DIMENSION (charset) == 3)
5636                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5637               else
5638                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5639                                  (code >> 8) & 0xFF, code & 0xFF);
5640             }
5641           else
5642             {
5643               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5644                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5645               else
5646                 c = coding->default_char;
5647               EMIT_ONE_BYTE (c);
5648             }
5649         }
5650     }
5651
5652   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5653   coding->produced_char += produced_chars;
5654   coding->produced = dst - coding->destination;
5655   return 0;
5656 }
5657
5658 \f
/*** 10. C library functions ***/
5660
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is used instead.  If
   CODING_SYSTEM is invalid, signal an error.

   This resets CODING's mode, head_ascii, common_flags, safe charset
   information, default character and carryover byte count, and then
   installs the detector, decoder and encoder functions that
   correspond to the type of the coding system, together with the
   type-specific initial state.  */

void
setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  /* When eol conversion is inhibited, behave as if the eol type were
     unix.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* A vector eol_type means the eol format is not decided yet, so it
     must be detected while decoding.  Any decided eol format other
     than unix needs conversion in both directions.  */
  if (VECTORP (eol_type))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_DETECTION_MASK);
  else if (! EQ (eol_type, Qunix))
    coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                            | CODING_REQUIRE_ENCODING_MASK);
  else
    coding->common_flags = 0;
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string; its length minus one is
     taken as the maximum charset ID.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
  coding->carryover_bytes = 0;

  /* Install the handlers and the initial state for each type.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* Until the real coding system is detected, text is handled as
         raw text.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Reload the safe-charsets attribute after the call below,
             which is given ATTRS to work on.  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
      CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
      CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
      CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
      CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      /* The BOM attribute is a cons (detect the BOM), t (with BOM),
         or anything else (without BOM).  */
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                   : EQ (val, Qt) ? utf_with_bom
                                   : utf_without_bom);
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      val = AREF (attrs, coding_attr_utf_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
                                    : EQ (val, Qt) ? utf_with_bom
                                    : utf_without_bom);
      /* Anything other than `big' selects little endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      /* NOTE(review): full_support is set to 1 here and again inside
         the conditional below, so the second assignment has no
         effect -- confirm which of the two is intended.  */
      coding->spec.emacs_mule.full_support = 1;
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          /* Build a safe-charsets string covering all charsets in
             Vemacs_mule_charset_list: every slot is 255 except those
             of the listed charsets, which are 0.  */
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = make_uninit_string (max_charset_id + 1);
          memset (SDATA (safe_charsets), 255, max_charset_id + 1);
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          coding->safe_charsets = SDATA (safe_charsets);
          coding->spec.emacs_mule.full_support = 1;
        }
      coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
      coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Raw text needs conversion only for eol handling: decoding
         unless the eol type is unix, encoding only when the eol type
         is also decided.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5865
5866 /* Return a list of charsets supported by CODING.  */
5867
5868 Lisp_Object
5869 coding_charset_list (struct coding_system *coding)
5870 {
5871   Lisp_Object attrs, charset_list;
5872
5873   CODING_GET_INFO (coding, attrs, charset_list);
5874   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5875     {
5876       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5877
5878       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5879         charset_list = Viso_2022_charset_list;
5880     }
5881   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5882     {
5883       charset_list = Vemacs_mule_charset_list;
5884     }
5885   return charset_list;
5886 }
5887
5888
5889 /* Return a list of charsets supported by CODING-SYSTEM.  */
5890
5891 Lisp_Object
5892 coding_system_charset_list (Lisp_Object coding_system)
5893 {
5894   int id;
5895   Lisp_Object attrs, charset_list;
5896
5897   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5898   attrs = CODING_ID_ATTRS (id);
5899
5900   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5901     {
5902       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5903
5904       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5905         charset_list = Viso_2022_charset_list;
5906       else
5907         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5908     }
5909   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5910     {
5911       charset_list = Vemacs_mule_charset_list;
5912     }
5913   else
5914     {
5915       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5916     }
5917   return charset_list;
5918 }
5919
5920
5921 /* Return raw-text or one of its subsidiaries that has the same
5922    eol_type as CODING-SYSTEM.  */
5923
5924 Lisp_Object
5925 raw_text_coding_system (Lisp_Object coding_system)
5926 {
5927   Lisp_Object spec, attrs;
5928   Lisp_Object eol_type, raw_text_eol_type;
5929
5930   if (NILP (coding_system))
5931     return Qraw_text;
5932   spec = CODING_SYSTEM_SPEC (coding_system);
5933   attrs = AREF (spec, 0);
5934
5935   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5936     return coding_system;
5937
5938   eol_type = AREF (spec, 2);
5939   if (VECTORP (eol_type))
5940     return Qraw_text;
5941   spec = CODING_SYSTEM_SPEC (Qraw_text);
5942   raw_text_eol_type = AREF (spec, 2);
5943   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5944           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5945           : AREF (raw_text_eol_type, 2));
5946 }
5947
5948
5949 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5950    the subsidiary that has the same eol-spec as PARENT (if it is not
5951    nil and specifies end-of-line format) or the system's setting
5952    (system_eol_type).  */
5953
5954 Lisp_Object
5955 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5956 {
5957   Lisp_Object spec, eol_type;
5958
5959   if (NILP (coding_system))
5960     coding_system = Qraw_text;
5961   spec = CODING_SYSTEM_SPEC (coding_system);
5962   eol_type = AREF (spec, 2);
5963   if (VECTORP (eol_type))
5964     {
5965       Lisp_Object parent_eol_type;
5966
5967       if (! NILP (parent))
5968         {
5969           Lisp_Object parent_spec;
5970
5971           parent_spec = CODING_SYSTEM_SPEC (parent);
5972           parent_eol_type = AREF (parent_spec, 2);
5973           if (VECTORP (parent_eol_type))
5974             parent_eol_type = system_eol_type;
5975         }
5976       else
5977         parent_eol_type = system_eol_type;
5978       if (EQ (parent_eol_type, Qunix))
5979         coding_system = AREF (eol_type, 0);
5980       else if (EQ (parent_eol_type, Qdos))
5981         coding_system = AREF (eol_type, 1);
5982       else if (EQ (parent_eol_type, Qmac))
5983         coding_system = AREF (eol_type, 2);
5984     }
5985   return coding_system;
5986 }
5987
5988
5989 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5990    decided for writing to a process.  If not, complement them, and
5991    return a new coding system.  */
5992
5993 Lisp_Object
5994 complement_process_encoding_system (Lisp_Object coding_system)
5995 {
5996   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5997   Lisp_Object spec, attrs;
5998   int i;
5999
6000   for (i = 0; i < 3; i++)
6001     {
6002       if (i == 1)
6003         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6004       else if (i == 2)
6005         coding_system = preferred_coding_system ();
6006       spec = CODING_SYSTEM_SPEC (coding_system);
6007       if (NILP (spec))
6008         continue;
6009       attrs = AREF (spec, 0);
6010       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6011         coding_base = CODING_ATTR_BASE_NAME (attrs);
6012       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6013         eol_base = coding_system;
6014       if (! NILP (coding_base) && ! NILP (eol_base))
6015         break;
6016     }
6017
6018   if (i > 0)
6019     /* The original CODING_SYSTEM didn't specify text-conversion or
6020        eol-conversion.  Be sure that we return a fully complemented
6021        coding system.  */
6022     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6023   return coding_system;
6024 }
6025
6026
6027 /* Emacs has a mechanism to automatically detect a coding system if it
6028    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6029    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
6032
6033    o coding-category-emacs-mule
6034
6035         The category for a coding system which has the same code range
6036         as Emacs' internal format.  Assigned the coding-system (Lisp
6037         symbol) `emacs-mule' by default.
6038
6039    o coding-category-sjis
6040
6041         The category for a coding system which has the same code range
6042         as SJIS.  Assigned the coding-system (Lisp
6043         symbol) `japanese-shift-jis' by default.
6044
6045    o coding-category-iso-7
6046
6047         The category for a coding system which has the same code range
6048         as ISO2022 of 7-bit environment.  This doesn't use any locking
6049         shift and single shift functions.  This can encode/decode all
6050         charsets.  Assigned the coding-system (Lisp symbol)
6051         `iso-2022-7bit' by default.
6052
6053    o coding-category-iso-7-tight
6054
6055         Same as coding-category-iso-7 except that this can
6056         encode/decode only the specified charsets.
6057
6058    o coding-category-iso-8-1
6059
6060         The category for a coding system which has the same code range
6061         as ISO2022 of 8-bit environment and graphic plane 1 used only
6062         for DIMENSION1 charset.  This doesn't use any locking shift
6063         and single shift functions.  Assigned the coding-system (Lisp
6064         symbol) `iso-latin-1' by default.
6065
6066    o coding-category-iso-8-2
6067
6068         The category for a coding system which has the same code range
6069         as ISO2022 of 8-bit environment and graphic plane 1 used only
6070         for DIMENSION2 charset.  This doesn't use any locking shift
6071         and single shift functions.  Assigned the coding-system (Lisp
6072         symbol) `japanese-iso-8bit' by default.
6073
6074    o coding-category-iso-7-else
6075
6076         The category for a coding system which has the same code range
6077         as ISO2022 of 7-bit environment but uses locking shift or
6078         single shift functions.  Assigned the coding-system (Lisp
6079         symbol) `iso-2022-7bit-lock' by default.
6080
6081    o coding-category-iso-8-else
6082
6083         The category for a coding system which has the same code range
6084         as ISO2022 of 8-bit environment but uses locking shift or
6085         single shift functions.  Assigned the coding-system (Lisp
6086         symbol) `iso-2022-8bit-ss2' by default.
6087
6088    o coding-category-big5
6089
6090         The category for a coding system which has the same code range
6091         as BIG5.  Assigned the coding-system (Lisp symbol)
6092         `cn-big5' by default.
6093
6094    o coding-category-utf-8
6095
6096         The category for a coding system which has the same code range
6097         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6098         symbol) `utf-8' by default.
6099
6100    o coding-category-utf-16-be
6101
6102         The category for a coding system in which a text has an
6103         Unicode signature (cf. Unicode Standard) in the order of BIG
6104         endian at the head.  Assigned the coding-system (Lisp symbol)
6105         `utf-16-be' by default.
6106
6107    o coding-category-utf-16-le
6108
6109         The category for a coding system in which a text has an
6110         Unicode signature (cf. Unicode Standard) in the order of
6111         LITTLE endian at the head.  Assigned the coding-system (Lisp
6112         symbol) `utf-16-le' by default.
6113
6114    o coding-category-ccl
6115
6116         The category for a coding system of which encoder/decoder is
6117         written in CCL programs.  The default value is nil, i.e., no
6118         coding system is assigned.
6119
6120    o coding-category-binary
6121
6122         The category for a coding system not categorized in any of the
6123         above.  Assigned the coding-system (Lisp symbol)
6124         `no-conversion' by default.
6125
6126    Each of them is a Lisp symbol and the value is an actual
6127    `coding-system's (this is also a Lisp symbol) assigned by a user.
6128    What Emacs does actually is to detect a category of coding system.
6129    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6130    decide only one possible category, it selects a category of the
6131    highest priority.  Priorities of categories are also specified by a
6132    user in a Lisp variable `coding-category-list'.
6133
6134 */
6135
/* Kinds of end-of-line found by detect_eol.  The non-zero values are
   distinct bits, so they can be tested with `&'.  */
#define EOL_SEEN_NONE   0       /* No end-of-line found.  */
#define EOL_SEEN_LF     1       /* LF alone.  */
#define EOL_SEEN_CR     2       /* CR not followed by LF.  */
#define EOL_SEEN_CRLF   4       /* CR followed by LF.  */
6140
6141 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6142    SOURCE is encoded.  If CATEGORY is one of
6143    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6144    two-byte, else they are encoded by one-byte.
6145
6146    Return one of EOL_SEEN_XXX.  */
6147
6148 #define MAX_EOL_CHECK_COUNT 3
6149
6150 static int
6151 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6152             enum coding_category category)
6153 {
6154   const unsigned char *src = source, *src_end = src + src_bytes;
6155   unsigned char c;
6156   int total  = 0;
6157   int eol_seen = EOL_SEEN_NONE;
6158
6159   if ((1 << category) & CATEGORY_MASK_UTF_16)
6160     {
6161       int msb, lsb;
6162
6163       msb = category == (coding_category_utf_16_le
6164                          | coding_category_utf_16_le_nosig);
6165       lsb = 1 - msb;
6166
6167       while (src + 1 < src_end)
6168         {
6169           c = src[lsb];
6170           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6171             {
6172               int this_eol;
6173
6174               if (c == '\n')
6175                 this_eol = EOL_SEEN_LF;
6176               else if (src + 3 >= src_end
6177                        || src[msb + 2] != 0
6178                        || src[lsb + 2] != '\n')
6179                 this_eol = EOL_SEEN_CR;
6180               else
6181                 {
6182                   this_eol = EOL_SEEN_CRLF;
6183                   src += 2;
6184                 }
6185
6186               if (eol_seen == EOL_SEEN_NONE)
6187                 /* This is the first end-of-line.  */
6188                 eol_seen = this_eol;
6189               else if (eol_seen != this_eol)
6190                 {
6191                   /* The found type is different from what found before.
6192                      Allow for stray ^M characters in DOS EOL files.  */
6193                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6194                       || (eol_seen == EOL_SEEN_CRLF
6195                           && this_eol == EOL_SEEN_CR))
6196                     eol_seen = EOL_SEEN_CRLF;
6197                   else
6198                     {
6199                       eol_seen = EOL_SEEN_LF;
6200                       break;
6201                     }
6202                 }
6203               if (++total == MAX_EOL_CHECK_COUNT)
6204                 break;
6205             }
6206           src += 2;
6207         }
6208     }
6209   else
6210     while (src < src_end)
6211       {
6212         c = *src++;
6213         if (c == '\n' || c == '\r')
6214           {
6215             int this_eol;
6216
6217             if (c == '\n')
6218               this_eol = EOL_SEEN_LF;
6219             else if (src >= src_end || *src != '\n')
6220               this_eol = EOL_SEEN_CR;
6221             else
6222               this_eol = EOL_SEEN_CRLF, src++;
6223
6224             if (eol_seen == EOL_SEEN_NONE)
6225               /* This is the first end-of-line.  */
6226               eol_seen = this_eol;
6227             else if (eol_seen != this_eol)
6228               {
6229                 /* The found type is different from what found before.
6230                    Allow for stray ^M characters in DOS EOL files.  */
6231                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6232                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6233                   eol_seen = EOL_SEEN_CRLF;
6234                 else
6235                   {
6236                     eol_seen = EOL_SEEN_LF;
6237                     break;
6238                   }
6239               }
6240             if (++total == MAX_EOL_CHECK_COUNT)
6241               break;
6242           }
6243       }
6244   return eol_seen;
6245 }
6246
6247
6248 static Lisp_Object
6249 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6250 {
6251   Lisp_Object eol_type;
6252
6253   eol_type = CODING_ID_EOL_TYPE (coding->id);
6254   if (eol_seen & EOL_SEEN_LF)
6255     {
6256       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6257       eol_type = Qunix;
6258     }
6259   else if (eol_seen & EOL_SEEN_CRLF)
6260     {
6261       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6262       eol_type = Qdos;
6263     }
6264   else if (eol_seen & EOL_SEEN_CR)
6265     {
6266       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6267       eol_type = Qmac;
6268     }
6269   return eol_type;
6270 }
6271
6272 /* Detect how a text specified in CODING is encoded.  If a coding
6273    system is detected, update fields of CODING by the detected coding
6274    system.

        What is done depends on the coding system CODING->id names:

        - If its type is `undecided', the source bytes are scanned and
          the category detectors are run in priority order; CODING is
          then set up for the chosen coding system by
          setup_coding_system.
        - If its category is coding_category_utf_8_auto or
          coding_category_utf_16_auto, only the corresponding UTF
          detector is run and CODING is set up from the car or the cdr
          of the coding_attr_utf_bom attribute.
        - Otherwise nothing is detected.

        In every case the consumed/produced counters of CODING are
        reset to zero, CODING->head_ascii is recomputed, and
        CODING->mode has the same value on return as on entry.  */
6275
6276 void
6277 detect_coding (struct coding_system *coding)
6278 {
6279   const unsigned char *src, *src_end;
       /* CODING->mode is put back to this value just before returning.  */
6280   int saved_mode = coding->mode;
6281
6282   coding->consumed = coding->consumed_char = 0;
6283   coding->produced = coding->produced_char = 0;
6284   coding_set_source (coding);
6285
6286   src_end = coding->source + coding->src_bytes;
6287   coding->head_ascii = 0;
6288
6289   /* If we have not yet decided the text encoding type, detect it
6290      now.  */
6291   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6292     {
6293       int c, i;
6294       struct coding_detection_info detect_info;
6295       int null_byte_found = 0, eight_bit_found = 0;
6296
6297       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the source.  CODING->head_ascii counts the bytes seen
              before the first 8-bit byte.  The scan stops early once
              both a null byte and an 8-bit byte have been seen, or as
              soon as detect_coding_iso_2022 returns nonzero.  */
6298       for (src = coding->source; src < src_end; src++)
6299         {
6300           c = *src;
6301           if (c & 0x80)
6302             {
6303               eight_bit_found = 1;
6304               if (null_byte_found)
6305                 break;
6306             }
6307           else if (c < 0x20)
6308             {
                   /* ESC, SI and SO trigger the ISO-2022 detector, but
                      only while no detector has been recorded in
                      detect_info.checked and the detection is not
                      inhibited.  */
6309               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6310                   && ! inhibit_iso_escape_detection
6311                   && ! detect_info.checked)
6312                 {
6313                   if (detect_coding_iso_2022 (coding, &detect_info))
6314                     {
6315                       /* We have scanned the whole data.  */
6316                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6317                         {
6318                           /* We didn't find an 8-bit code.  We may
6319                              have found a null-byte, but it's very
6320                              rare that a binary file conforms to
6321                              ISO-2022.  */
6322                           src = src_end;
6323                           coding->head_ascii = src - coding->source;
6324                         }
                           /* Reject every category outside
                              CATEGORY_MASK_ISO_ESCAPE.  */
6325                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6326                       break;
6327                     }
6328                 }
6329               else if (! c && !inhibit_null_byte_detection)
6330                 {
6331                   null_byte_found = 1;
6332                   if (eight_bit_found)
6333                     break;
6334                 }
6335               if (! eight_bit_found)
6336                 coding->head_ascii++;
6337             }
6338           else if (! eight_bit_found)
6339             coding->head_ascii++;
6340         }
6341
           /* More work is needed only if a null byte or an 8-bit byte was
              seen, the scan did not cover all the source as 7-bit
              bytes, or a detector already reported a category.  */
6342       if (null_byte_found || eight_bit_found
6343           || coding->head_ascii < coding->src_bytes
6344           || detect_info.found)
6345         {
6346           enum coding_category category;
6347           struct coding_system *this;
6348
6349           if (coding->head_ascii == coding->src_bytes)
6350             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6351             for (i = 0; i < coding_category_raw_text; i++)
6352               {
6353                 category = coding_priorities[i];
6354                 this = coding_categories + category;
6355                 if (detect_info.found & (1 << category))
6356                   break;
6357               }
6358           else
6359             {
6360               if (null_byte_found)
6361                 {
                       /* A null byte was seen: mark every category
                          outside CATEGORY_MASK_UTF_16 as already
                          checked and rejected, so that only the UTF-16
                          detector can still run below.  */
6362                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6363                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6364                 }
                   /* Walk the categories in priority order and stop at the
                      first one that is found, running a category's
                      detector only if that category has not been
                      checked yet.  */
6365               for (i = 0; i < coding_category_raw_text; i++)
6366                 {
6367                   category = coding_priorities[i];
6368                   this = coding_categories + category;
6369                   if (this->id < 0)
6370                     {
6371                       /* No coding system of this category is defined.  */
6372                       detect_info.rejected |= (1 << category);
6373                     }
6374                   else if (category >= coding_category_raw_text)
6375                     continue;
6376                   else if (detect_info.checked & (1 << category))
6377                     {
6378                       if (detect_info.found & (1 << category))
6379                         break;
6380                     }
6381                   else if ((*(this->detector)) (coding, &detect_info)
6382                            && detect_info.found & (1 << category))
6383                     {
                           /* NOTE(review): CATEGORY is narrowed to the LE
                              or BE variant here, but THIS is not
                              updated and CATEGORY is not read again
                              after the loop -- confirm whether the
                              narrowing is meant to have an effect.  */
6384                       if (category == coding_category_utf_16_auto)
6385                         {
6386                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6387                             category = coding_category_utf_16_le;
6388                           else
6389                             category = coding_category_utf_16_be;
6390                         }
6391                       break;
6392                     }
6393                 }
6394             }
6395
               /* I is below coding_category_raw_text exactly when one of
                  the loops above stopped at a found category, which THIS
                  then points to.  Otherwise fall back to no-conversion
                  (null byte seen), to raw-text (every category in
                  CATEGORY_MASK_ANY rejected), or to the first category
                  in priority order that was not rejected.  If nothing
                  was rejected at all, CODING is left as it is.  */
6396           if (i < coding_category_raw_text)
6397             setup_coding_system (CODING_ID_NAME (this->id), coding);
6398           else if (null_byte_found)
6399             setup_coding_system (Qno_conversion, coding);
6400           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6401                    == CATEGORY_MASK_ANY)
6402             setup_coding_system (Qraw_text, coding);
6403           else if (detect_info.rejected)
6404             for (i = 0; i < coding_category_raw_text; i++)
6405               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6406                 {
6407                   this = coding_categories + coding_priorities[i];
6408                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6409                   break;
6410                 }
6411         }
6412     }
6413   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6414            == coding_category_utf_8_auto)
6415     {
6416       Lisp_Object coding_systems;
6417       struct coding_detection_info detect_info;
6418
           /* The coding_attr_utf_bom attribute is used as a cons here:
              its car is selected when the detector reports
              CATEGORY_MASK_UTF_8_SIG, its cdr otherwise.  */
6419       coding_systems
6420         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6421       detect_info.found = detect_info.rejected = 0;
6422       coding->head_ascii = 0;
6423       if (CONSP (coding_systems)
6424           && detect_coding_utf_8 (coding, &detect_info))
6425         {
6426           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6427             setup_coding_system (XCAR (coding_systems), coding);
6428           else
6429             setup_coding_system (XCDR (coding_systems), coding);
6430         }
6431     }
6432   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6433            == coding_category_utf_16_auto)
6434     {
6435       Lisp_Object coding_systems;
6436       struct coding_detection_info detect_info;
6437
           /* Same scheme as for UTF-8 above: the car is selected for
              little endian, the cdr for big endian.  If the detector
              reports neither, CODING is left unchanged.  */
6438       coding_systems
6439         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6440       detect_info.found = detect_info.rejected = 0;
6441       coding->head_ascii = 0;
6442       if (CONSP (coding_systems)
6443           && detect_coding_utf_16 (coding, &detect_info))
6444         {
6445           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6446             setup_coding_system (XCAR (coding_systems), coding);
6447           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6448             setup_coding_system (XCDR (coding_systems), coding);
6449         }
6450     }
6451   coding->mode = saved_mode;
6452 }
6453
6454
6455 static void
6456 decode_eol (struct coding_system *coding)
6457 {
6458   Lisp_Object eol_type;
6459   unsigned char *p, *pbeg, *pend;
6460
6461   eol_type = CODING_ID_EOL_TYPE (coding->id);
6462   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6463     return;
6464
6465   if (NILP (coding->dst_object))
6466     pbeg = coding->destination;
6467   else
6468     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6469   pend = pbeg + coding->produced;
6470
6471   if (VECTORP (eol_type))
6472     {
6473       int eol_seen = EOL_SEEN_NONE;
6474
6475       for (p = pbeg; p < pend; p++)
6476         {
6477           if (*p == '\n')
6478             eol_seen |= EOL_SEEN_LF;
6479           else if (*p == '\r')
6480             {
6481               if (p + 1 < pend && *(p + 1) == '\n')
6482                 {
6483                   eol_seen |= EOL_SEEN_CRLF;
6484                   p++;
6485                 }
6486               else
6487                 eol_seen |= EOL_SEEN_CR;
6488             }
6489         }
6490       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6491       if ((eol_seen & EOL_SEEN_CRLF) != 0
6492           && (eol_seen & EOL_SEEN_CR) != 0
6493           && (eol_seen & EOL_SEEN_LF) == 0)
6494         eol_seen = EOL_SEEN_CRLF;
6495       else if (eol_seen != EOL_SEEN_NONE
6496           && eol_seen != EOL_SEEN_LF
6497           && eol_seen != EOL_SEEN_CRLF
6498           && eol_seen != EOL_SEEN_CR)
6499         eol_seen = EOL_SEEN_LF;
6500       if (eol_seen != EOL_SEEN_NONE)
6501         eol_type = adjust_coding_eol_type (coding, eol_seen);
6502     }
6503
6504   if (EQ (eol_type, Qmac))
6505     {
6506       for (p = pbeg; p < pend; p++)
6507         if (*p == '\r')
6508           *p = '\n';
6509     }
6510   else if (EQ (eol_type, Qdos))
6511     {
6512       int n = 0;
6513
6514       if (NILP (coding->dst_object))
6515         {
6516           /* Start deleting '\r' from the tail to minimize the memory
6517              movement.  */
6518           for (p = pend - 2; p >= pbeg; p--)
6519             if (*p == '\r')
6520               {
6521                 memmove (p, p + 1, pend-- - p - 1);
6522                 n++;
6523               }
6524         }
6525       else
6526         {
6527           int pos_byte = coding->dst_pos_byte;
6528           int pos = coding->dst_pos;
6529           int pos_end = pos + coding->produced_char - 1;
6530
6531           while (pos < pos_end)
6532             {
6533               p = BYTE_POS_ADDR (pos_byte);
6534               if (*p == '\r' && p[1] == '\n')
6535                 {
6536                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6537                   n++;
6538                   pos_end--;
6539                 }
6540               pos++;
6541               if (coding->dst_multibyte)
6542                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6543               else
6544                 pos_byte++;
6545             }
6546         }
6547       coding->produced -= n;
6548       coding->produced_char -= n;
6549     }
6550 }
6551
6552
6553 /* Return a translation table (or list of them) from coding system
6554    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6555    decoding (ENCODEP is zero). */
6556
6557 static Lisp_Object
6558 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6559 {
6560   Lisp_Object standard, translation_table;
6561   Lisp_Object val;
6562
6563   if (NILP (Venable_character_translation))
6564     {
6565       if (max_lookup)
6566         *max_lookup = 0;
6567       return Qnil;
6568     }
6569   if (encodep)
6570     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6571       standard = Vstandard_translation_table_for_encode;
6572   else
6573     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6574       standard = Vstandard_translation_table_for_decode;
6575   if (NILP (translation_table))
6576     translation_table = standard;
6577   else
6578     {
6579       if (SYMBOLP (translation_table))
6580         translation_table = Fget (translation_table, Qtranslation_table);
6581       else if (CONSP (translation_table))
6582         {
6583           translation_table = Fcopy_sequence (translation_table);
6584           for (val = translation_table; CONSP (val); val = XCDR (val))
6585             if (SYMBOLP (XCAR (val)))
6586               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6587         }
6588       if (CHAR_TABLE_P (standard))
6589         {
6590           if (CONSP (translation_table))
6591             translation_table = nconc2 (translation_table,
6592                                         Fcons (standard, Qnil));
6593           else
6594             translation_table = Fcons (translation_table,
6595                                        Fcons (standard, Qnil));
6596         }
6597     }
6598
6599   if (max_lookup)
6600     {
6601       *max_lookup = 1;
6602       if (CHAR_TABLE_P (translation_table)
6603           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6604         {
6605           val = XCHAR_TABLE (translation_table)->extras[1];
6606           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6607             *max_lookup = XFASTINT (val);
6608         }
6609       else if (CONSP (translation_table))
6610         {
6611           Lisp_Object tail, val;
6612
6613           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6614             if (CHAR_TABLE_P (XCAR (tail))
6615                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6616               {
6617                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6618                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6619                   *max_lookup = XFASTINT (val);
6620               }
6621         }
6622     }
6623   return translation_table;
6624 }
6625
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and store the outcome in C and TRANS.

   If TABLE is a char-table: when the entry for C is a character, C is
   replaced by it and TRANS is set to Qnil; otherwise TRANS is set to
   the entry itself (Qnil when there is none).

   If TABLE is a list: the elements that are char-tables are consulted
   in order.  A character entry replaces C, resets TRANS to Qnil, and
   the lookup goes on in the next table with the new C.  The first
   entry that is neither nil nor a character ends the lookup and is
   left in TRANS.

   For any other TABLE, TRANS is Qnil and C is unchanged.

   C and TRANS are assigned to, so they must be lvalues; TABLE, C and
   TRANS are each evaluated more than once.  The expansion declares a
   local variable named `tail'.  */
6626 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6627   do {                                                          \
6628     trans = Qnil;                                               \
6629     if (CHAR_TABLE_P (table))                                   \
6630       {                                                         \
6631         trans = CHAR_TABLE_REF (table, c);                      \
6632         if (CHARACTERP (trans))                                 \
6633           c = XFASTINT (trans), trans = Qnil;                   \
6634       }                                                         \
6635     else if (CONSP (table))                                     \
6636       {                                                         \
6637         Lisp_Object tail;                                       \
6638                                                                 \
6639         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6640           if (CHAR_TABLE_P (XCAR (tail)))                       \
6641             {                                                   \
6642               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6643               if (CHARACTERP (trans))                           \
6644                 c = XFASTINT (trans), trans = Qnil;             \
6645               else if (! NILP (trans))                          \
6646                 break;                                          \
6647             }                                                   \
6648       }                                                         \
6649   } while (0)
6650
6651
6652 /* Return a translation of character(s) at BUF according to TRANS.
6653    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6654    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6655    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6656    translation is found, and Qnil if not found..
6657    If BUF is too short to lookup characters in FROM, return Qt.  */
6658
6659 static Lisp_Object
6660 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6661 {
6662
6663   if (INTEGERP (trans))
6664     return trans;
6665   for (; CONSP (trans); trans = XCDR (trans))
6666     {
6667       Lisp_Object val = XCAR (trans);
6668       Lisp_Object from = XCAR (val);
6669       int len = ASIZE (from);
6670       int i;
6671
6672       for (i = 0; i < len; i++)
6673         {
6674           if (buf + i == buf_end)
6675             return Qt;
6676           if (XINT (AREF (from, i)) != buf[i])
6677             break;
6678         }
6679       if (i == len)
6680         return val;
6681     }
6682   return Qnil;
6683 }
6684
6685
/* Store decoded text into the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   the CODING->charbuf_used elements of CODING->charbuf.  Each
   non-negative element is a character; it is translated through
   TRANSLATION_TABLE (which may map a sequence of characters to one or
   several characters) and then written out.  A negative element
   starts an annotation datum, which is skipped here.

   Otherwise the CODING->consumed bytes at CODING->source are copied,
   with a conversion when CODING->src_multibyte and
   CODING->dst_multibyte differ.

   The destination is enlarged with alloc_destination when it is too
   small.  When the source and destination objects are the same,
   writing is bounded by the current source position rather than by
   the size of the destination.

   On return CODING->produced and CODING->produced_char have been
   advanced, and if the destination is a buffer and at least one
   character was produced, insert_from_gap has been called for the new
   text.

   LAST_BLOCK matters only when a translation needs more characters
   than are left in the charbuf: if it is zero, processing stops there
   and the number of unprocessed charbuf elements is returned;
   otherwise the character is written without that translation.  The
   return value is zero in all other cases.  */
6686 static int
6687 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6688                int last_block)
6689 {
6690   unsigned char *dst = coding->destination + coding->produced;
6691   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6692   EMACS_INT produced;
6693   EMACS_INT produced_chars = 0;
6694   int carryover = 0;
6695
6696   if (! coding->chars_at_source)
6697     {
6698       /* Source characters are in coding->charbuf.  */
6699       int *buf = coding->charbuf;
6700       int *buf_end = buf + coding->charbuf_used;
6701
           /* Source and destination are the same object: do not write
              beyond the part of the source that has been consumed.  */
6702       if (EQ (coding->src_object, coding->dst_object))
6703         {
6704           coding_set_source (coding);
6705           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6706         }
6707
6708       while (buf < buf_end)
6709         {
6710           int c = *buf, i;
6711
6712           if (c >= 0)
6713             {
                   /* Number of source characters this step consumes and
                      number of characters it writes.  */
6714               int from_nchars = 1, to_nchars = 1;
6715               Lisp_Object trans = Qnil;
6716
6717               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6718               if (! NILP (trans))
6719                 {
6720                   trans = get_translation (trans, buf, buf_end);
6721                   if (INTEGERP (trans))
6722                     c = XINT (trans);
6723                   else if (CONSP (trans))
6724                     {
                           /* TRANS is ([FROM-CHAR ...] . TO), where TO is
                              a character or a vector of characters.  */
6725                       from_nchars = ASIZE (XCAR (trans));
6726                       trans = XCDR (trans);
6727                       if (INTEGERP (trans))
6728                         c = XINT (trans);
6729                       else
6730                         {
6731                           to_nchars = ASIZE (trans);
6732                           c = XINT (AREF (trans, 0));
6733                         }
6734                     }
                       /* Qt means that BUF holds too few characters to
                          decide.  Unless this is the last block, stop and
                          leave the rest of BUF as carryover.  */
6735                   else if (EQ (trans, Qt) && ! last_block)
6736                     break;
6737                 }
6738
                   /* Make sure there is room for TO_NCHARS characters of
                      maximum length, then recompute DST_END because the
                      destination (and, for a shared object, the source)
                      may have changed.  */
6739               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6740                 {
6741                   dst = alloc_destination (coding,
6742                                            buf_end - buf
6743                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6744                                            dst);
6745                   if (EQ (coding->src_object, coding->dst_object))
6746                     {
6747                       coding_set_source (coding);
6748                       dst_end = (((unsigned char *) coding->source)
6749                                  + coding->consumed);
6750                     }
6751                   else
6752                     dst_end = coding->destination + coding->dst_bytes;
6753                 }
6754
                   /* Write the character(s).  A character for which
                      CHAR_BYTE8_P holds is stored as the single byte
                      CHAR_TO_BYTE8 yields when the destination is not
                      multibyte; everything else goes through
                      CHAR_STRING_ADVANCE_NO_UNIFY.  */
6755               for (i = 0; i < to_nchars; i++)
6756                 {
6757                   if (i > 0)
6758                     c = XINT (AREF (trans, i));
6759                   if (coding->dst_multibyte
6760                       || ! CHAR_BYTE8_P (c))
6761                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6762                   else
6763                     *dst++ = CHAR_TO_BYTE8 (c);
6764                 }
6765               produced_chars += to_nchars;
6766               buf += from_nchars;
6767             }
6768           else
6769             /* This is an annotation datum.  (-C) is the length.  */
6770             buf += -c;
6771         }
6772       carryover = buf_end - buf;
6773     }
6774   else
6775     {
6776       /* Source characters are at coding->source.  */
6777       const unsigned char *src = coding->source;
6778       const unsigned char *src_end = src + coding->consumed;
6779
6780       if (EQ (coding->dst_object, coding->src_object))
6781         dst_end = (unsigned char *) src;
6782       if (coding->src_multibyte != coding->dst_multibyte)
6783         {
6784           if (coding->src_multibyte)
6785             {
                   /* NOTE(review): multibytep, consumed_chars and the
                      no_more_source label are not referenced by name in
                      this block; presumably the ONE_MORE_BYTE macro
                      uses them -- confirm against its definition.  */
6786               int multibytep = 1;
6787               EMACS_INT consumed_chars = 0;
6788
6789               while (1)
6790                 {
6791                   const unsigned char *src_base = src;
6792                   int c;
6793
6794                   ONE_MORE_BYTE (c);
6795                   if (dst == dst_end)
6796                     {
                           /* With a shared object, the bytes read so far
                              may already provide room.  */
6797                       if (EQ (coding->src_object, coding->dst_object))
6798                         dst_end = (unsigned char *) src;
6799                       if (dst == dst_end)
6800                         {
                               /* Remember SRC as an offset and rebuild the
                                  pointers after the allocation.  */
6801                           EMACS_INT offset = src - coding->source;
6802
6803                           dst = alloc_destination (coding, src_end - src + 1,
6804                                                    dst);
6805                           dst_end = coding->destination + coding->dst_bytes;
6806                           coding_set_source (coding);
6807                           src = coding->source + offset;
6808                           src_end = coding->source + coding->src_bytes;
6809                           if (EQ (coding->src_object, coding->dst_object))
6810                             dst_end = (unsigned char *) src;
6811                         }
6812                     }
6813                   *dst++ = c;
6814                   produced_chars++;
6815                 }
6816             no_more_source:
6817               ;
6818             }
6819           else
6820             while (src < src_end)
6821               {
                     /* NOTE(review): multibytep is presumably read by the
                        EMIT_ONE_BYTE macro -- confirm.  */
6822                 int multibytep = 1;
6823                 int c = *src++;
6824
                     /* Unlike the loop above, this one keeps one spare
                        byte: the test is against DST_END - 1.  */
6825                 if (dst >= dst_end - 1)
6826                   {
6827                     if (EQ (coding->src_object, coding->dst_object))
6828                       dst_end = (unsigned char *) src;
6829                     if (dst >= dst_end - 1)
6830                       {
6831                         EMACS_INT offset = src - coding->source;
6832                         EMACS_INT more_bytes;
6833
6834                         if (EQ (coding->src_object, coding->dst_object))
6835                           more_bytes = ((src_end - src) / 2) + 2;
6836                         else
6837                           more_bytes = src_end - src + 2;
6838                         dst = alloc_destination (coding, more_bytes, dst);
6839                         dst_end = coding->destination + coding->dst_bytes;
6840                         coding_set_source (coding);
6841                         src = coding->source + offset;
6842                         src_end = coding->source + coding->src_bytes;
6843                         if (EQ (coding->src_object, coding->dst_object))
6844                           dst_end = (unsigned char *) src;
6845                       }
6846                   }
6847                 EMIT_ONE_BYTE (c);
6848               }
6849         }
6850       else
6851         {
               /* Source and destination use the same representation, so
                  the bytes are copied verbatim.  For distinct objects the
                  destination is first grown by the amount by which
                  CODING->src_bytes exceeds CODING->dst_bytes.  */
6852           if (!EQ (coding->src_object, coding->dst_object))
6853             {
6854               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6855
6856               if (require > 0)
6857                 {
6858                   EMACS_INT offset = src - coding->source;
6859
6860                   dst = alloc_destination (coding, require, dst);
6861                   coding_set_source (coding);
6862                   src = coding->source + offset;
6863                   src_end = coding->source + coding->src_bytes;
6864                 }
6865             }
6866           produced_chars = coding->consumed_char;
6867           while (src < src_end)
6868             *dst++ = *src++;
6869         }
6870     }
6871
       /* PRODUCED is the number of bytes written by this call.  */
6872   produced = dst - (coding->destination + coding->produced);
6873   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6874     insert_from_gap (produced_chars, produced);
6875   coding->produced += produced;
6876   coding->produced_char += produced_chars;
6877   return carryover;
6878 }
6879
6880 /* Compose text in CODING->object according to the annotation data at
6881    CHARBUF.  CHARBUF is an array:
6882      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6883  */
6884
6885 static INLINE void
6886 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6887 {
6888   int len;
6889   EMACS_INT to;
6890   enum composition_method method;
6891   Lisp_Object components;
6892
6893   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6894   to = pos + charbuf[2];
6895   method = (enum composition_method) (charbuf[4]);
6896
6897   if (method == COMPOSITION_RELATIVE)
6898     components = Qnil;
6899   else
6900     {
6901       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6902       int i, j;
6903
6904       if (method == COMPOSITION_WITH_RULE)
6905         len = charbuf[2] * 3 - 2;
6906       charbuf += MAX_ANNOTATION_LENGTH;
6907       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6908       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6909         {
6910           if (charbuf[i] >= 0)
6911             args[j] = make_number (charbuf[i]);
6912           else
6913             {
6914               i++;
6915               args[j] = make_number (charbuf[i] % 0x100);
6916             }
6917         }
6918       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6919     }
6920   compose_text (pos, to, components, Qnil, coding->dst_object);
6921 }
6922
6923
6924 /* Put `charset' property on text in CODING->object according to
6925    the annotation data at CHARBUF.  CHARBUF is an array:
6926      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6927  */
6928
6929 static INLINE void
6930 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6931 {
6932   EMACS_INT from = pos - charbuf[2];
6933   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6934
6935   Fput_text_property (make_number (from), make_number (pos),
6936                       Qcharset, CHARSET_NAME (charset),
6937                       coding->dst_object);
6938 }
6939
6940
/* CHARBUF_SIZE is the number of `int' elements requested first for
   the conversion work buffer.

   ALLOC_CONVERSION_WORK_AREA (CODING) sets CODING->charbuf to a
   buffer obtained with alloca, so the buffer belongs to the stack
   frame of the function in which the macro is expanded.  It asks for
   CHARBUF_SIZE elements and, each time alloca yields NULL, halves the
   request for as long as the size is above 1024.  On success the
   number of elements is stored in CODING->charbuf_size.

   If no buffer was obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded with record_conversion_result and the macro executes
   `return coding->result;' in the enclosing function, so it can be
   used only in functions for which that return statement is valid.  */
6941 #define CHARBUF_SIZE 0x4000
6942
6943 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6944   do {                                                                  \
6945     int size = CHARBUF_SIZE;                                            \
6946                                                                         \
6947     coding->charbuf = NULL;                                             \
6948     while (size > 1024)                                                 \
6949       {                                                                 \
6950         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6951         if (coding->charbuf)                                            \
6952           break;                                                        \
6953         size >>= 1;                                                     \
6954       }                                                                 \
6955     if (! coding->charbuf)                                              \
6956       {                                                                 \
6957         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6958         return coding->result;                                          \
6959       }                                                                 \
6960     coding->charbuf_size = size;                                        \
6961   } while (0)
6962
6963
6964 static void
6965 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6966 {
6967   int *charbuf = coding->charbuf;
6968   int *charbuf_end = charbuf + coding->charbuf_used;
6969
6970   if (NILP (coding->dst_object))
6971     return;
6972
6973   while (charbuf < charbuf_end)
6974     {
6975       if (*charbuf >= 0)
6976         pos++, charbuf++;
6977       else
6978         {
6979           int len = -*charbuf;
6980
6981           if (len > 2)
6982             switch (charbuf[1])
6983               {
6984               case CODING_ANNOTATE_COMPOSITION_MASK:
6985                 produce_composition (coding, charbuf, pos);
6986                 break;
6987               case CODING_ANNOTATE_CHARSET_MASK:
6988                 produce_charset (coding, charbuf, pos);
6989                 break;
6990               }
6991           charbuf += len;
6992         }
6993     }
6994 }
6995
6996 /* Decode the data at CODING->src_object into CODING->dst_object.
6997    CODING->src_object is a buffer, a string, or nil.
6998    CODING->dst_object is a buffer.
6999
7000    If CODING->src_object is a buffer, it must be the current buffer.
7001    In this case, if CODING->src_pos is positive, it is a position of
7002    the source text in the buffer, otherwise, the source text is in the
7003    gap area of the buffer, and CODING->src_pos specifies the offset of
7004    the text from GPT (which must be the same as PT).  If this is the
7005    same buffer as CODING->dst_object, CODING->src_pos must be
7006    negative.
7007
7008    If CODING->src_object is a string, CODING->src_pos is an index to
7009    that string.
7010
7011    If CODING->src_object is nil, CODING->source must already point to
7012    the non-relocatable memory area.  In this case, CODING->src_pos is
7013    an offset from CODING->source.
7014
7015    The decoded data is inserted at the current point of the buffer
7016    CODING->dst_object.
7017 */
7018
7019 static int
7020 decode_coding (struct coding_system *coding)
7021 {
7022   Lisp_Object attrs;
7023   Lisp_Object undo_list;
7024   Lisp_Object translation_table;
7025   struct ccl_spec cclspec;
7026   int carryover;
7027   int i;
7028
7029   if (BUFFERP (coding->src_object)
7030       && coding->src_pos > 0
7031       && coding->src_pos < GPT
7032       && coding->src_pos + coding->src_chars > GPT)
7033     move_gap_both (coding->src_pos, coding->src_pos_byte);
7034
7035   undo_list = Qt;
7036   if (BUFFERP (coding->dst_object))
7037     {
7038       if (current_buffer != XBUFFER (coding->dst_object))
7039         set_buffer_internal (XBUFFER (coding->dst_object));
7040       if (GPT != PT)
7041         move_gap_both (PT, PT_BYTE);
7042       undo_list = current_buffer->undo_list;
7043       current_buffer->undo_list = Qt;
7044     }
7045
7046   coding->consumed = coding->consumed_char = 0;
7047   coding->produced = coding->produced_char = 0;
7048   coding->chars_at_source = 0;
7049   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7050   coding->errors = 0;
7051
7052   ALLOC_CONVERSION_WORK_AREA (coding);
7053
7054   attrs = CODING_ID_ATTRS (coding->id);
7055   translation_table = get_translation_table (attrs, 0, NULL);
7056
7057   carryover = 0;
7058   if (coding->decoder == decode_coding_ccl)
7059     {
7060       coding->spec.ccl = &cclspec;
7061       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7062     }
7063   do
7064     {
7065       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7066
7067       coding_set_source (coding);
7068       coding->annotated = 0;
7069       coding->charbuf_used = carryover;
7070       (*(coding->decoder)) (coding);
7071       coding_set_destination (coding);
7072       carryover = produce_chars (coding, translation_table, 0);
7073       if (coding->annotated)
7074         produce_annotation (coding, pos);
7075       for (i = 0; i < carryover; i++)
7076         coding->charbuf[i]
7077           = coding->charbuf[coding->charbuf_used - carryover + i];
7078     }
7079   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7080          || (coding->consumed < coding->src_bytes
7081              && (coding->result == CODING_RESULT_SUCCESS
7082                  || coding->result == CODING_RESULT_INVALID_SRC)));
7083
7084   if (carryover > 0)
7085     {
7086       coding_set_destination (coding);
7087       coding->charbuf_used = carryover;
7088       produce_chars (coding, translation_table, 1);
7089     }
7090
7091   coding->carryover_bytes = 0;
7092   if (coding->consumed < coding->src_bytes)
7093     {
7094       int nbytes = coding->src_bytes - coding->consumed;
7095       const unsigned char *src;
7096
7097       coding_set_source (coding);
7098       coding_set_destination (coding);
7099       src = coding->source + coding->consumed;
7100
7101       if (coding->mode & CODING_MODE_LAST_BLOCK)
7102         {
7103           /* Flush out unprocessed data as binary chars.  We are sure
7104              that the number of data is less than the size of
7105              coding->charbuf.  */
7106           coding->charbuf_used = 0;
7107           coding->chars_at_source = 0;
7108
7109           while (nbytes-- > 0)
7110             {
7111               int c = *src++;
7112
7113               if (c & 0x80)
7114                 c = BYTE8_TO_CHAR (c);
7115               coding->charbuf[coding->charbuf_used++] = c;
7116             }
7117           produce_chars (coding, Qnil, 1);
7118         }
7119       else
7120         {
7121           /* Record unprocessed bytes in coding->carryover.  We are
7122              sure that the number of data is less than the size of
7123              coding->carryover.  */
7124           unsigned char *p = coding->carryover;
7125
7126           if (nbytes > sizeof coding->carryover)
7127             nbytes = sizeof coding->carryover;
7128           coding->carryover_bytes = nbytes;
7129           while (nbytes-- > 0)
7130             *p++ = *src++;
7131         }
7132       coding->consumed = coding->src_bytes;
7133     }
7134
7135   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7136       && !inhibit_eol_conversion)
7137     decode_eol (coding);
7138   if (BUFFERP (coding->dst_object))
7139     {
7140       current_buffer->undo_list = undo_list;
7141       record_insert (coding->dst_pos, coding->produced_char);
7142     }
7143   return coding->result;
7144 }
7145
7146
7147 /* Extract an annotation datum from a composition starting at POS and
7148    ending before LIMIT of CODING->src_object (buffer or string), store
7149    the data in BUF, set *STOP to a starting position of the next
7150    composition (if any) or to LIMIT, and return the address of the
7151    next element of BUF.
7152
7153    If such an annotation is not found, set *STOP to a starting
7154    position of a composition after POS (if any) or to LIMIT, and
7155    return BUF.  */
7156
7157 static INLINE int *
7158 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7159                                struct coding_system *coding, int *buf,
7160                                EMACS_INT *stop)
7161 {
7162   EMACS_INT start, end;
7163   Lisp_Object prop;
7164
7165   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7166       || end > limit)
7167     *stop = limit;
7168   else if (start > pos)
7169     *stop = start;
7170   else
7171     {
7172       if (start == pos)
7173         {
7174           /* We found a composition.  Store the corresponding
7175              annotation data in BUF.  */
7176           int *head = buf;
7177           enum composition_method method = COMPOSITION_METHOD (prop);
7178           int nchars = COMPOSITION_LENGTH (prop);
7179
7180           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7181           if (method != COMPOSITION_RELATIVE)
7182             {
7183               Lisp_Object components;
7184               int len, i, i_byte;
7185
7186               components = COMPOSITION_COMPONENTS (prop);
7187               if (VECTORP (components))
7188                 {
7189                   len = XVECTOR (components)->size;
7190                   for (i = 0; i < len; i++)
7191                     *buf++ = XINT (AREF (components, i));
7192                 }
7193               else if (STRINGP (components))
7194                 {
7195                   len = SCHARS (components);
7196                   i = i_byte = 0;
7197                   while (i < len)
7198                     {
7199                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7200                       buf++;
7201                     }
7202                 }
7203               else if (INTEGERP (components))
7204                 {
7205                   len = 1;
7206                   *buf++ = XINT (components);
7207                 }
7208               else if (CONSP (components))
7209                 {
7210                   for (len = 0; CONSP (components);
7211                        len++, components = XCDR (components))
7212                     *buf++ = XINT (XCAR (components));
7213                 }
7214               else
7215                 abort ();
7216               *head -= len;
7217             }
7218         }
7219
7220       if (find_composition (end, limit, &start, &end, &prop,
7221                             coding->src_object)
7222           && end <= limit)
7223         *stop = start;
7224       else
7225         *stop = limit;
7226     }
7227   return buf;
7228 }
7229
7230
7231 /* Extract an annotation datum from a text property `charset' at POS of
7232    CODING->src_object (buffer of string), store the data in BUF, set
7233    *STOP to the position where the value of `charset' property changes
7234    (limiting by LIMIT), and return the address of the next element of
7235    BUF.
7236
7237    If the property value is nil, set *STOP to the position where the
7238    property value is non-nil (limiting by LIMIT), and return BUF.  */
7239
7240 static INLINE int *
7241 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7242                            struct coding_system *coding, int *buf,
7243                            EMACS_INT *stop)
7244 {
7245   Lisp_Object val, next;
7246   int id;
7247
7248   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7249   if (! NILP (val) && CHARSETP (val))
7250     id = XINT (CHARSET_SYMBOL_ID (val));
7251   else
7252     id = -1;
7253   ADD_CHARSET_DATA (buf, 0, id);
7254   next = Fnext_single_property_change (make_number (pos), Qcharset,
7255                                        coding->src_object,
7256                                        make_number (limit));
7257   *stop = XINT (next);
7258   return buf;
7259 }
7260
7261
7262 static void
7263 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7264                int max_lookup)
7265 {
7266   int *buf = coding->charbuf;
7267   int *buf_end = coding->charbuf + coding->charbuf_size;
7268   const unsigned char *src = coding->source + coding->consumed;
7269   const unsigned char *src_end = coding->source + coding->src_bytes;
7270   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7271   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7272   int multibytep = coding->src_multibyte;
7273   Lisp_Object eol_type;
7274   int c;
7275   EMACS_INT stop, stop_composition, stop_charset;
7276   int *lookup_buf = NULL;
7277
7278   if (! NILP (translation_table))
7279     lookup_buf = alloca (sizeof (int) * max_lookup);
7280
7281   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7282   if (VECTORP (eol_type))
7283     eol_type = Qunix;
7284
7285   /* Note: composition handling is not yet implemented.  */
7286   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7287
7288   if (NILP (coding->src_object))
7289     stop = stop_composition = stop_charset = end_pos;
7290   else
7291     {
7292       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7293         stop = stop_composition = pos;
7294       else
7295         stop = stop_composition = end_pos;
7296       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7297         stop = stop_charset = pos;
7298       else
7299         stop_charset = end_pos;
7300     }
7301
7302   /* Compensate for CRLF and conversion.  */
7303   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7304   while (buf < buf_end)
7305     {
7306       Lisp_Object trans;
7307
7308       if (pos == stop)
7309         {
7310           if (pos == end_pos)
7311             break;
7312           if (pos == stop_composition)
7313             buf = handle_composition_annotation (pos, end_pos, coding,
7314                                                  buf, &stop_composition);
7315           if (pos == stop_charset)
7316             buf = handle_charset_annotation (pos, end_pos, coding,
7317                                              buf, &stop_charset);
7318           stop = (stop_composition < stop_charset
7319                   ? stop_composition : stop_charset);
7320         }
7321
7322       if (! multibytep)
7323         {
7324           EMACS_INT bytes;
7325
7326           if (coding->encoder == encode_coding_raw_text
7327               || coding->encoder == encode_coding_ccl)
7328             c = *src++, pos++;
7329           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7330             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7331           else
7332             c = BYTE8_TO_CHAR (*src), src++, pos++;
7333         }
7334       else
7335         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7336       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7337         c = '\n';
7338       if (! EQ (eol_type, Qunix))
7339         {
7340           if (c == '\n')
7341             {
7342               if (EQ (eol_type, Qdos))
7343                 *buf++ = '\r';
7344               else
7345                 c = '\r';
7346             }
7347         }
7348
7349       trans = Qnil;
7350       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7351       if (NILP (trans))
7352         *buf++ = c;
7353       else
7354         {
7355           int from_nchars = 1, to_nchars = 1;
7356           int *lookup_buf_end;
7357           const unsigned char *p = src;
7358           int i;
7359
7360           lookup_buf[0] = c;
7361           for (i = 1; i < max_lookup && p < src_end; i++)
7362             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7363           lookup_buf_end = lookup_buf + i;
7364           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7365           if (INTEGERP (trans))
7366             c = XINT (trans);
7367           else if (CONSP (trans))
7368             {
7369               from_nchars = ASIZE (XCAR (trans));
7370               trans = XCDR (trans);
7371               if (INTEGERP (trans))
7372                 c = XINT (trans);
7373               else
7374                 {
7375                   to_nchars = ASIZE (trans);
7376                   if (buf + to_nchars > buf_end)
7377                     break;
7378                   c = XINT (AREF (trans, 0));
7379                 }
7380             }
7381           else
7382             break;
7383           *buf++ = c;
7384           for (i = 1; i < to_nchars; i++)
7385             *buf++ = XINT (AREF (trans, i));
7386           for (i = 1; i < from_nchars; i++, pos++)
7387             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7388         }
7389     }
7390
7391   coding->consumed = src - coding->source;
7392   coding->consumed_char = pos - coding->src_pos;
7393   coding->charbuf_used = buf - coding->charbuf;
7394   coding->chars_at_source = 0;
7395 }
7396
7397
7398 /* Encode the text at CODING->src_object into CODING->dst_object.
7399    CODING->src_object is a buffer or a string.
7400    CODING->dst_object is a buffer or nil.
7401
7402    If CODING->src_object is a buffer, it must be the current buffer.
7403    In this case, if CODING->src_pos is positive, it is a position of
7404    the source text in the buffer, otherwise. the source text is in the
7405    gap area of the buffer, and coding->src_pos specifies the offset of
7406    the text from GPT (which must be the same as PT).  If this is the
7407    same buffer as CODING->dst_object, CODING->src_pos must be
7408    negative and CODING should not have `pre-write-conversion'.
7409
7410    If CODING->src_object is a string, CODING should not have
7411    `pre-write-conversion'.
7412
7413    If CODING->dst_object is a buffer, the encoded data is inserted at
7414    the current point of that buffer.
7415
7416    If CODING->dst_object is nil, the encoded data is placed at the
7417    memory area specified by CODING->destination.  */
7418
7419 static int
7420 encode_coding (struct coding_system *coding)
7421 {
7422   Lisp_Object attrs;
7423   Lisp_Object translation_table;
7424   int max_lookup;
7425   struct ccl_spec cclspec;
7426
7427   attrs = CODING_ID_ATTRS (coding->id);
7428   if (coding->encoder == encode_coding_raw_text)
7429     translation_table = Qnil, max_lookup = 0;
7430   else
7431     translation_table = get_translation_table (attrs, 1, &max_lookup);
7432
7433   if (BUFFERP (coding->dst_object))
7434     {
7435       set_buffer_internal (XBUFFER (coding->dst_object));
7436       coding->dst_multibyte
7437         = ! NILP (current_buffer->enable_multibyte_characters);
7438     }
7439
7440   coding->consumed = coding->consumed_char = 0;
7441   coding->produced = coding->produced_char = 0;
7442   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7443   coding->errors = 0;
7444
7445   ALLOC_CONVERSION_WORK_AREA (coding);
7446
7447   if (coding->encoder == encode_coding_ccl)
7448     {
7449       coding->spec.ccl = &cclspec;
7450       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7451     }
7452   do {
7453     coding_set_source (coding);
7454     consume_chars (coding, translation_table, max_lookup);
7455     coding_set_destination (coding);
7456     (*(coding->encoder)) (coding);
7457   } while (coding->consumed_char < coding->src_chars);
7458
7459   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7460     insert_from_gap (coding->produced_char, coding->produced);
7461
7462   return (coding->result);
7463 }
7464
7465
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created it is kept for reuse rather than killed after each use
   (make_conversion_work_buffer recreates it if it is found dead).
   It has the name Vcode_conversion_workbuf_name.  The other working
   buffers are destroyed after the use is finished, and their names
   are modified versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, so the
   value can exceed 1; code_conversion_restore resets it to 0 when
   the reused buffer is released.  */
static int reused_workbuf_in_use;
7478
7479
7480 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7481    multibyteness of returning buffer.  */
7482
7483 static Lisp_Object
7484 make_conversion_work_buffer (int multibyte)
7485 {
7486   Lisp_Object name, workbuf;
7487   struct buffer *current;
7488
7489   if (reused_workbuf_in_use++)
7490     {
7491       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7492       workbuf = Fget_buffer_create (name);
7493     }
7494   else
7495     {
7496       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7497         Vcode_conversion_reused_workbuf
7498           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7499       workbuf = Vcode_conversion_reused_workbuf;
7500     }
7501   current = current_buffer;
7502   set_buffer_internal (XBUFFER (workbuf));
7503   /* We can't allow modification hooks to run in the work buffer.  For
7504      instance, directory_files_internal assumes that file decoding
7505      doesn't compile new regexps.  */
7506   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7507   Ferase_buffer ();
7508   current_buffer->undo_list = Qt;
7509   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7510   set_buffer_internal (current);
7511   return workbuf;
7512 }
7513
7514
7515 static Lisp_Object
7516 code_conversion_restore (Lisp_Object arg)
7517 {
7518   Lisp_Object current, workbuf;
7519   struct gcpro gcpro1;
7520
7521   GCPRO1 (arg);
7522   current = XCAR (arg);
7523   workbuf = XCDR (arg);
7524   if (! NILP (workbuf))
7525     {
7526       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7527         reused_workbuf_in_use = 0;
7528       else if (! NILP (Fbuffer_live_p (workbuf)))
7529         Fkill_buffer (workbuf);
7530     }
7531   set_buffer_internal (XBUFFER (current));
7532   UNGCPRO;
7533   return Qnil;
7534 }
7535
7536 Lisp_Object
7537 code_conversion_save (int with_work_buf, int multibyte)
7538 {
7539   Lisp_Object workbuf = Qnil;
7540
7541   if (with_work_buf)
7542     workbuf = make_conversion_work_buffer (multibyte);
7543   record_unwind_protect (code_conversion_restore,
7544                          Fcons (Fcurrent_buffer (), workbuf));
7545   return workbuf;
7546 }
7547
7548 int
7549 decode_coding_gap (struct coding_system *coding,
7550                    EMACS_INT chars, EMACS_INT bytes)
7551 {
7552   int count = SPECPDL_INDEX ();
7553   Lisp_Object attrs;
7554
7555   code_conversion_save (0, 0);
7556
7557   coding->src_object = Fcurrent_buffer ();
7558   coding->src_chars = chars;
7559   coding->src_bytes = bytes;
7560   coding->src_pos = -chars;
7561   coding->src_pos_byte = -bytes;
7562   coding->src_multibyte = chars < bytes;
7563   coding->dst_object = coding->src_object;
7564   coding->dst_pos = PT;
7565   coding->dst_pos_byte = PT_BYTE;
7566   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7567
7568   if (CODING_REQUIRE_DETECTION (coding))
7569     detect_coding (coding);
7570
7571   coding->mode |= CODING_MODE_LAST_BLOCK;
7572   current_buffer->text->inhibit_shrinking = 1;
7573   decode_coding (coding);
7574   current_buffer->text->inhibit_shrinking = 0;
7575
7576   attrs = CODING_ID_ATTRS (coding->id);
7577   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7578     {
7579       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7580       Lisp_Object val;
7581
7582       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7583       val = call1 (CODING_ATTR_POST_READ (attrs),
7584                    make_number (coding->produced_char));
7585       CHECK_NATNUM (val);
7586       coding->produced_char += Z - prev_Z;
7587       coding->produced += Z_BYTE - prev_Z_BYTE;
7588     }
7589
7590   unbind_to (count, Qnil);
7591   return coding->result;
7592 }
7593
7594 int
7595 encode_coding_gap (struct coding_system *coding,
7596                    EMACS_INT chars, EMACS_INT bytes)
7597 {
7598   int count = SPECPDL_INDEX ();
7599
7600   code_conversion_save (0, 0);
7601
7602   coding->src_object = Fcurrent_buffer ();
7603   coding->src_chars = chars;
7604   coding->src_bytes = bytes;
7605   coding->src_pos = -chars;
7606   coding->src_pos_byte = -bytes;
7607   coding->src_multibyte = chars < bytes;
7608   coding->dst_object = coding->src_object;
7609   coding->dst_pos = PT;
7610   coding->dst_pos_byte = PT_BYTE;
7611
7612   encode_coding (coding);
7613
7614   unbind_to (count, Qnil);
7615   return coding->result;
7616 }
7617
7618
7619 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7620    SRC_OBJECT into DST_OBJECT by coding context CODING.
7621
7622    SRC_OBJECT is a buffer, a string, or Qnil.
7623
7624    If it is a buffer, the text is at point of the buffer.  FROM and TO
7625    are positions in the buffer.
7626
7627    If it is a string, the text is at the beginning of the string.
7628    FROM and TO are indices to the string.
7629
7630    If it is nil, the text is at coding->source.  FROM and TO are
7631    indices to coding->source.
7632
7633    DST_OBJECT is a buffer, Qt, or Qnil.
7634
7635    If it is a buffer, the decoded text is inserted at point of the
7636    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7637    is deleted.
7638
7639    If it is Qt, a string is made from the decoded text, and
7640    set in CODING->dst_object.
7641
7642    If it is Qnil, the decoded text is stored at CODING->destination.
7643    The caller must allocate CODING->dst_bytes bytes at
7644    CODING->destination by xmalloc.  If the decoded text is longer than
7645    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7646  */
7647
7648 void
7649 decode_coding_object (struct coding_system *coding,
7650                       Lisp_Object src_object,
7651                       EMACS_INT from, EMACS_INT from_byte,
7652                       EMACS_INT to, EMACS_INT to_byte,
7653                       Lisp_Object dst_object)
7654 {
7655   int count = SPECPDL_INDEX ();
7656   unsigned char *destination;
7657   EMACS_INT dst_bytes;
7658   EMACS_INT chars = to - from;
7659   EMACS_INT bytes = to_byte - from_byte;
7660   Lisp_Object attrs;
7661   int saved_pt = -1, saved_pt_byte;
7662   int need_marker_adjustment = 0;
7663   Lisp_Object old_deactivate_mark;
7664
7665   old_deactivate_mark = Vdeactivate_mark;
7666
7667   if (NILP (dst_object))
7668     {
7669       destination = coding->destination;
7670       dst_bytes = coding->dst_bytes;
7671     }
7672
7673   coding->src_object = src_object;
7674   coding->src_chars = chars;
7675   coding->src_bytes = bytes;
7676   coding->src_multibyte = chars < bytes;
7677
7678   if (STRINGP (src_object))
7679     {
7680       coding->src_pos = from;
7681       coding->src_pos_byte = from_byte;
7682     }
7683   else if (BUFFERP (src_object))
7684     {
7685       set_buffer_internal (XBUFFER (src_object));
7686       if (from != GPT)
7687         move_gap_both (from, from_byte);
7688       if (EQ (src_object, dst_object))
7689         {
7690           struct Lisp_Marker *tail;
7691
7692           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7693             {
7694               tail->need_adjustment
7695                 = tail->charpos == (tail->insertion_type ? from : to);
7696               need_marker_adjustment |= tail->need_adjustment;
7697             }
7698           saved_pt = PT, saved_pt_byte = PT_BYTE;
7699           TEMP_SET_PT_BOTH (from, from_byte);
7700           current_buffer->text->inhibit_shrinking = 1;
7701           del_range_both (from, from_byte, to, to_byte, 1);
7702           coding->src_pos = -chars;
7703           coding->src_pos_byte = -bytes;
7704         }
7705       else
7706         {
7707           coding->src_pos = from;
7708           coding->src_pos_byte = from_byte;
7709         }
7710     }
7711
7712   if (CODING_REQUIRE_DETECTION (coding))
7713     detect_coding (coding);
7714   attrs = CODING_ID_ATTRS (coding->id);
7715
7716   if (EQ (dst_object, Qt)
7717       || (! NILP (CODING_ATTR_POST_READ (attrs))
7718           && NILP (dst_object)))
7719     {
7720       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7721       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7722       coding->dst_pos = BEG;
7723       coding->dst_pos_byte = BEG_BYTE;
7724     }
7725   else if (BUFFERP (dst_object))
7726     {
7727       code_conversion_save (0, 0);
7728       coding->dst_object = dst_object;
7729       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7730       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7731       coding->dst_multibyte
7732         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7733     }
7734   else
7735     {
7736       code_conversion_save (0, 0);
7737       coding->dst_object = Qnil;
7738       /* Most callers presume this will return a multibyte result, and they
7739          won't use `binary' or `raw-text' anyway, so let's not worry about
7740          CODING_FOR_UNIBYTE.  */
7741       coding->dst_multibyte = 1;
7742     }
7743
7744   decode_coding (coding);
7745
7746   if (BUFFERP (coding->dst_object))
7747     set_buffer_internal (XBUFFER (coding->dst_object));
7748
7749   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7750     {
7751       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7752       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7753       Lisp_Object val;
7754
7755       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7756       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7757               old_deactivate_mark);
7758       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7759                         make_number (coding->produced_char));
7760       UNGCPRO;
7761       CHECK_NATNUM (val);
7762       coding->produced_char += Z - prev_Z;
7763       coding->produced += Z_BYTE - prev_Z_BYTE;
7764     }
7765
7766   if (EQ (dst_object, Qt))
7767     {
7768       coding->dst_object = Fbuffer_string ();
7769     }
7770   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7771     {
7772       set_buffer_internal (XBUFFER (coding->dst_object));
7773       if (dst_bytes < coding->produced)
7774         {
7775           destination = xrealloc (destination, coding->produced);
7776           if (! destination)
7777             {
7778               record_conversion_result (coding,
7779                                         CODING_RESULT_INSUFFICIENT_MEM);
7780               unbind_to (count, Qnil);
7781               return;
7782             }
7783           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7784             move_gap_both (BEGV, BEGV_BYTE);
7785           memcpy (destination, BEGV_ADDR, coding->produced);
7786           coding->destination = destination;
7787         }
7788     }
7789
7790   if (saved_pt >= 0)
7791     {
7792       /* This is the case of:
7793          (BUFFERP (src_object) && EQ (src_object, dst_object))
7794          As we have moved PT while replacing the original buffer
7795          contents, we must recover it now.  */
7796       set_buffer_internal (XBUFFER (src_object));
7797       current_buffer->text->inhibit_shrinking = 0;
7798       if (saved_pt < from)
7799         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7800       else if (saved_pt < from + chars)
7801         TEMP_SET_PT_BOTH (from, from_byte);
7802       else if (! NILP (current_buffer->enable_multibyte_characters))
7803         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7804                           saved_pt_byte + (coding->produced - bytes));
7805       else
7806         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7807                           saved_pt_byte + (coding->produced - bytes));
7808
7809       if (need_marker_adjustment)
7810         {
7811           struct Lisp_Marker *tail;
7812
7813           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7814             if (tail->need_adjustment)
7815               {
7816                 tail->need_adjustment = 0;
7817                 if (tail->insertion_type)
7818                   {
7819                     tail->bytepos = from_byte;
7820                     tail->charpos = from;
7821                   }
7822                 else
7823                   {
7824                     tail->bytepos = from_byte + coding->produced;
7825                     tail->charpos
7826                       = (NILP (current_buffer->enable_multibyte_characters)
7827                          ? tail->bytepos : from + coding->produced_char);
7828                   }
7829               }
7830         }
7831     }
7832
7833   Vdeactivate_mark = old_deactivate_mark;
7834   unbind_to (count, coding->dst_object);
7835 }
7836
7837
7838 void
7839 encode_coding_object (struct coding_system *coding,
7840                       Lisp_Object src_object,
7841                       EMACS_INT from, EMACS_INT from_byte,
7842                       EMACS_INT to, EMACS_INT to_byte,
7843                       Lisp_Object dst_object)
7844 {
7845   int count = SPECPDL_INDEX ();
7846   EMACS_INT chars = to - from;
7847   EMACS_INT bytes = to_byte - from_byte;
7848   Lisp_Object attrs;
7849   int saved_pt = -1, saved_pt_byte;
7850   int need_marker_adjustment = 0;
7851   int kill_src_buffer = 0;
7852   Lisp_Object old_deactivate_mark;
7853
7854   old_deactivate_mark = Vdeactivate_mark;
7855
7856   coding->src_object = src_object;
7857   coding->src_chars = chars;
7858   coding->src_bytes = bytes;
7859   coding->src_multibyte = chars < bytes;
7860
7861   attrs = CODING_ID_ATTRS (coding->id);
7862
7863   if (EQ (src_object, dst_object))
7864     {
7865       struct Lisp_Marker *tail;
7866
7867       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7868         {
7869           tail->need_adjustment
7870             = tail->charpos == (tail->insertion_type ? from : to);
7871           need_marker_adjustment |= tail->need_adjustment;
7872         }
7873     }
7874
7875   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7876     {
7877       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7878       set_buffer_internal (XBUFFER (coding->src_object));
7879       if (STRINGP (src_object))
7880         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7881       else if (BUFFERP (src_object))
7882         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7883       else
7884         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7885
7886       if (EQ (src_object, dst_object))
7887         {
7888           set_buffer_internal (XBUFFER (src_object));
7889           saved_pt = PT, saved_pt_byte = PT_BYTE;
7890           del_range_both (from, from_byte, to, to_byte, 1);
7891           set_buffer_internal (XBUFFER (coding->src_object));
7892         }
7893
7894       {
7895         Lisp_Object args[3];
7896         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7897
7898         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7899                 old_deactivate_mark);
7900         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7901         args[1] = make_number (BEG);
7902         args[2] = make_number (Z);
7903         safe_call (3, args);
7904         UNGCPRO;
7905       }
7906       if (XBUFFER (coding->src_object) != current_buffer)
7907         kill_src_buffer = 1;
7908       coding->src_object = Fcurrent_buffer ();
7909       if (BEG != GPT)
7910         move_gap_both (BEG, BEG_BYTE);
7911       coding->src_chars = Z - BEG;
7912       coding->src_bytes = Z_BYTE - BEG_BYTE;
7913       coding->src_pos = BEG;
7914       coding->src_pos_byte = BEG_BYTE;
7915       coding->src_multibyte = Z < Z_BYTE;
7916     }
7917   else if (STRINGP (src_object))
7918     {
7919       code_conversion_save (0, 0);
7920       coding->src_pos = from;
7921       coding->src_pos_byte = from_byte;
7922     }
7923   else if (BUFFERP (src_object))
7924     {
7925       code_conversion_save (0, 0);
7926       set_buffer_internal (XBUFFER (src_object));
7927       if (EQ (src_object, dst_object))
7928         {
7929           saved_pt = PT, saved_pt_byte = PT_BYTE;
7930           coding->src_object = del_range_1 (from, to, 1, 1);
7931           coding->src_pos = 0;
7932           coding->src_pos_byte = 0;
7933         }
7934       else
7935         {
7936           if (from < GPT && to >= GPT)
7937             move_gap_both (from, from_byte);
7938           coding->src_pos = from;
7939           coding->src_pos_byte = from_byte;
7940         }
7941     }
7942   else
7943     code_conversion_save (0, 0);
7944
7945   if (BUFFERP (dst_object))
7946     {
7947       coding->dst_object = dst_object;
7948       if (EQ (src_object, dst_object))
7949         {
7950           coding->dst_pos = from;
7951           coding->dst_pos_byte = from_byte;
7952         }
7953       else
7954         {
7955           struct buffer *current = current_buffer;
7956
7957           set_buffer_temp (XBUFFER (dst_object));
7958           coding->dst_pos = PT;
7959           coding->dst_pos_byte = PT_BYTE;
7960           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7961           set_buffer_temp (current);
7962         }
7963       coding->dst_multibyte
7964         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7965     }
7966   else if (EQ (dst_object, Qt))
7967     {
7968       coding->dst_object = Qnil;
7969       coding->dst_bytes = coding->src_chars;
7970       if (coding->dst_bytes == 0)
7971         coding->dst_bytes = 1;
7972       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7973       coding->dst_multibyte = 0;
7974     }
7975   else
7976     {
7977       coding->dst_object = Qnil;
7978       coding->dst_multibyte = 0;
7979     }
7980
7981   encode_coding (coding);
7982
7983   if (EQ (dst_object, Qt))
7984     {
7985       if (BUFFERP (coding->dst_object))
7986         coding->dst_object = Fbuffer_string ();
7987       else
7988         {
7989           coding->dst_object
7990             = make_unibyte_string ((char *) coding->destination,
7991                                    coding->produced);
7992           xfree (coding->destination);
7993         }
7994     }
7995
7996   if (saved_pt >= 0)
7997     {
7998       /* This is the case of:
7999          (BUFFERP (src_object) && EQ (src_object, dst_object))
8000          As we have moved PT while replacing the original buffer
8001          contents, we must recover it now.  */
8002       set_buffer_internal (XBUFFER (src_object));
8003       if (saved_pt < from)
8004         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8005       else if (saved_pt < from + chars)
8006         TEMP_SET_PT_BOTH (from, from_byte);
8007       else if (! NILP (current_buffer->enable_multibyte_characters))
8008         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8009                           saved_pt_byte + (coding->produced - bytes));
8010       else
8011         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8012                           saved_pt_byte + (coding->produced - bytes));
8013
8014       if (need_marker_adjustment)
8015         {
8016           struct Lisp_Marker *tail;
8017
8018           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8019             if (tail->need_adjustment)
8020               {
8021                 tail->need_adjustment = 0;
8022                 if (tail->insertion_type)
8023                   {
8024                     tail->bytepos = from_byte;
8025                     tail->charpos = from;
8026                   }
8027                 else
8028                   {
8029                     tail->bytepos = from_byte + coding->produced;
8030                     tail->charpos
8031                       = (NILP (current_buffer->enable_multibyte_characters)
8032                          ? tail->bytepos : from + coding->produced_char);
8033                   }
8034               }
8035         }
8036     }
8037
8038   if (kill_src_buffer)
8039     Fkill_buffer (coding->src_object);
8040
8041   Vdeactivate_mark = old_deactivate_mark;
8042   unbind_to (count, Qnil);
8043 }
8044
8045
8046 Lisp_Object
8047 preferred_coding_system (void)
8048 {
8049   int id = coding_categories[coding_priorities[0]].id;
8050
8051   return CODING_ID_NAME (id);
8052 }
8053
8054 \f
8055 #ifdef emacs
8056 /*** 11. Emacs Lisp library functions ***/
8057
8058 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8059        doc: /* Return t if OBJECT is nil or a coding-system.
8060 See the documentation of `define-coding-system' for information
8061 about coding-system objects.  */)
8062   (Lisp_Object object)
8063 {
8064   if (NILP (object)
8065       || CODING_SYSTEM_ID (object) >= 0)
8066     return Qt;
8067   if (! SYMBOLP (object)
8068       || NILP (Fget (object, Qcoding_system_define_form)))
8069     return Qnil;
8070   return Qt;
8071 }
8072
8073 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8074        Sread_non_nil_coding_system, 1, 1, 0,
8075        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8076   (Lisp_Object prompt)
8077 {
8078   Lisp_Object val;
8079   do
8080     {
8081       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8082                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8083     }
8084   while (SCHARS (val) == 0);
8085   return (Fintern (val, Qnil));
8086 }
8087
8088 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8089        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8090 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8091 Ignores case when completing coding systems (all Emacs coding systems
8092 are lower-case).  */)
8093   (Lisp_Object prompt, Lisp_Object default_coding_system)
8094 {
8095   Lisp_Object val;
8096   int count = SPECPDL_INDEX ();
8097
8098   if (SYMBOLP (default_coding_system))
8099     default_coding_system = SYMBOL_NAME (default_coding_system);
8100   specbind (Qcompletion_ignore_case, Qt);
8101   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8102                           Qt, Qnil, Qcoding_system_history,
8103                           default_coding_system, Qnil);
8104   unbind_to (count, Qnil);
8105   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8106 }
8107
8108 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8109        1, 1, 0,
8110        doc: /* Check validity of CODING-SYSTEM.
8111 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8112 It is valid if it is nil or a symbol defined as a coding system by the
8113 function `define-coding-system'.  */)
8114   (Lisp_Object coding_system)
8115 {
8116   Lisp_Object define_form;
8117
8118   define_form = Fget (coding_system, Qcoding_system_define_form);
8119   if (! NILP (define_form))
8120     {
8121       Fput (coding_system, Qcoding_system_define_form, Qnil);
8122       safe_eval (define_form);
8123     }
8124   if (!NILP (Fcoding_system_p (coding_system)))
8125     return coding_system;
8126   xsignal1 (Qcoding_system_error, coding_system);
8127 }
8128
8129 \f
8130 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8131    HIGHEST is nonzero, return the coding system of the highest
8132    priority among the detected coding systems.  Otherwise return a
8133    list of detected coding systems sorted by their priorities.  If
8134    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8135    multibyte form but contain only ASCII and eight-bit chars.
8136    Otherwise, the bytes are raw bytes.
8137
        SRC_CHARS is stored as the character count of the source in the
        coding structure that is handed to the detectors.

8138    CODING-SYSTEM controls the detection as below:
8139
8140    If it is nil, detect both text-format and eol-format.  If the
8141    text-format part of CODING-SYSTEM is already specified
8142    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8143    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8144    detect only text-format.  */
8145
8146 Lisp_Object
8147 detect_coding_system (const unsigned char *src,
8148                       EMACS_INT src_chars, EMACS_INT src_bytes,
8149                       int highest, int multibytep,
8150                       Lisp_Object coding_system)
8151 {
8152   const unsigned char *src_end = src + src_bytes;
8153   Lisp_Object attrs, eol_type;
8154   Lisp_Object val = Qnil;
8155   struct coding_system coding;
8156   int id;
8157   struct coding_detection_info detect_info;
8158   enum coding_category base_category;
8159   int null_byte_found = 0, eight_bit_found = 0;
8160
       /* A nil CODING-SYSTEM is treated as Qundecided.  */
8161   if (NILP (coding_system))
8162     coding_system = Qundecided;
8163   setup_coding_system (coding_system, &coding);
8164   attrs = CODING_ID_ATTRS (coding.id);
8165   eol_type = CODING_ID_EOL_TYPE (coding.id);
8166   coding_system = CODING_ATTR_BASE_NAME (attrs);
8167
       /* Describe the whole input to the detectors as one final block.  */
8168   coding.source = src;
8169   coding.src_chars = src_chars;
8170   coding.src_bytes = src_bytes;
8171   coding.src_multibyte = multibytep;
8172   coding.consumed = 0;
8173   coding.mode |= CODING_MODE_LAST_BLOCK;
8174   coding.head_ascii = 0;
8175
8176   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8177
8178   /* At first, detect text-format if necessary.  */
8179   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8180   if (base_category == coding_category_undecided)
8181     {
8182       enum coding_category category;
8183       struct coding_system *this;
8184       int c, i;
8185
8186       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8187       for (; src < src_end; src++)
8188         {
8189           c = *src;
8190           if (c & 0x80)
8191             {
                   /* Stop scanning once both an eight-bit byte and a
                      null byte have been seen.  */
8192               eight_bit_found = 1;
8193               if (null_byte_found)
8194                 break;
8195             }
8196           else if (c < 0x20)
8197             {
8198               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8199                   && ! inhibit_iso_escape_detection
8200                   && ! detect_info.checked)
8201                 {
8202                   if (detect_coding_iso_2022 (&coding, &detect_info))
8203                     {
8204                       /* We have scanned the whole data.  */
8205                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8206                         {
8207                           /* We didn't find an 8-bit code.  We may
8208                              have found a null-byte, but it's very
8209                              rare that a binary file conform to
8210                              ISO-2022.  */
8211                           src = src_end;
8212                           coding.head_ascii = src - coding.source;
8213                         }
8214                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8215                       break;
8216                     }
8217                 }
8218               else if (! c && !inhibit_null_byte_detection)
8219                 {
8220                   null_byte_found = 1;
8221                   if (eight_bit_found)
8222                     break;
8223                 }
                   /* head_ascii counts the bytes that precede the first
                      eight-bit byte.  */
8224               if (! eight_bit_found)
8225                 coding.head_ascii++;
8226             }
8227           else if (! eight_bit_found)
8228             coding.head_ascii++;
8229         }
8230
           /* Consult the per-category state only when the scan saw
              something other than a plain run of ASCII.  */
8231       if (null_byte_found || eight_bit_found
8232           || coding.head_ascii < coding.src_bytes
8233           || detect_info.found)
8234         {
8235           if (coding.head_ascii == coding.src_bytes)
8236             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8237             for (i = 0; i < coding_category_raw_text; i++)
8238               {
8239                 category = coding_priorities[i];
8240                 this = coding_categories + category;
8241                 if (detect_info.found & (1 << category))
8242                   break;
8243               }
8244           else
8245             {
8246               if (null_byte_found)
8247                 {
                       /* Mark every category except the UTF-16 ones as
                          already checked and rejected.  */
8248                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8249                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8250                 }
                   /* Try the categories in priority order.  CATEGORY and
                      THIS keep the values of the iteration that broke out
                      of the loop and are read again further below.  */
8251               for (i = 0; i < coding_category_raw_text; i++)
8252                 {
8253                   category = coding_priorities[i];
8254                   this = coding_categories + category;
8255
8256                   if (this->id < 0)
8257                     {
8258                       /* No coding system of this category is defined.  */
8259                       detect_info.rejected |= (1 << category);
8260                     }
8261                   else if (category >= coding_category_raw_text)
8262                     continue;
8263                   else if (detect_info.checked & (1 << category))
8264                     {
8265                       if (highest
8266                           && (detect_info.found & (1 << category)))
8267                         break;
8268                     }
8269                   else if ((*(this->detector)) (&coding, &detect_info)
8270                            && highest
8271                            && (detect_info.found & (1 << category)))
8272                     {
8273                       if (category == coding_category_utf_16_auto)
8274                         {
8275                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8276                             category = coding_category_utf_16_le;
8277                           else
8278                             category = coding_category_utf_16_be;
8279                         }
8280                       break;
8281                     }
8282                 }
8283             }
8284         }
8285
           /* Turn the detection masks into VAL, a list of coding system
              ids; the eol pass below replaces each id by a name.  */
8286       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8287           || null_byte_found)
8288         {
               /* Everything was rejected, or a null byte was seen: answer
                  with the id of Qno_conversion.  */
8289           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8290           id = CODING_SYSTEM_ID (Qno_conversion);
8291           val = Fcons (make_number (id), Qnil);
8292         }
8293       else if (! detect_info.rejected && ! detect_info.found)
8294         {
               /* Nothing was decided either way: answer with the coding
                  system of the `undecided' category.  */
8295           detect_info.found = CATEGORY_MASK_ANY;
8296           id = coding_categories[coding_category_undecided].id;
8297           val = Fcons (make_number (id), Qnil);
8298         }
8299       else if (highest)
8300         {
8301           if (detect_info.found)
8302             {
8303               detect_info.found = 1 << category;
8304               val = Fcons (make_number (this->id), Qnil);
8305             }
8306           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8307             for (i = 0; i < coding_category_raw_text; i++)
8308               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8309                 {
8310                   detect_info.found = 1 << coding_priorities[i];
8311                   id = coding_categories[coding_priorities[i]].id;
8312                   val = Fcons (make_number (id), Qnil);
8313                   break;
8314                 }
8315         }
8316       else
8317         {
8318           int mask = detect_info.rejected | detect_info.found;
8319           int found = 0;
8320
               /* Both loops walk the priorities backwards because Fcons
                  prepends.  The first collects categories that were
                  neither rejected nor found; the second then puts the
                  found categories in front of them.  */
8321           for (i = coding_category_raw_text - 1; i >= 0; i--)
8322             {
8323               category = coding_priorities[i];
8324               if (! (mask & (1 << category)))
8325                 {
8326                   found |= 1 << category;
8327                   id = coding_categories[category].id;
8328                   if (id >= 0)
8329                     val = Fcons (make_number (id), val);
8330                 }
8331             }
8332           for (i = coding_category_raw_text - 1; i >= 0; i--)
8333             {
8334               category = coding_priorities[i];
8335               if (detect_info.found & (1 << category))
8336                 {
8337                   id = coding_categories[category].id;
8338                   val = Fcons (make_number (id), val);
8339                 }
8340             }
8341           detect_info.found |= found;
8342         }
8343     }
8344   else if (base_category == coding_category_utf_8_auto)
8345     {
           /* Only the signature question is open: pick the `sig' or
              `nosig' variant.  VAL stays nil if the detector fails.  */
8346       if (detect_coding_utf_8 (&coding, &detect_info))
8347         {
8348           struct coding_system *this;
8349
8350           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8351             this = coding_categories + coding_category_utf_8_sig;
8352           else
8353             this = coding_categories + coding_category_utf_8_nosig;
8354           val = Fcons (make_number (this->id), Qnil);
8355         }
8356     }
8357   else if (base_category == coding_category_utf_16_auto)
8358     {
           /* Pick one of the four UTF-16 variants from the found and
              rejected masks.  VAL stays nil if the detector fails.  */
8359       if (detect_coding_utf_16 (&coding, &detect_info))
8360         {
8361           struct coding_system *this;
8362
8363           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8364             this = coding_categories + coding_category_utf_16_le;
8365           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8366             this = coding_categories + coding_category_utf_16_be;
8367           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8368             this = coding_categories + coding_category_utf_16_be_nosig;
8369           else
8370             this = coding_categories + coding_category_utf_16_le_nosig;
8371           val = Fcons (make_number (this->id), Qnil);
8372         }
8373     }
8374   else
8375     {
           /* The text format is already fixed by CODING-SYSTEM.  */
8376       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8377       val = Fcons (make_number (coding.id), Qnil);
8378     }
8379
8380   /* Then, detect eol-format if necessary.  */
8381   {
8382     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8383     Lisp_Object tail;
8384
         /* A vector eol-type means the eol format is still open, so it
            is detected from the data, separately for the non-UTF-16,
            UTF-16-BE and UTF-16-LE candidates.  */
8385     if (VECTORP (eol_type))
8386       {
8387         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8388           {
8389             if (null_byte_found)
8390               normal_eol = EOL_SEEN_LF;
8391             else
8392               normal_eol = detect_eol (coding.source, src_bytes,
8393                                        coding_category_raw_text);
8394           }
8395         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8396                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8397           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8398                                       coding_category_utf_16_be);
8399         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8400                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8401           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8402                                       coding_category_utf_16_le);
8403       }
8404     else
8405       {
8406         if (EQ (eol_type, Qunix))
8407           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8408         else if (EQ (eol_type, Qdos))
8409           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8410         else
8411           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8412       }
8413
         /* Replace each id in VAL, in place, by a coding system name.
            When the coding system's eol-type is a vector and an eol
            format was seen, use the matching element of that vector.  */
8414     for (tail = val; CONSP (tail); tail = XCDR (tail))
8415       {
8416         enum coding_category category;
8417         int this_eol;
8418
8419         id = XINT (XCAR (tail));
8420         attrs = CODING_ID_ATTRS (id);
8421         category = XINT (CODING_ATTR_CATEGORY (attrs));
8422         eol_type = CODING_ID_EOL_TYPE (id);
8423         if (VECTORP (eol_type))
8424           {
8425             if (category == coding_category_utf_16_be
8426                 || category == coding_category_utf_16_be_nosig)
8427               this_eol = utf_16_be_eol;
8428             else if (category == coding_category_utf_16_le
8429                      || category == coding_category_utf_16_le_nosig)
8430               this_eol = utf_16_le_eol;
8431             else
8432               this_eol = normal_eol;
8433
8434             if (this_eol == EOL_SEEN_LF)
8435               XSETCAR (tail, AREF (eol_type, 0));
8436             else if (this_eol == EOL_SEEN_CRLF)
8437               XSETCAR (tail, AREF (eol_type, 1));
8438             else if (this_eol == EOL_SEEN_CR)
8439               XSETCAR (tail, AREF (eol_type, 2));
8440             else
8441               XSETCAR (tail, CODING_ID_NAME (id));
8442           }
8443         else
8444           XSETCAR (tail, CODING_ID_NAME (id));
8445       }
8446   }
8447
       /* With HIGHEST, return only the first element of VAL (or nil).  */
8448   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8449 }
8450
8451
8452 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8453        2, 3, 0,
8454        doc: /* Detect coding system of the text in the region between START and END.
8455 Return a list of possible coding systems ordered by priority.
8456 The coding systems to try and their priorities follows what
8457 the function `coding-system-priority-list' (which see) returns.
8458
8459 If only ASCII characters are found (except for such ISO-2022 control
8460 characters as ESC), it returns a list of single element `undecided'
8461 or its subsidiary coding system according to a detected end-of-line
8462 format.
8463
8464 If optional argument HIGHEST is non-nil, return the coding system of
8465 highest priority.  */)
8466   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8467 {
8468   int from, to;
8469   int from_byte, to_byte;
8470
8471   CHECK_NUMBER_COERCE_MARKER (start);
8472   CHECK_NUMBER_COERCE_MARKER (end);
8473
8474   validate_region (&start, &end);
8475   from = XINT (start), to = XINT (end);
8476   from_byte = CHAR_TO_BYTE (from);
8477   to_byte = CHAR_TO_BYTE (to);
8478
8479   if (from < GPT && to >= GPT)
8480     move_gap_both (to, to_byte);
8481
8482   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8483                                to - from, to_byte - from_byte,
8484                                !NILP (highest),
8485                                !NILP (current_buffer
8486                                       ->enable_multibyte_characters),
8487                                Qnil);
8488 }
8489
8490 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8491        1, 2, 0,
8492        doc: /* Detect coding system of the text in STRING.
8493 Return a list of possible coding systems ordered by priority.
8494 The coding systems to try and their priorities follows what
8495 the function `coding-system-priority-list' (which see) returns.
8496
8497 If only ASCII characters are found (except for such ISO-2022 control
8498 characters as ESC), it returns a list of single element `undecided'
8499 or its subsidiary coding system according to a detected end-of-line
8500 format.
8501
8502 If optional argument HIGHEST is non-nil, return the coding system of
8503 highest priority.  */)
8504   (Lisp_Object string, Lisp_Object highest)
8505 {
8506   CHECK_STRING (string);
8507
8508   return detect_coding_system (SDATA (string),
8509                                SCHARS (string), SBYTES (string),
8510                                !NILP (highest), STRING_MULTIBYTE (string),
8511                                Qnil);
8512 }
8513
8514
8515 static INLINE int
8516 char_encodable_p (int c, Lisp_Object attrs)
8517 {
8518   Lisp_Object tail;
8519   struct charset *charset;
8520   Lisp_Object translation_table;
8521
8522   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8523   if (! NILP (translation_table))
8524     c = translate_char (translation_table, c);
8525   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8526        CONSP (tail); tail = XCDR (tail))
8527     {
8528       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8529       if (CHAR_CHARSET_P (c, charset))
8530         break;
8531     }
8532   return (! NILP (tail));
8533 }
8534
8535
8536 /* Return a list of coding systems that safely encode the text between
8537    START and END.  If EXCLUDE is non-nil, it is a list of coding
8538    systems not to check.  The returned list doesn't contain any such
8539    coding systems.  In any case, if the text contains only ASCII or is
8540    unibyte, return t.  */
8541
8542 DEFUN ("find-coding-systems-region-internal",
8543        Ffind_coding_systems_region_internal,
8544        Sfind_coding_systems_region_internal, 2, 3, 0,
8545        doc: /* Internal use only.  */)
8546   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8547 {
8548   Lisp_Object coding_attrs_list, safe_codings;
8549   EMACS_INT start_byte, end_byte;
8550   const unsigned char *p, *pbeg, *pend;
8551   int c;
8552   Lisp_Object tail, elt, work_table;
8553
8554   if (STRINGP (start))
8555     {
8556       if (!STRING_MULTIBYTE (start)
8557           || SCHARS (start) == SBYTES (start))
8558         return Qt;
8559       start_byte = 0;
8560       end_byte = SBYTES (start);
8561     }
8562   else
8563     {
8564       CHECK_NUMBER_COERCE_MARKER (start);
8565       CHECK_NUMBER_COERCE_MARKER (end);
8566       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8567         args_out_of_range (start, end);
8568       if (NILP (current_buffer->enable_multibyte_characters))
8569         return Qt;
8570       start_byte = CHAR_TO_BYTE (XINT (start));
8571       end_byte = CHAR_TO_BYTE (XINT (end));
8572       if (XINT (end) - XINT (start) == end_byte - start_byte)
8573         return Qt;
8574
8575       if (XINT (start) < GPT && XINT (end) > GPT)
8576         {
8577           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8578             move_gap_both (XINT (start), start_byte);
8579           else
8580             move_gap_both (XINT (end), end_byte);
8581         }
8582     }
8583
8584   coding_attrs_list = Qnil;
8585   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8586     if (NILP (exclude)
8587         || NILP (Fmemq (XCAR (tail), exclude)))
8588       {
8589         Lisp_Object attrs;
8590
8591         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8592         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8593             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8594           {
8595             ASET (attrs, coding_attr_trans_tbl,
8596                   get_translation_table (attrs, 1, NULL));
8597             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8598           }
8599       }
8600
8601   if (STRINGP (start))
8602     p = pbeg = SDATA (start);
8603   else
8604     p = pbeg = BYTE_POS_ADDR (start_byte);
8605   pend = p + (end_byte - start_byte);
8606
8607   while (p < pend && ASCII_BYTE_P (*p)) p++;
8608   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8609
8610   work_table = Fmake_char_table (Qnil, Qnil);
8611   while (p < pend)
8612     {
8613       if (ASCII_BYTE_P (*p))
8614         p++;
8615       else
8616         {
8617           c = STRING_CHAR_ADVANCE (p);
8618           if (!NILP (char_table_ref (work_table, c)))
8619             /* This character was already checked.  Ignore it.  */
8620             continue;
8621
8622           charset_map_loaded = 0;
8623           for (tail = coding_attrs_list; CONSP (tail);)
8624             {
8625               elt = XCAR (tail);
8626               if (NILP (elt))
8627                 tail = XCDR (tail);
8628               else if (char_encodable_p (c, elt))
8629                 tail = XCDR (tail);
8630               else if (CONSP (XCDR (tail)))
8631                 {
8632                   XSETCAR (tail, XCAR (XCDR (tail)));
8633                   XSETCDR (tail, XCDR (XCDR (tail)));
8634                 }
8635               else
8636                 {
8637                   XSETCAR (tail, Qnil);
8638                   tail = XCDR (tail);
8639                 }
8640             }
8641           if (charset_map_loaded)
8642             {
8643               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8644
8645               if (STRINGP (start))
8646                 pbeg = SDATA (start);
8647               else
8648                 pbeg = BYTE_POS_ADDR (start_byte);
8649               p = pbeg + p_offset;
8650               pend = pbeg + pend_offset;
8651             }
8652           char_table_set (work_table, c, Qt);
8653         }
8654     }
8655
8656   safe_codings = list2 (Qraw_text, Qno_conversion);
8657   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8658     if (! NILP (XCAR (tail)))
8659       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8660
8661   return safe_codings;
8662 }
8663
8664
8665 DEFUN ("unencodable-char-position", Funencodable_char_position,
8666        Sunencodable_char_position, 3, 5, 0,
8667        doc: /*
8668 Return position of first un-encodable character in a region.
8669 START and END specify the region and CODING-SYSTEM specifies the
8670 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8671
8672 If optional 4th argument COUNT is non-nil, it specifies at most how
8673 many un-encodable characters to search.  In this case, the value is a
8674 list of positions.
8675
8676 If optional 5th argument STRING is non-nil, it is a string to search
8677 for un-encodable characters.  In that case, START and END are indexes
8678 to the string.  */)
8679   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8680 {
8681   int n;
8682   struct coding_system coding;
8683   Lisp_Object attrs, charset_list, translation_table;
8684   Lisp_Object positions;
8685   int from, to;
8686   const unsigned char *p, *stop, *pend;
8687   int ascii_compatible;
8688
8689   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8690   attrs = CODING_ID_ATTRS (coding.id);
8691   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8692     return Qnil;
8693   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8694   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8695   translation_table = get_translation_table (attrs, 1, NULL);
8696
8697   if (NILP (string))
8698     {
8699       validate_region (&start, &end);
8700       from = XINT (start);
8701       to = XINT (end);
8702       if (NILP (current_buffer->enable_multibyte_characters)
8703           || (ascii_compatible
8704               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8705         return Qnil;
8706       p = CHAR_POS_ADDR (from);
8707       pend = CHAR_POS_ADDR (to);
8708       if (from < GPT && to >= GPT)
8709         stop = GPT_ADDR;
8710       else
8711         stop = pend;
8712     }
8713   else
8714     {
8715       CHECK_STRING (string);
8716       CHECK_NATNUM (start);
8717       CHECK_NATNUM (end);
8718       from = XINT (start);
8719       to = XINT (end);
8720       if (from > to
8721           || to > SCHARS (string))
8722         args_out_of_range_3 (string, start, end);
8723       if (! STRING_MULTIBYTE (string))
8724         return Qnil;
8725       p = SDATA (string) + string_char_to_byte (string, from);
8726       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8727       if (ascii_compatible && (to - from) == (pend - p))
8728         return Qnil;
8729     }
8730
8731   if (NILP (count))
8732     n = 1;
8733   else
8734     {
8735       CHECK_NATNUM (count);
8736       n = XINT (count);
8737     }
8738
8739   positions = Qnil;
8740   while (1)
8741     {
8742       int c;
8743
8744       if (ascii_compatible)
8745         while (p < stop && ASCII_BYTE_P (*p))
8746           p++, from++;
8747       if (p >= stop)
8748         {
8749           if (p >= pend)
8750             break;
8751           stop = pend;
8752           p = GAP_END_ADDR;
8753         }
8754
8755       c = STRING_CHAR_ADVANCE (p);
8756       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8757           && ! char_charset (translate_char (translation_table, c),
8758                              charset_list, NULL))
8759         {
8760           positions = Fcons (make_number (from), positions);
8761           n--;
8762           if (n == 0)
8763             break;
8764         }
8765
8766       from++;
8767     }
8768
8769   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8770 }
8771
8772
8773 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8774        Scheck_coding_systems_region, 3, 3, 0,
8775        doc: /* Check if the region is encodable by coding systems.
8776
8777 START and END are buffer positions specifying the region.
8778 CODING-SYSTEM-LIST is a list of coding systems to check.
8779
8780 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8781 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8782 whole region, POS0, POS1, ... are buffer positions where non-encodable
8783 characters are found.
8784
8785 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8786 value is nil.
8787
8788 START may be a string.  In that case, check if the string is
8789 encodable, and the value contains indices to the string instead of
8790 buffer positions.  END is ignored.
8791
8792 If the current buffer (or START if it is a string) is unibyte, the value
8793 is nil.  */)
8794   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8795 {
8796   Lisp_Object list;
8797   EMACS_INT start_byte, end_byte;
8798   int pos;
8799   const unsigned char *p, *pbeg, *pend;
8800   int c;
8801   Lisp_Object tail, elt, attrs;
8802
8803   if (STRINGP (start))
8804     {
8805       if (!STRING_MULTIBYTE (start)
8806           || SCHARS (start) == SBYTES (start))
8807         return Qnil;
8808       start_byte = 0;
8809       end_byte = SBYTES (start);
8810       pos = 0;
8811     }
8812   else
8813     {
8814       CHECK_NUMBER_COERCE_MARKER (start);
8815       CHECK_NUMBER_COERCE_MARKER (end);
8816       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8817         args_out_of_range (start, end);
8818       if (NILP (current_buffer->enable_multibyte_characters))
8819         return Qnil;
8820       start_byte = CHAR_TO_BYTE (XINT (start));
8821       end_byte = CHAR_TO_BYTE (XINT (end));
8822       if (XINT (end) - XINT (start) == end_byte - start_byte)
8823         return Qnil;
8824
8825       if (XINT (start) < GPT && XINT (end) > GPT)
8826         {
8827           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8828             move_gap_both (XINT (start), start_byte);
8829           else
8830             move_gap_both (XINT (end), end_byte);
8831         }
8832       pos = XINT (start);
8833     }
8834
8835   list = Qnil;
8836   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8837     {
8838       elt = XCAR (tail);
8839       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8840       ASET (attrs, coding_attr_trans_tbl,
8841             get_translation_table (attrs, 1, NULL));
8842       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8843     }
8844
8845   if (STRINGP (start))
8846     p = pbeg = SDATA (start);
8847   else
8848     p = pbeg = BYTE_POS_ADDR (start_byte);
8849   pend = p + (end_byte - start_byte);
8850
8851   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8852   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8853
8854   while (p < pend)
8855     {
8856       if (ASCII_BYTE_P (*p))
8857         p++;
8858       else
8859         {
8860           c = STRING_CHAR_ADVANCE (p);
8861
8862           charset_map_loaded = 0;
8863           for (tail = list; CONSP (tail); tail = XCDR (tail))
8864             {
8865               elt = XCDR (XCAR (tail));
8866               if (! char_encodable_p (c, XCAR (elt)))
8867                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8868             }
8869           if (charset_map_loaded)
8870             {
8871               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8872
8873               if (STRINGP (start))
8874                 pbeg = SDATA (start);
8875               else
8876                 pbeg = BYTE_POS_ADDR (start_byte);
8877               p = pbeg + p_offset;
8878               pend = pbeg + pend_offset;
8879             }
8880         }
8881       pos++;
8882     }
8883
8884   tail = list;
8885   list = Qnil;
8886   for (; CONSP (tail); tail = XCDR (tail))
8887     {
8888       elt = XCAR (tail);
8889       if (CONSP (XCDR (XCDR (elt))))
8890         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8891                       list);
8892     }
8893
8894   return list;
8895 }
8896
8897
8898 Lisp_Object
8899 code_convert_region (Lisp_Object start, Lisp_Object end,
8900                      Lisp_Object coding_system, Lisp_Object dst_object,
8901                      int encodep, int norecord)
8902 {
8903   struct coding_system coding;
8904   EMACS_INT from, from_byte, to, to_byte;
8905   Lisp_Object src_object;
8906
8907   CHECK_NUMBER_COERCE_MARKER (start);
8908   CHECK_NUMBER_COERCE_MARKER (end);
8909   if (NILP (coding_system))
8910     coding_system = Qno_conversion;
8911   else
8912     CHECK_CODING_SYSTEM (coding_system);
8913   src_object = Fcurrent_buffer ();
8914   if (NILP (dst_object))
8915     dst_object = src_object;
8916   else if (! EQ (dst_object, Qt))
8917     CHECK_BUFFER (dst_object);
8918
8919   validate_region (&start, &end);
8920   from = XFASTINT (start);
8921   from_byte = CHAR_TO_BYTE (from);
8922   to = XFASTINT (end);
8923   to_byte = CHAR_TO_BYTE (to);
8924
8925   setup_coding_system (coding_system, &coding);
8926   coding.mode |= CODING_MODE_LAST_BLOCK;
8927
8928   if (encodep)
8929     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8930                           dst_object);
8931   else
8932     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8933                           dst_object);
8934   if (! norecord)
8935     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8936
8937   return (BUFFERP (dst_object)
8938           ? make_number (coding.produced_char)
8939           : coding.dst_object);
8940 }
8941
8942
8943 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8944        3, 4, "r\nzCoding system: ",
8945        doc: /* Decode the current region from the specified coding system.
8946 When called from a program, takes four arguments:
8947         START, END, CODING-SYSTEM, and DESTINATION.
8948 START and END are buffer positions.
8949
8950 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8951 If nil, the region between START and END is replaced by the decoded text.
8952 If buffer, the decoded text is inserted in that buffer after point (point
8953 does not move).
8954 In those cases, the length of the decoded text is returned.
8955 If DESTINATION is t, the decoded text is returned.
8956
8957 This function sets `last-coding-system-used' to the precise coding system
8958 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8959 not fully specified.)  */)
8960   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8961 {
8962   return code_convert_region (start, end, coding_system, destination, 0, 0);
8963 }
8964
8965 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8966        3, 4, "r\nzCoding system: ",
8967        doc: /* Encode the current region by specified coding system.
8968 When called from a program, takes four arguments:
8969         START, END, CODING-SYSTEM and DESTINATION.
8970 START and END are buffer positions.
8971
8972 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8973 If nil, the region between START and END is replace by the encoded text.
8974 If buffer, the encoded text is inserted in that buffer after point (point
8975 does not move).
8976 In those cases, the length of the encoded text is returned.
8977 If DESTINATION is t, the encoded text is returned.
8978
8979 This function sets `last-coding-system-used' to the precise coding system
8980 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8981 not fully specified.)  */)
8982   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8983 {
8984   return code_convert_region (start, end, coding_system, destination, 1, 0);
8985 }
8986
8987 Lisp_Object
8988 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8989                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8990 {
8991   struct coding_system coding;
8992   EMACS_INT chars, bytes;
8993
8994   CHECK_STRING (string);
8995   if (NILP (coding_system))
8996     {
8997       if (! norecord)
8998         Vlast_coding_system_used = Qno_conversion;
8999       if (NILP (dst_object))
9000         return (nocopy ? Fcopy_sequence (string) : string);
9001     }
9002
9003   if (NILP (coding_system))
9004     coding_system = Qno_conversion;
9005   else
9006     CHECK_CODING_SYSTEM (coding_system);
9007   if (NILP (dst_object))
9008     dst_object = Qt;
9009   else if (! EQ (dst_object, Qt))
9010     CHECK_BUFFER (dst_object);
9011
9012   setup_coding_system (coding_system, &coding);
9013   coding.mode |= CODING_MODE_LAST_BLOCK;
9014   chars = SCHARS (string);
9015   bytes = SBYTES (string);
9016   if (encodep)
9017     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9018   else
9019     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9020   if (! norecord)
9021     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9022
9023   return (BUFFERP (dst_object)
9024           ? make_number (coding.produced_char)
9025           : coding.dst_object);
9026 }
9027
9028
9029 /* Encode or decode STRING according to CODING_SYSTEM.
9030    Do not set Vlast_coding_system_used.
9031
9032    This function is called only from macros DECODE_FILE and
9033    ENCODE_FILE, thus we ignore character composition.  */
9034
9035 Lisp_Object
9036 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9037                               int encodep)
9038 {
9039   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9040 }
9041
9042
9043 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9044        2, 4, 0,
9045        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9046
9047 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9048 if the decoding operation is trivial.
9049
9050 Optional fourth arg BUFFER non-nil means that the decoded text is
9051 inserted in that buffer after point (point does not move).  In this
9052 case, the return value is the length of the decoded text.
9053
9054 This function sets `last-coding-system-used' to the precise coding system
9055 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9056 not fully specified.)  */)
9057   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9058 {
9059   return code_convert_string (string, coding_system, buffer,
9060                               0, ! NILP (nocopy), 0);
9061 }
9062
9063 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9064        2, 4, 0,
9065        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9066
9067 Optional third arg NOCOPY non-nil means it is OK to return STRING
9068 itself if the encoding operation is trivial.
9069
9070 Optional fourth arg BUFFER non-nil means that the encoded text is
9071 inserted in that buffer after point (point does not move).  In this
9072 case, the return value is the length of the encoded text.
9073
9074 This function sets `last-coding-system-used' to the precise coding system
9075 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9076 not fully specified.)  */)
9077   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9078 {
9079   return code_convert_string (string, coding_system, buffer,
9080                               1, ! NILP (nocopy), 1);
9081 }
9082
9083 \f
9084 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9085        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9086 Return the corresponding character.  */)
9087   (Lisp_Object code)
9088 {
9089   Lisp_Object spec, attrs, val;
9090   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9091   int c;
9092
9093   CHECK_NATNUM (code);
9094   c = XFASTINT (code);
9095   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9096   attrs = AREF (spec, 0);
9097
9098   if (ASCII_BYTE_P (c)
9099       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9100     return code;
9101
9102   val = CODING_ATTR_CHARSET_LIST (attrs);
9103   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9104   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9105   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9106
9107   if (c <= 0x7F)
9108     charset = charset_roman;
9109   else if (c >= 0xA0 && c < 0xDF)
9110     {
9111       charset = charset_kana;
9112       c -= 0x80;
9113     }
9114   else
9115     {
9116       int s1 = c >> 8, s2 = c & 0xFF;
9117
9118       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9119           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9120         error ("Invalid code: %d", code);
9121       SJIS_TO_JIS (c);
9122       charset = charset_kanji;
9123     }
9124   c = DECODE_CHAR (charset, c);
9125   if (c < 0)
9126     error ("Invalid code: %d", code);
9127   return make_number (c);
9128 }
9129
9130
9131 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9132        doc: /* Encode a Japanese character CH to shift_jis encoding.
9133 Return the corresponding code in SJIS.  */)
9134   (Lisp_Object ch)
9135 {
9136   Lisp_Object spec, attrs, charset_list;
9137   int c;
9138   struct charset *charset;
9139   unsigned code;
9140
9141   CHECK_CHARACTER (ch);
9142   c = XFASTINT (ch);
9143   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9144   attrs = AREF (spec, 0);
9145
9146   if (ASCII_CHAR_P (c)
9147       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9148     return ch;
9149
9150   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9151   charset = char_charset (c, charset_list, &code);
9152   if (code == CHARSET_INVALID_CODE (charset))
9153     error ("Can't encode by shift_jis encoding: %d", c);
9154   JIS_TO_SJIS (code);
9155
9156   return make_number (code);
9157 }
9158
9159 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9160        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9161 Return the corresponding character.  */)
9162   (Lisp_Object code)
9163 {
9164   Lisp_Object spec, attrs, val;
9165   struct charset *charset_roman, *charset_big5, *charset;
9166   int c;
9167
9168   CHECK_NATNUM (code);
9169   c = XFASTINT (code);
9170   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9171   attrs = AREF (spec, 0);
9172
9173   if (ASCII_BYTE_P (c)
9174       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9175     return code;
9176
9177   val = CODING_ATTR_CHARSET_LIST (attrs);
9178   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9179   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9180
9181   if (c <= 0x7F)
9182     charset = charset_roman;
9183   else
9184     {
9185       int b1 = c >> 8, b2 = c & 0x7F;
9186       if (b1 < 0xA1 || b1 > 0xFE
9187           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9188         error ("Invalid code: %d", code);
9189       charset = charset_big5;
9190     }
9191   c = DECODE_CHAR (charset, (unsigned )c);
9192   if (c < 0)
9193     error ("Invalid code: %d", code);
9194   return make_number (c);
9195 }
9196
9197 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9198        doc: /* Encode the Big5 character CH to BIG5 coding system.
9199 Return the corresponding character code in Big5.  */)
9200   (Lisp_Object ch)
9201 {
9202   Lisp_Object spec, attrs, charset_list;
9203   struct charset *charset;
9204   int c;
9205   unsigned code;
9206
9207   CHECK_CHARACTER (ch);
9208   c = XFASTINT (ch);
9209   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9210   attrs = AREF (spec, 0);
9211   if (ASCII_CHAR_P (c)
9212       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9213     return ch;
9214
9215   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9216   charset = char_charset (c, charset_list, &code);
9217   if (code == CHARSET_INVALID_CODE (charset))
9218     error ("Can't encode by Big5 encoding: %d", c);
9219
9220   return make_number (code);
9221 }
9222
9223 \f
9224 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9225        Sset_terminal_coding_system_internal, 1, 2, 0,
9226        doc: /* Internal use only.  */)
9227   (Lisp_Object coding_system, Lisp_Object terminal)
9228 {
9229   struct terminal *term = get_terminal (terminal, 1);
9230   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9231   CHECK_SYMBOL (coding_system);
9232   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9233   /* We had better not send unsafe characters to terminal.  */
9234   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9235   /* Character composition should be disabled.  */
9236   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9237   terminal_coding->src_multibyte = 1;
9238   terminal_coding->dst_multibyte = 0;
9239   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9240     term->charset_list = coding_charset_list (terminal_coding);
9241   else
9242     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9243   return Qnil;
9244 }
9245
9246 DEFUN ("set-safe-terminal-coding-system-internal",
9247        Fset_safe_terminal_coding_system_internal,
9248        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9249        doc: /* Internal use only.  */)
9250   (Lisp_Object coding_system)
9251 {
9252   CHECK_SYMBOL (coding_system);
9253   setup_coding_system (Fcheck_coding_system (coding_system),
9254                        &safe_terminal_coding);
9255   /* Character composition should be disabled.  */
9256   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9257   safe_terminal_coding.src_multibyte = 1;
9258   safe_terminal_coding.dst_multibyte = 0;
9259   return Qnil;
9260 }
9261
9262 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9263        Sterminal_coding_system, 0, 1, 0,
9264        doc: /* Return coding system specified for terminal output on the given terminal.
9265 TERMINAL may be a terminal object, a frame, or nil for the selected
9266 frame's terminal device.  */)
9267   (Lisp_Object terminal)
9268 {
9269   struct coding_system *terminal_coding
9270     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9271   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9272
9273   /* For backward compatibility, return nil if it is `undecided'. */
9274   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9275 }
9276
9277 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9278        Sset_keyboard_coding_system_internal, 1, 2, 0,
9279        doc: /* Internal use only.  */)
9280   (Lisp_Object coding_system, Lisp_Object terminal)
9281 {
9282   struct terminal *t = get_terminal (terminal, 1);
9283   CHECK_SYMBOL (coding_system);
9284   if (NILP (coding_system))
9285     coding_system = Qno_conversion;
9286   else
9287     Fcheck_coding_system (coding_system);
9288   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9289   /* Character composition should be disabled.  */
9290   TERMINAL_KEYBOARD_CODING (t)->common_flags
9291     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9292   return Qnil;
9293 }
9294
9295 DEFUN ("keyboard-coding-system",
9296        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9297        doc: /* Return coding system specified for decoding keyboard input.  */)
9298   (Lisp_Object terminal)
9299 {
9300   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9301                          (get_terminal (terminal, 1))->id);
9302 }
9303
9304 \f
9305 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9306        Sfind_operation_coding_system,  1, MANY, 0,
9307        doc: /* Choose a coding system for an operation based on the target name.
9308 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9309 DECODING-SYSTEM is the coding system to use for decoding
9310 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9311 for encoding (in case OPERATION does encoding).
9312
9313 The first argument OPERATION specifies an I/O primitive:
9314   For file I/O, `insert-file-contents' or `write-region'.
9315   For process I/O, `call-process', `call-process-region', or `start-process'.
9316   For network I/O, `open-network-stream'.
9317
9318 The remaining arguments should be the same arguments that were passed
9319 to the primitive.  Depending on which primitive, one of those arguments
9320 is selected as the TARGET.  For example, if OPERATION does file I/O,
9321 whichever argument specifies the file name is TARGET.
9322
9323 TARGET has a meaning which depends on OPERATION:
9324   For file I/O, TARGET is a file name (except for the special case below).
9325   For process I/O, TARGET is a process name.
9326   For network I/O, TARGET is a service name or a port number.
9327
9328 This function looks up what is specified for TARGET in
9329 `file-coding-system-alist', `process-coding-system-alist',
9330 or `network-coding-system-alist' depending on OPERATION.
9331 They may specify a coding system, a cons of coding systems,
9332 or a function symbol to call.
9333 In the last case, we call the function with one argument,
9334 which is a list of all the arguments given to this function.
9335 If the function can't decide a coding system, it can return
9336 `undecided' so that the normal code-detection is performed.
9337
9338 If OPERATION is `insert-file-contents', the argument corresponding to
9339 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9340 file name to look up, and BUFFER is a buffer that contains the file's
9341 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9342 function to call for FILENAME, that function should examine the
9343 contents of BUFFER instead of reading the file.
9344
9345 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9346   (int nargs, Lisp_Object *args)
9347 {
9348   Lisp_Object operation, target_idx, target, val;
9349   register Lisp_Object chain;
9350
9351   if (nargs < 2)
9352     error ("Too few arguments");
9353   operation = args[0];
9354   if (!SYMBOLP (operation)
9355       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9356     error ("Invalid first argument");
9357   if (nargs < 1 + XINT (target_idx))
9358     error ("Too few arguments for operation: %s",
9359            SDATA (SYMBOL_NAME (operation)));
9360   target = args[XINT (target_idx) + 1];
9361   if (!(STRINGP (target)
9362         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9363             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9364         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9365     error ("Invalid %dth argument", XINT (target_idx) + 1);
9366   if (CONSP (target))
9367     target = XCAR (target);
9368
9369   chain = ((EQ (operation, Qinsert_file_contents)
9370             || EQ (operation, Qwrite_region))
9371            ? Vfile_coding_system_alist
9372            : (EQ (operation, Qopen_network_stream)
9373               ? Vnetwork_coding_system_alist
9374               : Vprocess_coding_system_alist));
9375   if (NILP (chain))
9376     return Qnil;
9377
9378   for (; CONSP (chain); chain = XCDR (chain))
9379     {
9380       Lisp_Object elt;
9381
9382       elt = XCAR (chain);
9383       if (CONSP (elt)
9384           && ((STRINGP (target)
9385                && STRINGP (XCAR (elt))
9386                && fast_string_match (XCAR (elt), target) >= 0)
9387               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9388         {
9389           val = XCDR (elt);
9390           /* Here, if VAL is both a valid coding system and a valid
9391              function symbol, we return VAL as a coding system.  */
9392           if (CONSP (val))
9393             return val;
9394           if (! SYMBOLP (val))
9395             return Qnil;
9396           if (! NILP (Fcoding_system_p (val)))
9397             return Fcons (val, val);
9398           if (! NILP (Ffboundp (val)))
9399             {
9400               /* We use call1 rather than safe_call1
9401                  so as to get bug reports about functions called here
9402                  which don't handle the current interface.  */
9403               val = call1 (val, Flist (nargs, args));
9404               if (CONSP (val))
9405                 return val;
9406               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9407                 return Fcons (val, val);
9408             }
9409           return Qnil;
9410         }
9411     }
9412   return Qnil;
9413 }
9414
9415 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9416        Sset_coding_system_priority, 0, MANY, 0,
9417        doc: /* Assign higher priority to the coding systems given as arguments.
9418 If multiple coding systems belong to the same category,
9419 all but the first one are ignored.
9420
9421 usage: (set-coding-system-priority &rest coding-systems)  */)
9422   (int nargs, Lisp_Object *args)
9423 {
9424   int i, j;
9425   int changed[coding_category_max];
9426   enum coding_category priorities[coding_category_max];
9427
9428   memset (changed, 0, sizeof changed);
9429
9430   for (i = j = 0; i < nargs; i++)
9431     {
9432       enum coding_category category;
9433       Lisp_Object spec, attrs;
9434
9435       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9436       attrs = AREF (spec, 0);
9437       category = XINT (CODING_ATTR_CATEGORY (attrs));
9438       if (changed[category])
9439         /* Ignore this coding system because a coding system of the
9440            same category already had a higher priority.  */
9441         continue;
9442       changed[category] = 1;
9443       priorities[j++] = category;
9444       if (coding_categories[category].id >= 0
9445           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9446         setup_coding_system (args[i], &coding_categories[category]);
9447       Fset (AREF (Vcoding_category_table, category), args[i]);
9448     }
9449
9450   /* Now we have decided top J priorities.  Reflect the order of the
9451      original priorities to the remaining priorities.  */
9452
9453   for (i = j, j = 0; i < coding_category_max; i++, j++)
9454     {
9455       while (j < coding_category_max
9456              && changed[coding_priorities[j]])
9457         j++;
9458       if (j == coding_category_max)
9459         abort ();
9460       priorities[i] = coding_priorities[j];
9461     }
9462
9463   memcpy (coding_priorities, priorities, sizeof priorities);
9464
9465   /* Update `coding-category-list'.  */
9466   Vcoding_category_list = Qnil;
9467   for (i = coding_category_max - 1; i >= 0; i--)
9468     Vcoding_category_list
9469       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9470                Vcoding_category_list);
9471
9472   return Qnil;
9473 }
9474
9475 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9476        Scoding_system_priority_list, 0, 1, 0,
9477        doc: /* Return a list of coding systems ordered by their priorities.
9478 The list contains a subset of coding systems; i.e. coding systems
9479 assigned to each coding category (see `coding-category-list').
9480
9481 HIGHESTP non-nil means just return the highest priority one.  */)
9482   (Lisp_Object highestp)
9483 {
9484   int i;
9485   Lisp_Object val;
9486
9487   for (i = 0, val = Qnil; i < coding_category_max; i++)
9488     {
9489       enum coding_category category = coding_priorities[i];
9490       int id = coding_categories[category].id;
9491       Lisp_Object attrs;
9492
9493       if (id < 0)
9494         continue;
9495       attrs = CODING_ID_ATTRS (id);
9496       if (! NILP (highestp))
9497         return CODING_ATTR_BASE_NAME (attrs);
9498       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9499     }
9500   return Fnreverse (val);
9501 }
9502
9503 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9504
9505 static Lisp_Object
9506 make_subsidiaries (Lisp_Object base)
9507 {
9508   Lisp_Object subsidiaries;
9509   int base_name_len = SBYTES (SYMBOL_NAME (base));
9510   char *buf = (char *) alloca (base_name_len + 6);
9511   int i;
9512
9513   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9514   subsidiaries = Fmake_vector (make_number (3), Qnil);
9515   for (i = 0; i < 3; i++)
9516     {
9517       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9518       ASET (subsidiaries, i, intern (buf));
9519     }
9520   return subsidiaries;
9521 }
9522
9523
9524 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9525        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9526        doc: /* For internal use only.
9527 usage: (define-coding-system-internal ...)  */)
9528   (int nargs, Lisp_Object *args)
9529 {
9530   Lisp_Object name;
9531   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9532   Lisp_Object attrs;            /* Vector of attributes.  */
9533   Lisp_Object eol_type;
9534   Lisp_Object aliases;
9535   Lisp_Object coding_type, charset_list, safe_charsets;
9536   enum coding_category category;
9537   Lisp_Object tail, val;
9538   int max_charset_id = 0;
9539   int i;
9540
9541   if (nargs < coding_arg_max)
9542     goto short_args;
9543
9544   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9545
9546   name = args[coding_arg_name];
9547   CHECK_SYMBOL (name);
9548   CODING_ATTR_BASE_NAME (attrs) = name;
9549
9550   val = args[coding_arg_mnemonic];
9551   if (! STRINGP (val))
9552     CHECK_CHARACTER (val);
9553   CODING_ATTR_MNEMONIC (attrs) = val;
9554
9555   coding_type = args[coding_arg_coding_type];
9556   CHECK_SYMBOL (coding_type);
9557   CODING_ATTR_TYPE (attrs) = coding_type;
9558
9559   charset_list = args[coding_arg_charset_list];
9560   if (SYMBOLP (charset_list))
9561     {
9562       if (EQ (charset_list, Qiso_2022))
9563         {
9564           if (! EQ (coding_type, Qiso_2022))
9565             error ("Invalid charset-list");
9566           charset_list = Viso_2022_charset_list;
9567         }
9568       else if (EQ (charset_list, Qemacs_mule))
9569         {
9570           if (! EQ (coding_type, Qemacs_mule))
9571             error ("Invalid charset-list");
9572           charset_list = Vemacs_mule_charset_list;
9573         }
9574       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9575         if (max_charset_id < XFASTINT (XCAR (tail)))
9576           max_charset_id = XFASTINT (XCAR (tail));
9577     }
9578   else
9579     {
9580       charset_list = Fcopy_sequence (charset_list);
9581       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9582         {
9583           struct charset *charset;
9584
9585           val = XCAR (tail);
9586           CHECK_CHARSET_GET_CHARSET (val, charset);
9587           if (EQ (coding_type, Qiso_2022)
9588               ? CHARSET_ISO_FINAL (charset) < 0
9589               : EQ (coding_type, Qemacs_mule)
9590               ? CHARSET_EMACS_MULE_ID (charset) < 0
9591               : 0)
9592             error ("Can't handle charset `%s'",
9593                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9594
9595           XSETCAR (tail, make_number (charset->id));
9596           if (max_charset_id < charset->id)
9597             max_charset_id = charset->id;
9598         }
9599     }
9600   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9601
9602   safe_charsets = make_uninit_string (max_charset_id + 1);
9603   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9604   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9605     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9606   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9607
9608   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9609
9610   val = args[coding_arg_decode_translation_table];
9611   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9612     CHECK_SYMBOL (val);
9613   CODING_ATTR_DECODE_TBL (attrs) = val;
9614
9615   val = args[coding_arg_encode_translation_table];
9616   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9617     CHECK_SYMBOL (val);
9618   CODING_ATTR_ENCODE_TBL (attrs) = val;
9619
9620   val = args[coding_arg_post_read_conversion];
9621   CHECK_SYMBOL (val);
9622   CODING_ATTR_POST_READ (attrs) = val;
9623
9624   val = args[coding_arg_pre_write_conversion];
9625   CHECK_SYMBOL (val);
9626   CODING_ATTR_PRE_WRITE (attrs) = val;
9627
9628   val = args[coding_arg_default_char];
9629   if (NILP (val))
9630     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9631   else
9632     {
9633       CHECK_CHARACTER (val);
9634       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9635     }
9636
9637   val = args[coding_arg_for_unibyte];
9638   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9639
9640   val = args[coding_arg_plist];
9641   CHECK_LIST (val);
9642   CODING_ATTR_PLIST (attrs) = val;
9643
9644   if (EQ (coding_type, Qcharset))
9645     {
9646       /* Generate a lisp vector of 256 elements.  Each element is nil,
9647          integer, or a list of charset IDs.
9648
9649          If Nth element is nil, the byte code N is invalid in this
9650          coding system.
9651
9652          If Nth element is a number NUM, N is the first byte of a
9653          charset whose ID is NUM.
9654
9655          If Nth element is a list of charset IDs, N is the first byte
9656          of one of them.  The list is sorted by dimensions of the
9657          charsets.  A charset of smaller dimension comes first. */
9658       val = Fmake_vector (make_number (256), Qnil);
9659
9660       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9661         {
9662           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9663           int dim = CHARSET_DIMENSION (charset);
9664           int idx = (dim - 1) * 4;
9665
9666           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9667             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9668
9669           for (i = charset->code_space[idx];
9670                i <= charset->code_space[idx + 1]; i++)
9671             {
9672               Lisp_Object tmp, tmp2;
9673               int dim2;
9674
9675               tmp = AREF (val, i);
9676               if (NILP (tmp))
9677                 tmp = XCAR (tail);
9678               else if (NUMBERP (tmp))
9679                 {
9680                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9681                   if (dim < dim2)
9682                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9683                   else
9684                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9685                 }
9686               else
9687                 {
9688                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9689                     {
9690                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9691                       if (dim < dim2)
9692                         break;
9693                     }
9694                   if (NILP (tmp2))
9695                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9696                   else
9697                     {
9698                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9699                       XSETCAR (tmp2, XCAR (tail));
9700                     }
9701                 }
9702               ASET (val, i, tmp);
9703             }
9704         }
9705       ASET (attrs, coding_attr_charset_valids, val);
9706       category = coding_category_charset;
9707     }
9708   else if (EQ (coding_type, Qccl))
9709     {
9710       Lisp_Object valids;
9711
9712       if (nargs < coding_arg_ccl_max)
9713         goto short_args;
9714
9715       val = args[coding_arg_ccl_decoder];
9716       CHECK_CCL_PROGRAM (val);
9717       if (VECTORP (val))
9718         val = Fcopy_sequence (val);
9719       ASET (attrs, coding_attr_ccl_decoder, val);
9720
9721       val = args[coding_arg_ccl_encoder];
9722       CHECK_CCL_PROGRAM (val);
9723       if (VECTORP (val))
9724         val = Fcopy_sequence (val);
9725       ASET (attrs, coding_attr_ccl_encoder, val);
9726
9727       val = args[coding_arg_ccl_valids];
9728       valids = Fmake_string (make_number (256), make_number (0));
9729       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9730         {
9731           int from, to;
9732
9733           val = Fcar (tail);
9734           if (INTEGERP (val))
9735             {
9736               from = to = XINT (val);
9737               if (from < 0 || from > 255)
9738                 args_out_of_range_3 (val, make_number (0), make_number (255));
9739             }
9740           else
9741             {
9742               CHECK_CONS (val);
9743               CHECK_NATNUM_CAR (val);
9744               CHECK_NATNUM_CDR (val);
9745               from = XINT (XCAR (val));
9746               if (from > 255)
9747                 args_out_of_range_3 (XCAR (val),
9748                                      make_number (0), make_number (255));
9749               to = XINT (XCDR (val));
9750               if (to < from || to > 255)
9751                 args_out_of_range_3 (XCDR (val),
9752                                      XCAR (val), make_number (255));
9753             }
9754           for (i = from; i <= to; i++)
9755             SSET (valids, i, 1);
9756         }
9757       ASET (attrs, coding_attr_ccl_valids, valids);
9758
9759       category = coding_category_ccl;
9760     }
9761   else if (EQ (coding_type, Qutf_16))
9762     {
9763       Lisp_Object bom, endian;
9764
9765       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9766
9767       if (nargs < coding_arg_utf16_max)
9768         goto short_args;
9769
9770       bom = args[coding_arg_utf16_bom];
9771       if (! NILP (bom) && ! EQ (bom, Qt))
9772         {
9773           CHECK_CONS (bom);
9774           val = XCAR (bom);
9775           CHECK_CODING_SYSTEM (val);
9776           val = XCDR (bom);
9777           CHECK_CODING_SYSTEM (val);
9778         }
9779       ASET (attrs, coding_attr_utf_bom, bom);
9780
9781       endian = args[coding_arg_utf16_endian];
9782       CHECK_SYMBOL (endian);
9783       if (NILP (endian))
9784         endian = Qbig;
9785       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9786         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9787       ASET (attrs, coding_attr_utf_16_endian, endian);
9788
9789       category = (CONSP (bom)
9790                   ? coding_category_utf_16_auto
9791                   : NILP (bom)
9792                   ? (EQ (endian, Qbig)
9793                      ? coding_category_utf_16_be_nosig
9794                      : coding_category_utf_16_le_nosig)
9795                   : (EQ (endian, Qbig)
9796                      ? coding_category_utf_16_be
9797                      : coding_category_utf_16_le));
9798     }
9799   else if (EQ (coding_type, Qiso_2022))
9800     {
9801       Lisp_Object initial, reg_usage, request, flags;
9802       int i;
9803
9804       if (nargs < coding_arg_iso2022_max)
9805         goto short_args;
9806
9807       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9808       CHECK_VECTOR (initial);
9809       for (i = 0; i < 4; i++)
9810         {
9811           val = Faref (initial, make_number (i));
9812           if (! NILP (val))
9813             {
9814               struct charset *charset;
9815
9816               CHECK_CHARSET_GET_CHARSET (val, charset);
9817               ASET (initial, i, make_number (CHARSET_ID (charset)));
9818               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9819                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9820             }
9821           else
9822             ASET (initial, i, make_number (-1));
9823         }
9824
9825       reg_usage = args[coding_arg_iso2022_reg_usage];
9826       CHECK_CONS (reg_usage);
9827       CHECK_NUMBER_CAR (reg_usage);
9828       CHECK_NUMBER_CDR (reg_usage);
9829
9830       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9831       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9832         {
9833           int id;
9834           Lisp_Object tmp;
9835
9836           val = Fcar (tail);
9837           CHECK_CONS (val);
9838           tmp = XCAR (val);
9839           CHECK_CHARSET_GET_ID (tmp, id);
9840           CHECK_NATNUM_CDR (val);
9841           if (XINT (XCDR (val)) >= 4)
9842             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9843           XSETCAR (val, make_number (id));
9844         }
9845
9846       flags = args[coding_arg_iso2022_flags];
9847       CHECK_NATNUM (flags);
9848       i = XINT (flags);
9849       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9850         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9851
9852       ASET (attrs, coding_attr_iso_initial, initial);
9853       ASET (attrs, coding_attr_iso_usage, reg_usage);
9854       ASET (attrs, coding_attr_iso_request, request);
9855       ASET (attrs, coding_attr_iso_flags, flags);
9856       setup_iso_safe_charsets (attrs);
9857
9858       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9859         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9860                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9861                     ? coding_category_iso_7_else
9862                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9863                     ? coding_category_iso_7
9864                     : coding_category_iso_7_tight);
9865       else
9866         {
9867           int id = XINT (AREF (initial, 1));
9868
9869           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9870                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9871                        || id < 0)
9872                       ? coding_category_iso_8_else
9873                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9874                       ? coding_category_iso_8_1
9875                       : coding_category_iso_8_2);
9876         }
9877       if (category != coding_category_iso_8_1
9878           && category != coding_category_iso_8_2)
9879         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9880     }
9881   else if (EQ (coding_type, Qemacs_mule))
9882     {
9883       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9884         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9885       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9886       category = coding_category_emacs_mule;
9887     }
9888   else if (EQ (coding_type, Qshift_jis))
9889     {
9890
9891       struct charset *charset;
9892
9893       if (XINT (Flength (charset_list)) != 3
9894           && XINT (Flength (charset_list)) != 4)
9895         error ("There should be three or four charsets");
9896
9897       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9898       if (CHARSET_DIMENSION (charset) != 1)
9899         error ("Dimension of charset %s is not one",
9900                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9901       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9902         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9903
9904       charset_list = XCDR (charset_list);
9905       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9906       if (CHARSET_DIMENSION (charset) != 1)
9907         error ("Dimension of charset %s is not one",
9908                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9909
9910       charset_list = XCDR (charset_list);
9911       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9912       if (CHARSET_DIMENSION (charset) != 2)
9913         error ("Dimension of charset %s is not two",
9914                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9915
9916       charset_list = XCDR (charset_list);
9917       if (! NILP (charset_list))
9918         {
9919           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9920           if (CHARSET_DIMENSION (charset) != 2)
9921             error ("Dimension of charset %s is not two",
9922                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9923         }
9924
9925       category = coding_category_sjis;
9926       Vsjis_coding_system = name;
9927     }
9928   else if (EQ (coding_type, Qbig5))
9929     {
9930       struct charset *charset;
9931
9932       if (XINT (Flength (charset_list)) != 2)
9933         error ("There should be just two charsets");
9934
9935       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9936       if (CHARSET_DIMENSION (charset) != 1)
9937         error ("Dimension of charset %s is not one",
9938                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9939       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9940         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9941
9942       charset_list = XCDR (charset_list);
9943       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9944       if (CHARSET_DIMENSION (charset) != 2)
9945         error ("Dimension of charset %s is not two",
9946                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9947
9948       category = coding_category_big5;
9949       Vbig5_coding_system = name;
9950     }
9951   else if (EQ (coding_type, Qraw_text))
9952     {
9953       category = coding_category_raw_text;
9954       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9955     }
9956   else if (EQ (coding_type, Qutf_8))
9957     {
9958       Lisp_Object bom;
9959
9960       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9961
9962       if (nargs < coding_arg_utf8_max)
9963         goto short_args;
9964
9965       bom = args[coding_arg_utf8_bom];
9966       if (! NILP (bom) && ! EQ (bom, Qt))
9967         {
9968           CHECK_CONS (bom);
9969           val = XCAR (bom);
9970           CHECK_CODING_SYSTEM (val);
9971           val = XCDR (bom);
9972           CHECK_CODING_SYSTEM (val);
9973         }
9974       ASET (attrs, coding_attr_utf_bom, bom);
9975
9976       category = (CONSP (bom) ? coding_category_utf_8_auto
9977                   : NILP (bom) ? coding_category_utf_8_nosig
9978                   : coding_category_utf_8_sig);
9979     }
9980   else if (EQ (coding_type, Qundecided))
9981     category = coding_category_undecided;
9982   else
9983     error ("Invalid coding system type: %s",
9984            SDATA (SYMBOL_NAME (coding_type)));
9985
9986   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9987   CODING_ATTR_PLIST (attrs)
9988     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9989                                 CODING_ATTR_PLIST (attrs)));
9990   CODING_ATTR_PLIST (attrs)
9991     = Fcons (QCascii_compatible_p,
9992              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9993                     CODING_ATTR_PLIST (attrs)));
9994
9995   eol_type = args[coding_arg_eol_type];
9996   if (! NILP (eol_type)
9997       && ! EQ (eol_type, Qunix)
9998       && ! EQ (eol_type, Qdos)
9999       && ! EQ (eol_type, Qmac))
10000     error ("Invalid eol-type");
10001
10002   aliases = Fcons (name, Qnil);
10003
10004   if (NILP (eol_type))
10005     {
10006       eol_type = make_subsidiaries (name);
10007       for (i = 0; i < 3; i++)
10008         {
10009           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10010
10011           this_name = AREF (eol_type, i);
10012           this_aliases = Fcons (this_name, Qnil);
10013           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10014           this_spec = Fmake_vector (make_number (3), attrs);
10015           ASET (this_spec, 1, this_aliases);
10016           ASET (this_spec, 2, this_eol_type);
10017           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10018           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10019           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10020           if (NILP (val))
10021             Vcoding_system_alist
10022               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10023                        Vcoding_system_alist);
10024         }
10025     }
10026
10027   spec_vec = Fmake_vector (make_number (3), attrs);
10028   ASET (spec_vec, 1, aliases);
10029   ASET (spec_vec, 2, eol_type);
10030
10031   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10032   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10033   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10034   if (NILP (val))
10035     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10036                                   Vcoding_system_alist);
10037
10038   {
10039     int id = coding_categories[category].id;
10040
10041     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10042       setup_coding_system (name, &coding_categories[category]);
10043   }
10044
10045   return Qnil;
10046
10047  short_args:
10048   return Fsignal (Qwrong_number_of_arguments,
10049                   Fcons (intern ("define-coding-system-internal"),
10050                          make_number (nargs)));
10051 }
10052
10053
10054 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10055        3, 3, 0,
10056        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10057   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10058 {
10059   Lisp_Object spec, attrs;
10060
10061   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10062   attrs = AREF (spec, 0);
10063   if (EQ (prop, QCmnemonic))
10064     {
10065       if (! STRINGP (val))
10066         CHECK_CHARACTER (val);
10067       CODING_ATTR_MNEMONIC (attrs) = val;
10068     }
10069   else if (EQ (prop, QCdefault_char))
10070     {
10071       if (NILP (val))
10072         val = make_number (' ');
10073       else
10074         CHECK_CHARACTER (val);
10075       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10076     }
10077   else if (EQ (prop, QCdecode_translation_table))
10078     {
10079       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10080         CHECK_SYMBOL (val);
10081       CODING_ATTR_DECODE_TBL (attrs) = val;
10082     }
10083   else if (EQ (prop, QCencode_translation_table))
10084     {
10085       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10086         CHECK_SYMBOL (val);
10087       CODING_ATTR_ENCODE_TBL (attrs) = val;
10088     }
10089   else if (EQ (prop, QCpost_read_conversion))
10090     {
10091       CHECK_SYMBOL (val);
10092       CODING_ATTR_POST_READ (attrs) = val;
10093     }
10094   else if (EQ (prop, QCpre_write_conversion))
10095     {
10096       CHECK_SYMBOL (val);
10097       CODING_ATTR_PRE_WRITE (attrs) = val;
10098     }
10099   else if (EQ (prop, QCascii_compatible_p))
10100     {
10101       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10102     }
10103
10104   CODING_ATTR_PLIST (attrs)
10105     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10106   return val;
10107 }
10108
10109
10110 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10111        Sdefine_coding_system_alias, 2, 2, 0,
10112        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10113   (Lisp_Object alias, Lisp_Object coding_system)
10114 {
10115   Lisp_Object spec, aliases, eol_type, val;
10116
10117   CHECK_SYMBOL (alias);
10118   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10119   aliases = AREF (spec, 1);
10120   /* ALIASES should be a list of length more than zero, and the first
10121      element is a base coding system.  Append ALIAS at the tail of the
10122      list.  */
10123   while (!NILP (XCDR (aliases)))
10124     aliases = XCDR (aliases);
10125   XSETCDR (aliases, Fcons (alias, Qnil));
10126
10127   eol_type = AREF (spec, 2);
10128   if (VECTORP (eol_type))
10129     {
10130       Lisp_Object subsidiaries;
10131       int i;
10132
10133       subsidiaries = make_subsidiaries (alias);
10134       for (i = 0; i < 3; i++)
10135         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10136                                      AREF (eol_type, i));
10137     }
10138
10139   Fputhash (alias, spec, Vcoding_system_hash_table);
10140   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10141   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10142   if (NILP (val))
10143     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10144                                   Vcoding_system_alist);
10145
10146   return Qnil;
10147 }
10148
10149 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10150        1, 1, 0,
10151        doc: /* Return the base of CODING-SYSTEM.
10152 Any alias or subsidiary coding system is not a base coding system.  */)
10153   (Lisp_Object coding_system)
10154 {
10155   Lisp_Object spec, attrs;
10156
10157   if (NILP (coding_system))
10158     return (Qno_conversion);
10159   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10160   attrs = AREF (spec, 0);
10161   return CODING_ATTR_BASE_NAME (attrs);
10162 }
10163
10164 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10165        1, 1, 0,
10166        doc: "Return the property list of CODING-SYSTEM.")
10167   (Lisp_Object coding_system)
10168 {
10169   Lisp_Object spec, attrs;
10170
10171   if (NILP (coding_system))
10172     coding_system = Qno_conversion;
10173   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10174   attrs = AREF (spec, 0);
10175   return CODING_ATTR_PLIST (attrs);
10176 }
10177
10178
10179 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10180        1, 1, 0,
10181        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10182   (Lisp_Object coding_system)
10183 {
10184   Lisp_Object spec;
10185
10186   if (NILP (coding_system))
10187     coding_system = Qno_conversion;
10188   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10189   return AREF (spec, 1);
10190 }
10191
10192 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10193        Scoding_system_eol_type, 1, 1, 0,
10194        doc: /* Return eol-type of CODING-SYSTEM.
10195 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10196
10197 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10198 and CR respectively.
10199
10200 A vector value indicates that a format of end-of-line should be
10201 detected automatically.  Nth element of the vector is the subsidiary
10202 coding system whose eol-type is N.  */)
10203   (Lisp_Object coding_system)
10204 {
10205   Lisp_Object spec, eol_type;
10206   int n;
10207
10208   if (NILP (coding_system))
10209     coding_system = Qno_conversion;
10210   if (! CODING_SYSTEM_P (coding_system))
10211     return Qnil;
10212   spec = CODING_SYSTEM_SPEC (coding_system);
10213   eol_type = AREF (spec, 2);
10214   if (VECTORP (eol_type))
10215     return Fcopy_sequence (eol_type);
10216   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10217   return make_number (n);
10218 }
10219
10220 #endif /* emacs */
10221
10222 \f
10223 /*** 9. Post-amble ***/
10224
10225 void
10226 init_coding_once (void)
10227 {
10228   int i;
10229
10230   for (i = 0; i < coding_category_max; i++)
10231     {
10232       coding_categories[i].id = -1;
10233       coding_priorities[i] = i;
10234     }
10235
10236   /* ISO2022 specific initialize routine.  */
10237   for (i = 0; i < 0x20; i++)
10238     iso_code_class[i] = ISO_control_0;
10239   for (i = 0x21; i < 0x7F; i++)
10240     iso_code_class[i] = ISO_graphic_plane_0;
10241   for (i = 0x80; i < 0xA0; i++)
10242     iso_code_class[i] = ISO_control_1;
10243   for (i = 0xA1; i < 0xFF; i++)
10244     iso_code_class[i] = ISO_graphic_plane_1;
10245   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10246   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10247   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10248   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10249   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10250   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10251   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10252   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10253   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10254
10255   for (i = 0; i < 256; i++)
10256     {
10257       emacs_mule_bytes[i] = 1;
10258     }
10259   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10260   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10261   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10262   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10263 }
10264
10265 #ifdef emacs
10266
10267 void
10268 syms_of_coding (void)
10269 {
10270   staticpro (&Vcoding_system_hash_table);
10271   {
10272     Lisp_Object args[2];
10273     args[0] = QCtest;
10274     args[1] = Qeq;
10275     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10276   }
10277
10278   staticpro (&Vsjis_coding_system);
10279   Vsjis_coding_system = Qnil;
10280
10281   staticpro (&Vbig5_coding_system);
10282   Vbig5_coding_system = Qnil;
10283
10284   staticpro (&Vcode_conversion_reused_workbuf);
10285   Vcode_conversion_reused_workbuf = Qnil;
10286
10287   staticpro (&Vcode_conversion_workbuf_name);
10288   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10289
10290   reused_workbuf_in_use = 0;
10291
10292   DEFSYM (Qcharset, "charset");
10293   DEFSYM (Qtarget_idx, "target-idx");
10294   DEFSYM (Qcoding_system_history, "coding-system-history");
10295   Fset (Qcoding_system_history, Qnil);
10296
10297   /* Target FILENAME is the first argument.  */
10298   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10299   /* Target FILENAME is the third argument.  */
10300   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10301
10302   DEFSYM (Qcall_process, "call-process");
10303   /* Target PROGRAM is the first argument.  */
10304   Fput (Qcall_process, Qtarget_idx, make_number (0));
10305
10306   DEFSYM (Qcall_process_region, "call-process-region");
10307   /* Target PROGRAM is the third argument.  */
10308   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10309
10310   DEFSYM (Qstart_process, "start-process");
10311   /* Target PROGRAM is the third argument.  */
10312   Fput (Qstart_process, Qtarget_idx, make_number (2));
10313
10314   DEFSYM (Qopen_network_stream, "open-network-stream");
10315   /* Target SERVICE is the fourth argument.  */
10316   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10317
10318   DEFSYM (Qcoding_system, "coding-system");
10319   DEFSYM (Qcoding_aliases, "coding-aliases");
10320
10321   DEFSYM (Qeol_type, "eol-type");
10322   DEFSYM (Qunix, "unix");
10323   DEFSYM (Qdos, "dos");
10324
10325   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10326   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10327   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10328   DEFSYM (Qdefault_char, "default-char");
10329   DEFSYM (Qundecided, "undecided");
10330   DEFSYM (Qno_conversion, "no-conversion");
10331   DEFSYM (Qraw_text, "raw-text");
10332
10333   DEFSYM (Qiso_2022, "iso-2022");
10334
10335   DEFSYM (Qutf_8, "utf-8");
10336   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10337
10338   DEFSYM (Qutf_16, "utf-16");
10339   DEFSYM (Qbig, "big");
10340   DEFSYM (Qlittle, "little");
10341
10342   DEFSYM (Qshift_jis, "shift-jis");
10343   DEFSYM (Qbig5, "big5");
10344
10345   DEFSYM (Qcoding_system_p, "coding-system-p");
10346
10347   DEFSYM (Qcoding_system_error, "coding-system-error");
10348   Fput (Qcoding_system_error, Qerror_conditions,
10349         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10350   Fput (Qcoding_system_error, Qerror_message,
10351         make_pure_c_string ("Invalid coding system"));
10352
10353   /* Intern this now in case it isn't already done.
10354      Setting this variable twice is harmless.
10355      But don't staticpro it here--that is done in alloc.c.  */
10356   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10357
10358   DEFSYM (Qtranslation_table, "translation-table");
10359   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10360   DEFSYM (Qtranslation_table_id, "translation-table-id");
10361   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10362   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10363
10364   DEFSYM (Qvalid_codes, "valid-codes");
10365
10366   DEFSYM (Qemacs_mule, "emacs-mule");
10367
10368   DEFSYM (QCcategory, ":category");
10369   DEFSYM (QCmnemonic, ":mnemonic");
10370   DEFSYM (QCdefault_char, ":default-char");
10371   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10372   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10373   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10374   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10375   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10376
10377   Vcoding_category_table
10378     = Fmake_vector (make_number (coding_category_max), Qnil);
10379   staticpro (&Vcoding_category_table);
10380   /* Followings are target of code detection.  */
10381   ASET (Vcoding_category_table, coding_category_iso_7,
10382         intern_c_string ("coding-category-iso-7"));
10383   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10384         intern_c_string ("coding-category-iso-7-tight"));
10385   ASET (Vcoding_category_table, coding_category_iso_8_1,
10386         intern_c_string ("coding-category-iso-8-1"));
10387   ASET (Vcoding_category_table, coding_category_iso_8_2,
10388         intern_c_string ("coding-category-iso-8-2"));
10389   ASET (Vcoding_category_table, coding_category_iso_7_else,
10390         intern_c_string ("coding-category-iso-7-else"));
10391   ASET (Vcoding_category_table, coding_category_iso_8_else,
10392         intern_c_string ("coding-category-iso-8-else"));
10393   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10394         intern_c_string ("coding-category-utf-8-auto"));
10395   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10396         intern_c_string ("coding-category-utf-8"));
10397   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10398         intern_c_string ("coding-category-utf-8-sig"));
10399   ASET (Vcoding_category_table, coding_category_utf_16_be,
10400         intern_c_string ("coding-category-utf-16-be"));
10401   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10402         intern_c_string ("coding-category-utf-16-auto"));
10403   ASET (Vcoding_category_table, coding_category_utf_16_le,
10404         intern_c_string ("coding-category-utf-16-le"));
10405   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10406         intern_c_string ("coding-category-utf-16-be-nosig"));
10407   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10408         intern_c_string ("coding-category-utf-16-le-nosig"));
10409   ASET (Vcoding_category_table, coding_category_charset,
10410         intern_c_string ("coding-category-charset"));
10411   ASET (Vcoding_category_table, coding_category_sjis,
10412         intern_c_string ("coding-category-sjis"));
10413   ASET (Vcoding_category_table, coding_category_big5,
10414         intern_c_string ("coding-category-big5"));
10415   ASET (Vcoding_category_table, coding_category_ccl,
10416         intern_c_string ("coding-category-ccl"));
10417   ASET (Vcoding_category_table, coding_category_emacs_mule,
10418         intern_c_string ("coding-category-emacs-mule"));
10419   /* Followings are NOT target of code detection.  */
10420   ASET (Vcoding_category_table, coding_category_raw_text,
10421         intern_c_string ("coding-category-raw-text"));
10422   ASET (Vcoding_category_table, coding_category_undecided,
10423         intern_c_string ("coding-category-undecided"));
10424
10425   DEFSYM (Qinsufficient_source, "insufficient-source");
10426   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10427   DEFSYM (Qinvalid_source, "invalid-source");
10428   DEFSYM (Qinterrupted, "interrupted");
10429   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10430   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10431
10432   defsubr (&Scoding_system_p);
10433   defsubr (&Sread_coding_system);
10434   defsubr (&Sread_non_nil_coding_system);
10435   defsubr (&Scheck_coding_system);
10436   defsubr (&Sdetect_coding_region);
10437   defsubr (&Sdetect_coding_string);
10438   defsubr (&Sfind_coding_systems_region_internal);
10439   defsubr (&Sunencodable_char_position);
10440   defsubr (&Scheck_coding_systems_region);
10441   defsubr (&Sdecode_coding_region);
10442   defsubr (&Sencode_coding_region);
10443   defsubr (&Sdecode_coding_string);
10444   defsubr (&Sencode_coding_string);
10445   defsubr (&Sdecode_sjis_char);
10446   defsubr (&Sencode_sjis_char);
10447   defsubr (&Sdecode_big5_char);
10448   defsubr (&Sencode_big5_char);
10449   defsubr (&Sset_terminal_coding_system_internal);
10450   defsubr (&Sset_safe_terminal_coding_system_internal);
10451   defsubr (&Sterminal_coding_system);
10452   defsubr (&Sset_keyboard_coding_system_internal);
10453   defsubr (&Skeyboard_coding_system);
10454   defsubr (&Sfind_operation_coding_system);
10455   defsubr (&Sset_coding_system_priority);
10456   defsubr (&Sdefine_coding_system_internal);
10457   defsubr (&Sdefine_coding_system_alias);
10458   defsubr (&Scoding_system_put);
10459   defsubr (&Scoding_system_base);
10460   defsubr (&Scoding_system_plist);
10461   defsubr (&Scoding_system_aliases);
10462   defsubr (&Scoding_system_eol_type);
10463   defsubr (&Scoding_system_priority_list);
10464
10465   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10466                doc: /* List of coding systems.
10467
10468 Do not alter the value of this variable manually.  This variable should be
10469 updated by the functions `define-coding-system' and
10470 `define-coding-system-alias'.  */);
10471   Vcoding_system_list = Qnil;
10472
10473   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10474                doc: /* Alist of coding system names.
10475 Each element is one element list of coding system name.
10476 This variable is given to `completing-read' as COLLECTION argument.
10477
10478 Do not alter the value of this variable manually.  This variable should be
10479 updated by the functions `make-coding-system' and
10480 `define-coding-system-alias'.  */);
10481   Vcoding_system_alist = Qnil;
10482
10483   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10484                doc: /* List of coding-categories (symbols) ordered by priority.
10485
10486 On detecting a coding system, Emacs tries code detection algorithms
10487 associated with each coding-category one by one in this order.  When
10488 one algorithm agrees with a byte sequence of source text, the coding
10489 system bound to the corresponding coding-category is selected.
10490
10491 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10492   {
10493     int i;
10494
10495     Vcoding_category_list = Qnil;
10496     for (i = coding_category_max - 1; i >= 0; i--)
10497       Vcoding_category_list
10498         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10499                  Vcoding_category_list);
10500   }
10501
10502   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10503                doc: /* Specify the coding system for read operations.
10504 It is useful to bind this variable with `let', but do not set it globally.
10505 If the value is a coding system, it is used for decoding on read operation.
10506 If not, an appropriate element is used from one of the coding system alists.
10507 There are three such tables: `file-coding-system-alist',
10508 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10509   Vcoding_system_for_read = Qnil;
10510
10511   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10512                doc: /* Specify the coding system for write operations.
10513 Programs bind this variable with `let', but you should not set it globally.
10514 If the value is a coding system, it is used for encoding of output,
10515 when writing it to a file and when sending it to a file or subprocess.
10516
10517 If this does not specify a coding system, an appropriate element
10518 is used from one of the coding system alists.
10519 There are three such tables: `file-coding-system-alist',
10520 `process-coding-system-alist', and `network-coding-system-alist'.
10521 For output to files, if the above procedure does not specify a coding system,
10522 the value of `buffer-file-coding-system' is used.  */);
10523   Vcoding_system_for_write = Qnil;
10524
10525   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10526                doc: /*
10527 Coding system used in the latest file or process I/O.  */);
10528   Vlast_coding_system_used = Qnil;
10529
10530   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10531                doc: /*
10532 Error status of the last code conversion.
10533
10534 When an error was detected in the last code conversion, this variable
10535 is set to one of the following symbols.
10536   `insufficient-source'
10537   `inconsistent-eol'
10538   `invalid-source'
10539   `interrupted'
10540   `insufficient-memory'
10541 When no error was detected, the value doesn't change.  So, to check
10542 the error status of a code conversion by this variable, you must
10543 explicitly set this variable to nil before performing code
10544 conversion.  */);
10545   Vlast_code_conversion_error = Qnil;
10546
10547   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10548                doc: /*
10549 *Non-nil means always inhibit code conversion of end-of-line format.
10550 See info node `Coding Systems' and info node `Text and Binary' concerning
10551 such conversion.  */);
10552   inhibit_eol_conversion = 0;
10553
10554   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10555                doc: /*
10556 Non-nil means process buffer inherits coding system of process output.
10557 Bind it to t if the process output is to be treated as if it were a file
10558 read from some filesystem.  */);
10559   inherit_process_coding_system = 0;
10560
10561   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10562                doc: /*
10563 Alist to decide a coding system to use for a file I/O operation.
10564 The format is ((PATTERN . VAL) ...),
10565 where PATTERN is a regular expression matching a file name,
10566 VAL is a coding system, a cons of coding systems, or a function symbol.
10567 If VAL is a coding system, it is used for both decoding and encoding
10568 the file contents.
10569 If VAL is a cons of coding systems, the car part is used for decoding,
10570 and the cdr part is used for encoding.
10571 If VAL is a function symbol, the function must return a coding system
10572 or a cons of coding systems which are used as above.  The function is
10573 called with an argument that is a list of the arguments with which
10574 `find-operation-coding-system' was called.  If the function can't decide
10575 a coding system, it can return `undecided' so that the normal
10576 code-detection is performed.
10577
10578 See also the function `find-operation-coding-system'
10579 and the variable `auto-coding-alist'.  */);
10580   Vfile_coding_system_alist = Qnil;
10581
10582   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10583                doc: /*
10584 Alist to decide a coding system to use for a process I/O operation.
10585 The format is ((PATTERN . VAL) ...),
10586 where PATTERN is a regular expression matching a program name,
10587 VAL is a coding system, a cons of coding systems, or a function symbol.
10588 If VAL is a coding system, it is used for both decoding what received
10589 from the program and encoding what sent to the program.
10590 If VAL is a cons of coding systems, the car part is used for decoding,
10591 and the cdr part is used for encoding.
10592 If VAL is a function symbol, the function must return a coding system
10593 or a cons of coding systems which are used as above.
10594
10595 See also the function `find-operation-coding-system'.  */);
10596   Vprocess_coding_system_alist = Qnil;
10597
10598   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10599                doc: /*
10600 Alist to decide a coding system to use for a network I/O operation.
10601 The format is ((PATTERN . VAL) ...),
10602 where PATTERN is a regular expression matching a network service name
10603 or is a port number to connect to,
10604 VAL is a coding system, a cons of coding systems, or a function symbol.
10605 If VAL is a coding system, it is used for both decoding what received
10606 from the network stream and encoding what sent to the network stream.
10607 If VAL is a cons of coding systems, the car part is used for decoding,
10608 and the cdr part is used for encoding.
10609 If VAL is a function symbol, the function must return a coding system
10610 or a cons of coding systems which are used as above.
10611
10612 See also the function `find-operation-coding-system'.  */);
10613   Vnetwork_coding_system_alist = Qnil;
10614
10615   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10616                doc: /* Coding system to use with system messages.
10617 Also used for decoding keyboard input on X Window system.  */);
10618   Vlocale_coding_system = Qnil;
10619
10620   /* The eol mnemonics are reset in startup.el system-dependently.  */
10621   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10622                doc: /*
10623 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10624   eol_mnemonic_unix = make_pure_c_string (":");
10625
10626   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10627                doc: /*
10628 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10629   eol_mnemonic_dos = make_pure_c_string ("\\");
10630
10631   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10632                doc: /*
10633 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10634   eol_mnemonic_mac = make_pure_c_string ("/");
10635
10636   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10637                doc: /*
10638 *String displayed in mode line when end-of-line format is not yet determined.  */);
10639   eol_mnemonic_undecided = make_pure_c_string (":");
10640
10641   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10642                doc: /*
10643 *Non-nil enables character translation while encoding and decoding.  */);
10644   Venable_character_translation = Qt;
10645
10646   DEFVAR_LISP ("standard-translation-table-for-decode",
10647                Vstandard_translation_table_for_decode,
10648                doc: /* Table for translating characters while decoding.  */);
10649   Vstandard_translation_table_for_decode = Qnil;
10650
10651   DEFVAR_LISP ("standard-translation-table-for-encode",
10652                Vstandard_translation_table_for_encode,
10653                doc: /* Table for translating characters while encoding.  */);
10654   Vstandard_translation_table_for_encode = Qnil;
10655
10656   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10657                doc: /* Alist of charsets vs revision numbers.
10658 While encoding, if a charset (car part of an element) is found,
10659 designate it with the escape sequence identifying revision (cdr part
10660 of the element).  */);
10661   Vcharset_revision_table = Qnil;
10662
10663   DEFVAR_LISP ("default-process-coding-system",
10664                Vdefault_process_coding_system,
10665                doc: /* Cons of coding systems used for process I/O by default.
10666 The car part is used for decoding a process output,
10667 the cdr part is used for encoding a text to be sent to a process.  */);
10668   Vdefault_process_coding_system = Qnil;
10669
10670   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10671                doc: /*
10672 Table of extra Latin codes in the range 128..159 (inclusive).
10673 This is a vector of length 256.
10674 If Nth element is non-nil, the existence of code N in a file
10675 \(or output of subprocess) doesn't prevent it to be detected as
10676 a coding system of ISO 2022 variant which has a flag
10677 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10678 or reading output of a subprocess.
10679 Only 128th through 159th elements have a meaning.  */);
10680   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10681
10682   DEFVAR_LISP ("select-safe-coding-system-function",
10683                Vselect_safe_coding_system_function,
10684                doc: /*
10685 Function to call to select safe coding system for encoding a text.
10686
10687 If set, this function is called to force a user to select a proper
10688 coding system which can encode the text in the case that a default
10689 coding system used in each operation can't encode the text.  The
10690 function should take care that the buffer is not modified while
10691 the coding system is being selected.
10692
10693 The default value is `select-safe-coding-system' (which see).  */);
10694   Vselect_safe_coding_system_function = Qnil;
10695
10696   DEFVAR_BOOL ("coding-system-require-warning",
10697                coding_system_require_warning,
10698                doc: /* Internal use only.
10699 If non-nil, on writing a file, `select-safe-coding-system-function' is
10700 called even if `coding-system-for-write' is non-nil.  The command
10701 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10702   coding_system_require_warning = 0;
10703
10704
10705   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10706                inhibit_iso_escape_detection,
10707                doc: /*
10708 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10709
10710 When Emacs reads text, it tries to detect how the text is encoded.
10711 This code detection is sensitive to escape sequences.  If Emacs sees
10712 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10713 of the ISO2022 encodings, and decodes text by the corresponding coding
10714 system (e.g. `iso-2022-7bit').
10715
10716 However, there may be a case that you want to read escape sequences in
10717 a file as is.  In such a case, you can set this variable to non-nil.
10718 Then the code detection will ignore any escape sequences, and no text is
10719 detected as encoded in some ISO-2022 encoding.  The result is that all
10720 escape sequences become visible in a buffer.
10721
10722 The default value is nil, and it is strongly recommended not to change
10723 it.  That is because many Emacs Lisp source files that contain
10724 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10725 in Emacs's distribution, and they won't be decoded correctly on
10726 reading if you suppress escape sequence detection.
10727
10728 The other way to read escape sequences in a file without decoding is
10729 to explicitly specify some coding system that doesn't use ISO-2022
10730 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10731   inhibit_iso_escape_detection = 0;
10732
10733   DEFVAR_BOOL ("inhibit-null-byte-detection",
10734                inhibit_null_byte_detection,
10735                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10736 By default, Emacs treats it as binary data, and does not attempt to
10737 decode it.  The effect is as if you specified `no-conversion' for
10738 reading that text.
10739
10740 Set this to non-nil when a regular text happens to include null bytes.
10741 Examples are Index nodes of Info files and null-byte delimited output
10742 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10743 decode text as usual.  */);
10744   inhibit_null_byte_detection = 0;
10745
10746   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10747                doc: /* Char table for translating self-inserting characters.
10748 This is applied to the result of input methods, not their input.
10749 See also `keyboard-translate-table'.
10750
10751 Use of this variable for character code unification was rendered
10752 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10753 internal character representation.  */);
10754     Vtranslation_table_for_input = Qnil;
10755
10756   {
10757     Lisp_Object args[coding_arg_max];
10758     Lisp_Object plist[16];
10759     int i;
10760
10761     for (i = 0; i < coding_arg_max; i++)
10762       args[i] = Qnil;
10763
10764     plist[0] = intern_c_string (":name");
10765     plist[1] = args[coding_arg_name] = Qno_conversion;
10766     plist[2] = intern_c_string (":mnemonic");
10767     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10768     plist[4] = intern_c_string (":coding-type");
10769     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10770     plist[6] = intern_c_string (":ascii-compatible-p");
10771     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10772     plist[8] = intern_c_string (":default-char");
10773     plist[9] = args[coding_arg_default_char] = make_number (0);
10774     plist[10] = intern_c_string (":for-unibyte");
10775     plist[11] = args[coding_arg_for_unibyte] = Qt;
10776     plist[12] = intern_c_string (":docstring");
10777     plist[13] = make_pure_c_string ("Do no conversion.\n\
10778 \n\
10779 When you visit a file with this coding, the file is read into a\n\
10780 unibyte buffer as is, thus each byte of a file is treated as a\n\
10781 character.");
10782     plist[14] = intern_c_string (":eol-type");
10783     plist[15] = args[coding_arg_eol_type] = Qunix;
10784     args[coding_arg_plist] = Flist (16, plist);
10785     Fdefine_coding_system_internal (coding_arg_max, args);
10786
10787     plist[1] = args[coding_arg_name] = Qundecided;
10788     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10789     plist[5] = args[coding_arg_coding_type] = Qundecided;
10790     /* This is already set.
10791        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10792     plist[8] = intern_c_string (":charset-list");
10793     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10794     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10795     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10796     plist[15] = args[coding_arg_eol_type] = Qnil;
10797     args[coding_arg_plist] = Flist (16, plist);
10798     Fdefine_coding_system_internal (coding_arg_max, args);
10799   }
10800
10801   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10802
10803   {
10804     int i;
10805
10806     for (i = 0; i < coding_category_max; i++)
10807       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10808   }
10809 #if defined (DOS_NT)
10810   system_eol_type = Qdos;
10811 #else
10812   system_eol_type = Qunix;
10813 #endif
10814   staticpro (&system_eol_type);
10815 }
10816
10817 char *
10818 emacs_strerror (int error_number)
10819 {
10820   char *str;
10821
10822   synchronize_system_messages_locale ();
10823   str = strerror (error_number);
10824
10825   if (! NILP (Vlocale_coding_system))
10826     {
10827       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10828                                                       Vlocale_coding_system,
10829                                                       0);
10830       str = SSDATA (dec);
10831     }
10832
10833   return str;
10834 }
10835
10836 #endif /* emacs */