code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory;
 315
 316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 318 Lisp_Object Qstart_process, Qopen_network_stream;
 319 Lisp_Object Qtarget_idx;
 320
 321 Lisp_Object Vselect_safe_coding_system_function;
 322
 323 /* Mnemonic string for each format of end-of-line.  */
 324 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 325 /* Mnemonic string to indicate format of end-of-line is not yet
 326    decided.  */
 327 Lisp_Object eol_mnemonic_undecided;
 328
 329 #ifdef emacs
 330
 331 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 332
 333 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 334
 335 /* Coding system emacs-mule and raw-text are for converting only
 336    end-of-line format.  */
 337 Lisp_Object Qemacs_mule, Qraw_text;
 338
 339 /* Coding-systems are handed between Emacs Lisp programs and C internal
 340    routines by the following three variables.  */
 341 /* Coding-system for reading files and receiving data from process.  */
 342 Lisp_Object Vcoding_system_for_read;
 343 /* Coding-system for writing files and sending data to process.  */
 344 Lisp_Object Vcoding_system_for_write;
 345 /* Coding-system actually used in the latest I/O.  */
 346 Lisp_Object Vlast_coding_system_used;
 347
 348 /* A vector of length 256 which contains information about special
 349    Latin codes (especially for dealing with Microsoft codes).  */
 350 Lisp_Object Vlatin_extra_code_table;
 351
 352 /* Flag to inhibit code conversion of end-of-line format.  */
 353 int inhibit_eol_conversion;
 354
 355 /* Flag to inhibit ISO2022 escape sequence detection.  */
 356 int inhibit_iso_escape_detection;
 357
 358 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 359 int inherit_process_coding_system;
 360
 361 /* Coding system to be used to encode text for terminal display.  */
 362 struct coding_system terminal_coding;
 363
 364 /* Coding system to be used to encode text for terminal display when
 365    terminal coding system is nil.  */
 366 struct coding_system safe_terminal_coding;
 367
 368 /* Coding system of what is sent from terminal keyboard.  */
 369 struct coding_system keyboard_coding;
 370
 371 Lisp_Object Vfile_coding_system_alist;
 372 Lisp_Object Vprocess_coding_system_alist;
 373 Lisp_Object Vnetwork_coding_system_alist;
 374
 375 Lisp_Object Vlocale_coding_system;
 376
 377 #endif /* emacs */
 378
 379 /* Flag to tell if we look up translation table on character code
 380    conversion.  */
 381 Lisp_Object Venable_character_translation;
 382 /* Standard translation table to look up on decoding (reading).  */
 383 Lisp_Object Vstandard_translation_table_for_decode;
 384 /* Standard translation table to look up on encoding (writing).  */
 385 Lisp_Object Vstandard_translation_table_for_encode;
 386
 387 Lisp_Object Qtranslation_table;
 388 Lisp_Object Qtranslation_table_id;
 389 Lisp_Object Qtranslation_table_for_decode;
 390 Lisp_Object Qtranslation_table_for_encode;
 391
 392 /* Alist of charsets vs revision number.  */
 393 static Lisp_Object Vcharset_revision_table;
 394
 395 /* Default coding systems used for process I/O.  */
 396 Lisp_Object Vdefault_process_coding_system;
 397
 398 /* Global flag to tell that we can't call post-read-conversion and
 399    pre-write-conversion functions.  Usually the value is zero, but it
 400    is set to 1 temporarily while such functions are running.  This is
 401    to avoid infinite recursive call.  */
 402 static int inhibit_pre_post_conversion;
 403
 404 /* Two special coding systems.  */
 405 Lisp_Object Vsjis_coding_system;
 406 Lisp_Object Vbig5_coding_system;
 407
 408
 409 static int detect_coding_utf_8 P_ ((struct coding_system *,
 410                                     struct coding_detection_info *info));
 411 static void decode_coding_utf_8 P_ ((struct coding_system *));
 412 static int encode_coding_utf_8 P_ ((struct coding_system *));
 413
 414 static int detect_coding_utf_16 P_ ((struct coding_system *,
 415                                      struct coding_detection_info *info));
 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
 418
 419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 420                                        struct coding_detection_info *info));
 421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 423
 424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 425                                          struct coding_detection_info *info));
 426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 428
 429 static int detect_coding_sjis P_ ((struct coding_system *,
 430                                    struct coding_detection_info *info));
 431 static void decode_coding_sjis P_ ((struct coding_system *));
 432 static int encode_coding_sjis P_ ((struct coding_system *));
 433
 434 static int detect_coding_big5 P_ ((struct coding_system *,
 435                                    struct coding_detection_info *info));
 436 static void decode_coding_big5 P_ ((struct coding_system *));
 437 static int encode_coding_big5 P_ ((struct coding_system *));
 438
 439 static int detect_coding_ccl P_ ((struct coding_system *,
 440                                   struct coding_detection_info *info));
 441 static void decode_coding_ccl P_ ((struct coding_system *));
 442 static int encode_coding_ccl P_ ((struct coding_system *));
 443
 444 static void decode_coding_raw_text P_ ((struct coding_system *));
 445 static int encode_coding_raw_text P_ ((struct coding_system *));
 446
 447
 448 /* ISO2022 section */
 449
 450 #define CODING_ISO_INITIAL(coding, reg)                 \
 451   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 452                      coding_attr_iso_initial),          \
 453                reg)))
 454
 455
 456 #define CODING_ISO_REQUEST(coding, charset_id)  \
 457   ((charset_id <= (coding)->max_charset_id      \
 458     ? (coding)->safe_charsets[charset_id]       \
 459     : -1))
 460
 461
 462 #define CODING_ISO_FLAGS(coding)        \
 463   ((coding)->spec.iso_2022.flags)
 464 #define CODING_ISO_DESIGNATION(coding, reg)     \
 465   ((coding)->spec.iso_2022.current_designation[reg])
 466 #define CODING_ISO_INVOCATION(coding, plane)    \
 467   ((coding)->spec.iso_2022.current_invocation[plane])
 468 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 469   ((coding)->spec.iso_2022.single_shifting)
 470 #define CODING_ISO_BOL(coding)  \
 471   ((coding)->spec.iso_2022.bol)
 472 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 473   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 474
 475 /* Control characters of ISO2022.  */
 476                         /* code */      /* function */
 477 #define ISO_CODE_LF     0x0A            /* line-feed */
 478 #define ISO_CODE_CR     0x0D            /* carriage-return */
 479 #define ISO_CODE_SO     0x0E            /* shift-out */
 480 #define ISO_CODE_SI     0x0F            /* shift-in */
 481 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 482 #define ISO_CODE_ESC    0x1B            /* escape */
 483 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 484 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 485 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 486
 487 /* All code (1-byte) of ISO2022 is classified into one of the
 488    followings.  */
 489 enum iso_code_class_type
 490   {
 491     ISO_control_0,              /* Control codes in the range
 492                                    0x00..0x1F and 0x7F, except for the
 493                                    following 5 codes.  */
 494     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 495     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 496     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 497     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 498     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 499     ISO_control_1,              /* Control codes in the range
 500                                    0x80..0x9F, except for the
 501                                    following 3 codes.  */
 502     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 503     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 504     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 505     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 506     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 507     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 508     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 509   };
 510
 511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 512     `iso-flags' attribute of an iso2022 coding system.  */
 513
 514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 515    instead of the correct short-form sequence (e.g. ESC $ A).  */
 516 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 517
 518 /* If set, reset graphic planes and registers at end-of-line to the
 519    initial state.  */
 520 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 521
 522 /* If set, reset graphic planes and registers before any control
 523    characters to the initial state.  */
 524 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 525
 526 /* If set, encode by 7-bit environment.  */
 527 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 528
 529 /* If set, use locking-shift function.  */
 530 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 531
 532 /* If set, use single-shift function.  Overwrite
 533    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 534 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 535
 536 /* If set, use designation escape sequence.  */
 537 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 538
 539 /* If set, produce revision number sequence.  */
 540 #define CODING_ISO_FLAG_REVISION        0x0080
 541
 542 /* If set, produce ISO6429's direction specifying sequence.  */
 543 #define CODING_ISO_FLAG_DIRECTION       0x0100
 544
 545 /* If set, assume designation states are reset at beginning of line on
 546    output.  */
 547 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 548
 549 /* If set, designation sequence should be placed at beginning of line
 550    on output.  */
 551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 552
 553 /* If set, do not encode unsafe charactes on output.  */
 554 #define CODING_ISO_FLAG_SAFE            0x0800
 555
 556 /* If set, extra latin codes (128..159) are accepted as a valid code
 557    on input.  */
 558 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 559
 560 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 561
 562 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 563
 564 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 565
 566 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 567
 568 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 569
 570 /* A character to be produced on output if encoding of the original
 571    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 573
 574
 575 /* UTF-16 section */
 576 #define CODING_UTF_16_BOM(coding)       \
 577   ((coding)->spec.utf_16.bom)
 578
 579 #define CODING_UTF_16_ENDIAN(coding)    \
 580   ((coding)->spec.utf_16.endian)
 581
 582 #define CODING_UTF_16_SURROGATE(coding) \
 583   ((coding)->spec.utf_16.surrogate)
 584
 585
 586 /* CCL section */
 587 #define CODING_CCL_DECODER(coding)      \
 588   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 589 #define CODING_CCL_ENCODER(coding)      \
 590   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 591 #define CODING_CCL_VALIDS(coding)                                          \
 592   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 593    ->data)
 594
 595 /* Index for each coding category in `coding_categories' */
 596
 597 enum coding_category
 598   {
 599     coding_category_iso_7,
 600     coding_category_iso_7_tight,
 601     coding_category_iso_8_1,
 602     coding_category_iso_8_2,
 603     coding_category_iso_7_else,
 604     coding_category_iso_8_else,
 605     coding_category_utf_8,
 606     coding_category_utf_16_auto,
 607     coding_category_utf_16_be,
 608     coding_category_utf_16_le,
 609     coding_category_utf_16_be_nosig,
 610     coding_category_utf_16_le_nosig,
 611     coding_category_charset,
 612     coding_category_sjis,
 613     coding_category_big5,
 614     coding_category_ccl,
 615     coding_category_emacs_mule,
 616     /* All above are targets of code detection.  */
 617     coding_category_raw_text,
 618     coding_category_undecided,
 619     coding_category_max
 620   };
 621
 622 /* Definitions of flag bits used in detect_coding_XXXX.  */
 623 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 624 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 625 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 626 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 627 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 628 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 629 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 630 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 631 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 632 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 633 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 634 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 635 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 636 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 637 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 638 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 639 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 640 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 641
 642 /* This value is returned if detect_coding_mask () find nothing other
 643    than ASCII characters.  */
 644 #define CATEGORY_MASK_ANY               \
 645   (CATEGORY_MASK_ISO_7                  \
 646    | CATEGORY_MASK_ISO_7_TIGHT          \
 647    | CATEGORY_MASK_ISO_8_1              \
 648    | CATEGORY_MASK_ISO_8_2              \
 649    | CATEGORY_MASK_ISO_7_ELSE           \
 650    | CATEGORY_MASK_ISO_8_ELSE           \
 651    | CATEGORY_MASK_UTF_8                \
 652    | CATEGORY_MASK_UTF_16_BE            \
 653    | CATEGORY_MASK_UTF_16_LE            \
 654    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 655    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 656    | CATEGORY_MASK_CHARSET              \
 657    | CATEGORY_MASK_SJIS                 \
 658    | CATEGORY_MASK_BIG5                 \
 659    | CATEGORY_MASK_CCL                  \
 660    | CATEGORY_MASK_EMACS_MULE)
 661
 662
 663 #define CATEGORY_MASK_ISO_7BIT \
 664   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 665
 666 #define CATEGORY_MASK_ISO_8BIT \
 667   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 668
 669 #define CATEGORY_MASK_ISO_ELSE \
 670   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 671
 672 #define CATEGORY_MASK_ISO_ESCAPE        \
 673   (CATEGORY_MASK_ISO_7                  \
 674    | CATEGORY_MASK_ISO_7_TIGHT          \
 675    | CATEGORY_MASK_ISO_7_ELSE           \
 676    | CATEGORY_MASK_ISO_8_ELSE)
 677
 678 #define CATEGORY_MASK_ISO       \
 679   (  CATEGORY_MASK_ISO_7BIT     \
 680      | CATEGORY_MASK_ISO_8BIT   \
 681      | CATEGORY_MASK_ISO_ELSE)
 682
 683 #define CATEGORY_MASK_UTF_16            \
 684   (CATEGORY_MASK_UTF_16_BE              \
 685    | CATEGORY_MASK_UTF_16_LE            \
 686    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 687    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 688
 689
 690 /* List of symbols `coding-category-xxx' ordered by priority.  This
 691    variable is exposed to Emacs Lisp.  */
 692 static Lisp_Object Vcoding_category_list;
 693
 694 /* Table of coding categories (Lisp symbols).  This variable is for
 695    internal use oly.  */
 696 static Lisp_Object Vcoding_category_table;
 697
 698 /* Table of coding-categories ordered by priority.  */
 699 static enum coding_category coding_priorities[coding_category_max];
 700
 701 /* Nth element is a coding context for the coding system bound to the
 702    Nth coding category.  */
 703 static struct coding_system coding_categories[coding_category_max];
 704
 705 /*** Commonly used macros and functions ***/
 706
 707 #ifndef min
 708 #define min(a, b) ((a) < (b) ? (a) : (b))
 709 #endif
 710 #ifndef max
 711 #define max(a, b) ((a) > (b) ? (a) : (b))
 712 #endif
 713
 714 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 715   do {                                                          \
 716     attrs = CODING_ID_ATTRS (coding->id);                       \
 717     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 718     if (VECTORP (eol_type))                                     \
 719       eol_type = Qunix;                                         \
 720     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 721   } while (0)
 722
 723
 724 /* Safely get one byte from the source text pointed by SRC which ends
 725    at SRC_END, and set C to that byte.  If there are not enough bytes
 726    in the source, it jumps to `no_more_source'.  The caller
 727    should declare and set these variables appropriately in advance:
 728         src, src_end, multibytep
 729 */
 730
 731 #define ONE_MORE_BYTE(c)                                        \
 732   do {                                                          \
 733     if (src == src_end)                                         \
 734       {                                                         \
 735         if (src_base < src)                                     \
 736           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 737         goto no_more_source;                                    \
 738       }                                                         \
 739     c = *src++;                                                 \
 740     if (multibytep && (c & 0x80))                               \
 741       {                                                         \
 742         if ((c & 0xFE) != 0xC0)                                 \
 743           error ("Undecodable char found");                     \
 744         c = ((c & 1) << 6) | *src++;                            \
 745       }                                                         \
 746     consumed_chars++;                                           \
 747   } while (0)
 748
 749
 750 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 751   do {                                          \
 752     c = *src++;                                 \
 753     if (multibytep && (c & 0x80))               \
 754       {                                         \
 755         if ((c & 0xFE) != 0xC0)                 \
 756           error ("Undecodable char found");     \
 757         c = ((c & 1) << 6) | *src++;            \
 758       }                                         \
 759     consumed_chars++;                           \
 760   } while (0)
 761
 762
 763 /* Store a byte C in the place pointed by DST and increment DST to the
 764    next free point, and increment PRODUCED_CHARS.  The caller should
 765    assure that C is 0..127, and declare and set the variable `dst'
 766    appropriately in advance.
 767 */
 768
 769
 770 #define EMIT_ONE_ASCII_BYTE(c)  \
 771   do {                          \
 772     produced_chars++;           \
 773     *dst++ = (c);               \
 774   } while (0)
 775
 776
 777 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 778
 779 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 780   do {                                  \
 781     produced_chars += 2;                \
 782     *dst++ = (c1), *dst++ = (c2);       \
 783   } while (0)
 784
 785
 786 /* Store a byte C in the place pointed by DST and increment DST to the
 787    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 788    nonzero, store in an appropriate multibyte from.  The caller should
 789    declare and set the variables `dst' and `multibytep' appropriately
 790    in advance.  */
 791
 792 #define EMIT_ONE_BYTE(c)                \
 793   do {                                  \
 794     produced_chars++;                   \
 795     if (multibytep)                     \
 796       {                                 \
 797         int ch = (c);                   \
 798         if (ch >= 0x80)                 \
 799           ch = BYTE8_TO_CHAR (ch);      \
 800         CHAR_STRING_ADVANCE (ch, dst);  \
 801       }                                 \
 802     else                                \
 803       *dst++ = (c);                     \
 804   } while (0)
 805
 806
 807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 808
 809 #define EMIT_TWO_BYTES(c1, c2)          \
 810   do {                                  \
 811     produced_chars += 2;                \
 812     if (multibytep)                     \
 813       {                                 \
 814         int ch;                         \
 815                                         \
 816         ch = (c1);                      \
 817         if (ch >= 0x80)                 \
 818           ch = BYTE8_TO_CHAR (ch);      \
 819         CHAR_STRING_ADVANCE (ch, dst);  \
 820         ch = (c2);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824       }                                 \
 825     else                                \
 826       {                                 \
 827         *dst++ = (c1);                  \
 828         *dst++ = (c2);                  \
 829       }                                 \
 830   } while (0)
 831
 832
 833 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 834   do {                                  \
 835     EMIT_ONE_BYTE (c1);                 \
 836     EMIT_TWO_BYTES (c2, c3);            \
 837   } while (0)
 838
 839
 840 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 841   do {                                          \
 842     EMIT_TWO_BYTES (c1, c2);                    \
 843     EMIT_TWO_BYTES (c3, c4);                    \
 844   } while (0)
 845
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     charset_map_loaded = 0;                                                  \
 850     c = DECODE_CHAR (charset, code);                                         \
 851     if (charset_map_loaded)                                                  \
 852       {                                                                      \
 853         unsigned char *orig = coding->source;                                \
 854         EMACS_INT offset;                                                    \
 855                                                                              \
 856         coding_set_source (coding);                                          \
 857         offset = coding->source - orig;                                      \
 858         src += offset;                                                       \
 859         src_base += offset;                                                  \
 860         src_end += offset;                                                   \
 861       }                                                                      \
 862   } while (0)
 863
 864
 865 #define ASSURE_DESTINATION(bytes)                               \
 866   do {                                                          \
 867     if (dst + (bytes) >= dst_end)                               \
 868       {                                                         \
 869         int more_bytes = charbuf_end - charbuf + (bytes);       \
 870                                                                 \
 871         dst = alloc_destination (coding, more_bytes, dst);      \
 872         dst_end = coding->destination + coding->dst_bytes;      \
 873       }                                                         \
 874   } while (0)
 875
 876
 877
 878 static void
 879 coding_set_source (coding)
 880      struct coding_system *coding;
 881 {
 882   if (BUFFERP (coding->src_object))
 883     {
 884       struct buffer *buf = XBUFFER (coding->src_object);
 885
 886       if (coding->src_pos < 0)
 887         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 888       else
 889         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 890     }
 891   else if (STRINGP (coding->src_object))
 892     {
 893       coding->source = (XSTRING (coding->src_object)->data
 894                         + coding->src_pos_byte);
 895     }
 896   else
 897     /* Otherwise, the source is C string and is never relocated
 898        automatically.  Thus we don't have to update anything.  */
 899     ;
 900 }
 901
 902 static void
 903 coding_set_destination (coding)
 904      struct coding_system *coding;
 905 {
 906   if (BUFFERP (coding->dst_object))
 907     {
 908       if (coding->src_pos < 0)
 909         {
 910           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 911           coding->dst_bytes = (GAP_END_ADDR
 912                                - (coding->src_bytes - coding->consumed)
 913                                - coding->destination);
 914         }
 915       else
 916         {
 917           /* We are sure that coding->dst_pos_byte is before the gap
 918              of the buffer. */
 919           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 920                                  + coding->dst_pos_byte - 1);
 921           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 922                                - coding->destination);
 923         }
 924     }
 925   else
 926     /* Otherwise, the destination is C string and is never relocated
 927        automatically.  Thus we don't have to update anything.  */
 928     ;
 929 }
 930
 931
 932 static void
 933 coding_alloc_by_realloc (coding, bytes)
 934      struct coding_system *coding;
 935      EMACS_INT bytes;
 936 {
 937   coding->destination = (unsigned char *) xrealloc (coding->destination,
 938                                                     coding->dst_bytes + bytes);
 939   coding->dst_bytes += bytes;
 940 }
 941
 942 static void
 943 coding_alloc_by_making_gap (coding, bytes)
 944      struct coding_system *coding;
 945      EMACS_INT bytes;
 946 {
 947   if (BUFFERP (coding->dst_object)
 948       && EQ (coding->src_object, coding->dst_object))
 949     {
 950       EMACS_INT add = coding->src_bytes - coding->consumed;
 951
 952       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 953       make_gap (bytes);
 954       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 955     }
 956   else
 957     {
 958       Lisp_Object this_buffer;
 959
 960       this_buffer = Fcurrent_buffer ();
 961       set_buffer_internal (XBUFFER (coding->dst_object));
 962       make_gap (bytes);
 963       set_buffer_internal (XBUFFER (this_buffer));
 964     }
 965 }
 966
 967
 968 static unsigned char *
 969 alloc_destination (coding, nbytes, dst)
 970      struct coding_system *coding;
 971      int nbytes;
 972      unsigned char *dst;
 973 {
 974   EMACS_INT offset = dst - coding->destination;
 975
 976   if (BUFFERP (coding->dst_object))
 977     coding_alloc_by_making_gap (coding, nbytes);
 978   else
 979     coding_alloc_by_realloc (coding, nbytes);
 980   coding->result = CODING_RESULT_SUCCESS;
 981   coding_set_destination (coding);
 982   dst = coding->destination + offset;
 983   return dst;
 984 }
 985
 986 /** Macros for annotations.  */
 987
 988 /* Maximum length of annotation data (sum of annotations for
 989    composition and charset).  */
 990 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
 991
 992 /* An annotation data is stored in the array coding->charbuf in this
 993    format:
 994      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
 995    LENGTH is the number of elements in the annotation.
 996    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
 997    FROM and TO specify the range of text annotated.  They are relative
 998    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
 999
1000    The format of the following elements depend on ANNOTATION_MASK.
1001
1002    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1003    follows:
1004      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1005    METHOD is one of enum composition_method.
1006    Optionnal COMPOSITION-COMPONENTS are characters and composition
1007    rules.
1008
1009    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1010    follows.  */
1011
1012 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1013   do {                                                  \
1014     *(buf)++ = -(len);                                  \
1015     *(buf)++ = (mask);                                  \
1016     *(buf)++ = (from);                                  \
1017     *(buf)++ = (to);                                    \
1018     coding->annotated = 1;                              \
1019   } while (0);
1020
1021 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1022   do {                                                                        \
1023     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1024     *buf++ = method;                                                          \
1025   } while (0)
1026
1027
1028 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1029   do {                                                                    \
1030     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1031     *buf++ = id;                                                          \
1032   } while (0)
1033
1034 \f
1035 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1036
1037
1038
1039 \f
1040 /*** 3. UTF-8 ***/
1041
1042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1043    Check if a text is encoded in UTF-8.  If it is, return 1, else
1044    return 0.  */
1045
1046 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1047 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1048 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1049 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1050 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1051 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1052
1053 static int
1054 detect_coding_utf_8 (coding, detect_info)
1055      struct coding_system *coding;
1056      struct coding_detection_info *detect_info;
1057 {
1058   unsigned char *src = coding->source, *src_base = src;
1059   unsigned char *src_end = coding->source + coding->src_bytes;
1060   int multibytep = coding->src_multibyte;
1061   int consumed_chars = 0;
1062   int found = 0;
1063   int incomplete;
1064
1065   detect_info->checked |= CATEGORY_MASK_UTF_8;
1066   /* A coding system of this category is always ASCII compatible.  */
1067   src += coding->head_ascii;
1068
1069   while (1)
1070     {
1071       int c, c1, c2, c3, c4;
1072
1073       incomplete = 0;
1074       ONE_MORE_BYTE (c);
1075       if (UTF_8_1_OCTET_P (c))
1076         continue;
1077       incomplete = 1;
1078       ONE_MORE_BYTE (c1);
1079       if (! UTF_8_EXTRA_OCTET_P (c1))
1080         break;
1081       if (UTF_8_2_OCTET_LEADING_P (c))
1082         {
1083           found = CATEGORY_MASK_UTF_8;
1084           continue;
1085         }
1086       ONE_MORE_BYTE (c2);
1087       if (! UTF_8_EXTRA_OCTET_P (c2))
1088         break;
1089       if (UTF_8_3_OCTET_LEADING_P (c))
1090         {
1091           found = CATEGORY_MASK_UTF_8;
1092           continue;
1093         }
1094       ONE_MORE_BYTE (c3);
1095       if (! UTF_8_EXTRA_OCTET_P (c3))
1096         break;
1097       if (UTF_8_4_OCTET_LEADING_P (c))
1098         {
1099           found = CATEGORY_MASK_UTF_8;
1100           continue;
1101         }
1102       ONE_MORE_BYTE (c4);
1103       if (! UTF_8_EXTRA_OCTET_P (c4))
1104         break;
1105       if (UTF_8_5_OCTET_LEADING_P (c))
1106         {
1107           found = CATEGORY_MASK_UTF_8;
1108           continue;
1109         }
1110       break;
1111     }
1112   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1113   return 0;
1114
1115  no_more_source:
1116   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1117     {
1118       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1119       return 0;
1120     }
1121   detect_info->found |= found;
1122   return 1;
1123 }
1124
1125
1126 static void
1127 decode_coding_utf_8 (coding)
1128      struct coding_system *coding;
1129 {
1130   unsigned char *src = coding->source + coding->consumed;
1131   unsigned char *src_end = coding->source + coding->src_bytes;
1132   unsigned char *src_base;
1133   int *charbuf = coding->charbuf;
1134   int *charbuf_end = charbuf + coding->charbuf_size;
1135   int consumed_chars = 0, consumed_chars_base;
1136   int multibytep = coding->src_multibyte;
1137   Lisp_Object attr, eol_type, charset_list;
1138
1139   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1140
1141   while (1)
1142     {
1143       int c, c1, c2, c3, c4, c5;
1144
1145       src_base = src;
1146       consumed_chars_base = consumed_chars;
1147
1148       if (charbuf >= charbuf_end)
1149         break;
1150
1151       ONE_MORE_BYTE (c1);
1152       if (UTF_8_1_OCTET_P(c1))
1153         {
1154           c = c1;
1155           if (c == '\r')
1156             {
1157               if (EQ (eol_type, Qdos))
1158                 {
1159                   if (src == src_end)
1160                     {
1161                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1162                       goto no_more_source;
1163                     }
1164                   if (*src == '\n')
1165                     ONE_MORE_BYTE (c);
1166                 }
1167               else if (EQ (eol_type, Qmac))
1168                 c = '\n';
1169             }
1170         }
1171       else
1172         {
1173           ONE_MORE_BYTE (c2);
1174           if (! UTF_8_EXTRA_OCTET_P (c2))
1175             goto invalid_code;
1176           if (UTF_8_2_OCTET_LEADING_P (c1))
1177             {
1178               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1179               /* Reject overlong sequences here and below.  Encoders
1180                  producing them are incorrect, they can be misleading,
1181                  and they mess up read/write invariance.  */
1182               if (c < 128)
1183                 goto invalid_code;
1184             }
1185           else
1186             {
1187               ONE_MORE_BYTE (c3);
1188               if (! UTF_8_EXTRA_OCTET_P (c3))
1189                 goto invalid_code;
1190               if (UTF_8_3_OCTET_LEADING_P (c1))
1191                 {
1192                   c = (((c1 & 0xF) << 12)
1193                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1194                   if (c < 0x800
1195                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1196                     goto invalid_code;
1197                 }
1198               else
1199                 {
1200                   ONE_MORE_BYTE (c4);
1201                   if (! UTF_8_EXTRA_OCTET_P (c4))
1202                     goto invalid_code;
1203                   if (UTF_8_4_OCTET_LEADING_P (c1))
1204                     {
1205                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1206                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1207                     if (c < 0x10000)
1208                       goto invalid_code;
1209                     }
1210                   else
1211                     {
1212                       ONE_MORE_BYTE (c5);
1213                       if (! UTF_8_EXTRA_OCTET_P (c5))
1214                         goto invalid_code;
1215                       if (UTF_8_5_OCTET_LEADING_P (c1))
1216                         {
1217                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1218                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1219                                | (c5 & 0x3F));
1220                           if ((c > MAX_CHAR) || (c < 0x200000))
1221                             goto invalid_code;
1222                         }
1223                       else
1224                         goto invalid_code;
1225                     }
1226                 }
1227             }
1228         }
1229
1230       *charbuf++ = c;
1231       continue;
1232
1233     invalid_code:
1234       src = src_base;
1235       consumed_chars = consumed_chars_base;
1236       ONE_MORE_BYTE (c);
1237       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1238       coding->errors++;
1239     }
1240
1241  no_more_source:
1242   coding->consumed_char += consumed_chars_base;
1243   coding->consumed = src_base - coding->source;
1244   coding->charbuf_used = charbuf - coding->charbuf;
1245 }
1246
1247
1248 static int
1249 encode_coding_utf_8 (coding)
1250      struct coding_system *coding;
1251 {
1252   int multibytep = coding->dst_multibyte;
1253   int *charbuf = coding->charbuf;
1254   int *charbuf_end = charbuf + coding->charbuf_used;
1255   unsigned char *dst = coding->destination + coding->produced;
1256   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1257   int produced_chars = 0;
1258   int c;
1259
1260   if (multibytep)
1261     {
1262       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1263
1264       while (charbuf < charbuf_end)
1265         {
1266           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1267
1268           ASSURE_DESTINATION (safe_room);
1269           c = *charbuf++;
1270           if (CHAR_BYTE8_P (c))
1271             {
1272               c = CHAR_TO_BYTE8 (c);
1273               EMIT_ONE_BYTE (c);
1274             }
1275           else
1276             {
1277               CHAR_STRING_ADVANCE (c, pend);
1278               for (p = str; p < pend; p++)
1279                 EMIT_ONE_BYTE (*p);
1280             }
1281         }
1282     }
1283   else
1284     {
1285       int safe_room = MAX_MULTIBYTE_LENGTH;
1286
1287       while (charbuf < charbuf_end)
1288         {
1289           ASSURE_DESTINATION (safe_room);
1290           c = *charbuf++;
1291           dst += CHAR_STRING (c, dst);
1292           produced_chars++;
1293         }
1294     }
1295   coding->result = CODING_RESULT_SUCCESS;
1296   coding->produced_char += produced_chars;
1297   coding->produced = dst - coding->destination;
1298   return 0;
1299 }
1300
1301
1302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1303    Check if a text is encoded in one of UTF-16 based coding systems.
1304    If it is, return 1, else return 0.  */
1305
1306 #define UTF_16_HIGH_SURROGATE_P(val) \
1307   (((val) & 0xFC00) == 0xD800)
1308
1309 #define UTF_16_LOW_SURROGATE_P(val) \
1310   (((val) & 0xFC00) == 0xDC00)
1311
1312 #define UTF_16_INVALID_P(val)   \
1313   (((val) == 0xFFFE)            \
1314    || ((val) == 0xFFFF)         \
1315    || UTF_16_LOW_SURROGATE_P (val))
1316
1317
1318 static int
1319 detect_coding_utf_16 (coding, detect_info)
1320      struct coding_system *coding;
1321      struct coding_detection_info *detect_info;
1322 {
1323   unsigned char *src = coding->source, *src_base = src;
1324   unsigned char *src_end = coding->source + coding->src_bytes;
1325   int multibytep = coding->src_multibyte;
1326   int consumed_chars = 0;
1327   int c1, c2;
1328
1329   detect_info->checked |= CATEGORY_MASK_UTF_16;
1330
1331   if (coding->mode & CODING_MODE_LAST_BLOCK
1332       && (coding->src_bytes & 1))
1333     {
1334       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1335       return 0;
1336     }
1337   ONE_MORE_BYTE (c1);
1338   ONE_MORE_BYTE (c2);
1339
1340   if ((c1 == 0xFF) && (c2 == 0xFE))
1341     {
1342       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1343                              | CATEGORY_MASK_UTF_16_AUTO);
1344       detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1345     }
1346   else if ((c1 == 0xFE) && (c2 == 0xFF))
1347     {
1348       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1349                              | CATEGORY_MASK_UTF_16_AUTO);
1350       detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1351     }
1352  no_more_source:
1353   return 1;
1354 }
1355
1356 static void
1357 decode_coding_utf_16 (coding)
1358      struct coding_system *coding;
1359 {
1360   unsigned char *src = coding->source + coding->consumed;
1361   unsigned char *src_end = coding->source + coding->src_bytes;
1362   unsigned char *src_base;
1363   int *charbuf = coding->charbuf;
1364   int *charbuf_end = charbuf + coding->charbuf_size;
1365   int consumed_chars = 0, consumed_chars_base;
1366   int multibytep = coding->src_multibyte;
1367   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1368   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1369   int surrogate = CODING_UTF_16_SURROGATE (coding);
1370   Lisp_Object attr, eol_type, charset_list;
1371
1372   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1373
1374   if (bom == utf_16_with_bom)
1375     {
1376       int c, c1, c2;
1377
1378       src_base = src;
1379       ONE_MORE_BYTE (c1);
1380       ONE_MORE_BYTE (c2);
1381       c = (c1 << 8) | c2;
1382
1383       if (endian == utf_16_big_endian
1384           ? c != 0xFEFF : c != 0xFFFE)
1385         {
1386           /* The first two bytes are not BOM.  Treat them as bytes
1387              for a normal character.  */
1388           src = src_base;
1389           coding->errors++;
1390         }
1391       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1392     }
1393   else if (bom == utf_16_detect_bom)
1394     {
1395       /* We have already tried to detect BOM and failed in
1396          detect_coding.  */
1397       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1398     }
1399
1400   while (1)
1401     {
1402       int c, c1, c2;
1403
1404       src_base = src;
1405       consumed_chars_base = consumed_chars;
1406
1407       if (charbuf + 2 >= charbuf_end)
1408         break;
1409
1410       ONE_MORE_BYTE (c1);
1411       ONE_MORE_BYTE (c2);
1412       c = (endian == utf_16_big_endian
1413            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1414       if (surrogate)
1415         {
1416           if (! UTF_16_LOW_SURROGATE_P (c))
1417             {
1418               if (endian == utf_16_big_endian)
1419                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1420               else
1421                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1422               *charbuf++ = c1;
1423               *charbuf++ = c2;
1424               coding->errors++;
1425               if (UTF_16_HIGH_SURROGATE_P (c))
1426                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1427               else
1428                 *charbuf++ = c;
1429             }
1430           else
1431             {
1432               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1433               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1434               *charbuf++ = c;
1435             }
1436         }
1437       else
1438         {
1439           if (UTF_16_HIGH_SURROGATE_P (c))
1440             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1441           else
1442             *charbuf++ = c;
1443         }
1444     }
1445
1446  no_more_source:
1447   coding->consumed_char += consumed_chars_base;
1448   coding->consumed = src_base - coding->source;
1449   coding->charbuf_used = charbuf - coding->charbuf;
1450 }
1451
1452 static int
1453 encode_coding_utf_16 (coding)
1454      struct coding_system *coding;
1455 {
1456   int multibytep = coding->dst_multibyte;
1457   int *charbuf = coding->charbuf;
1458   int *charbuf_end = charbuf + coding->charbuf_used;
1459   unsigned char *dst = coding->destination + coding->produced;
1460   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1461   int safe_room = 8;
1462   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1463   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1464   int produced_chars = 0;
1465   Lisp_Object attrs, eol_type, charset_list;
1466   int c;
1467
1468   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1469
1470   if (bom != utf_16_without_bom)
1471     {
1472       ASSURE_DESTINATION (safe_room);
1473       if (big_endian)
1474         EMIT_TWO_BYTES (0xFE, 0xFF);
1475       else
1476         EMIT_TWO_BYTES (0xFF, 0xFE);
1477       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1478     }
1479
1480   while (charbuf < charbuf_end)
1481     {
1482       ASSURE_DESTINATION (safe_room);
1483       c = *charbuf++;
1484       if (c >= MAX_UNICODE_CHAR)
1485         c = coding->default_char;
1486
1487       if (c < 0x10000)
1488         {
1489           if (big_endian)
1490             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1491           else
1492             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1493         }
1494       else
1495         {
1496           int c1, c2;
1497
1498           c -= 0x10000;
1499           c1 = (c >> 10) + 0xD800;
1500           c2 = (c & 0x3FF) + 0xDC00;
1501           if (big_endian)
1502             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1503           else
1504             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1505         }
1506     }
1507   coding->result = CODING_RESULT_SUCCESS;
1508   coding->produced = dst - coding->destination;
1509   coding->produced_char += produced_chars;
1510   return 0;
1511 }
1512
1513 \f
1514 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1515
1516 /* Emacs' internal format for representation of multiple character
1517    sets is a kind of multi-byte encoding, i.e. characters are
1518    represented by variable-length sequences of one-byte codes.
1519
1520    ASCII characters and control characters (e.g. `tab', `newline') are
1521    represented by one-byte sequences which are their ASCII codes, in
1522    the range 0x00 through 0x7F.
1523
1524    8-bit characters of the range 0x80..0x9F are represented by
1525    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1526    code + 0x20).
1527
1528    8-bit characters of the range 0xA0..0xFF are represented by
1529    one-byte sequences which are their 8-bit code.
1530
1531    The other characters are represented by a sequence of `base
1532    leading-code', optional `extended leading-code', and one or two
1533    `position-code's.  The length of the sequence is determined by the
1534    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1535    whereas extended leading-code and position-code take the range 0xA0
1536    through 0xFF.  See `charset.h' for more details about leading-code
1537    and position-code.
1538
1539    --- CODE RANGE of Emacs' internal format ---
1540    character set        range
1541    -------------        -----
1542    ascii                0x00..0x7F
1543    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1544    eight-bit-graphic    0xA0..0xBF
1545    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1546    ---------------------------------------------
1547
1548    As this is the internal character representation, the format is
1549    usually not used externally (i.e. in a file or in a data sent to a
1550    process).  But, it is possible to have a text externally in this
1551    format (i.e. by encoding by the coding system `emacs-mule').
1552
1553    In that case, a sequence of one-byte codes has a slightly different
1554    form.
1555
1556    At first, all characters in eight-bit-control are represented by
1557    one-byte sequences which are their 8-bit code.
1558
1559    Next, character composition data are represented by the byte
1560    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1561    where,
1562         METHOD is 0xF0 plus one of composition method (enum
1563         composition_method),
1564
1565         BYTES is 0xA0 plus a byte length of this composition data,
1566
1567         CHARS is 0x20 plus a number of characters composed by this
1568         data,
1569
1570         COMPONENTs are characters of multibye form or composition
1571         rules encoded by two-byte of ASCII codes.
1572
1573    In addition, for backward compatibility, the following formats are
1574    also recognized as composition data on decoding.
1575
1576    0x80 MSEQ ...
1577    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1578
1579    Here,
1580         MSEQ is a multibyte form but in these special format:
1581           ASCII: 0xA0 ASCII_CODE+0x80,
1582           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1583         RULE is a one byte code of the range 0xA0..0xF0 that
1584         represents a composition rule.
1585   */
1586
1587 char emacs_mule_bytes[256];
1588
1589 int
1590 emacs_mule_char (coding, src, nbytes, nchars, id)
1591      struct coding_system *coding;
1592      unsigned char *src;
1593      int *nbytes, *nchars, *id;
1594 {
1595   unsigned char *src_end = coding->source + coding->src_bytes;
1596   int multibytep = coding->src_multibyte;
1597   unsigned char *src_base = src;
1598   struct charset *charset;
1599   unsigned code;
1600   int c;
1601   int consumed_chars = 0;
1602
1603   ONE_MORE_BYTE (c);
1604   switch (emacs_mule_bytes[c])
1605     {
1606     case 2:
1607       if (! (charset = emacs_mule_charset[c]))
1608         goto invalid_code;
1609       ONE_MORE_BYTE (c);
1610       code = c & 0x7F;
1611       break;
1612
1613     case 3:
1614       if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1615           || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1616         {
1617           ONE_MORE_BYTE (c);
1618           if (! (charset = emacs_mule_charset[c]))
1619             goto invalid_code;
1620           ONE_MORE_BYTE (c);
1621           code = c & 0x7F;
1622         }
1623       else
1624         {
1625           if (! (charset = emacs_mule_charset[c]))
1626             goto invalid_code;
1627           ONE_MORE_BYTE (c);
1628           code = (c & 0x7F) << 8;
1629           ONE_MORE_BYTE (c);
1630           code |= c & 0x7F;
1631         }
1632       break;
1633
1634     case 4:
1635       ONE_MORE_BYTE (c);
1636       if (! (charset = emacs_mule_charset[c]))
1637         goto invalid_code;
1638       ONE_MORE_BYTE (c);
1639       code = (c & 0x7F) << 8;
1640       ONE_MORE_BYTE (c);
1641       code |= c & 0x7F;
1642       break;
1643
1644     case 1:
1645       code = c;
1646       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1647                                  ? charset_ascii : charset_eight_bit);
1648       break;
1649
1650     default:
1651       abort ();
1652     }
1653   c = DECODE_CHAR (charset, code);
1654   if (c < 0)
1655     goto invalid_code;
1656   *nbytes = src - src_base;
1657   *nchars = consumed_chars;
1658   if (id)
1659     *id = charset->id;
1660   return c;
1661
1662  no_more_source:
1663   return -2;
1664
1665  invalid_code:
1666   return -1;
1667 }
1668
1669
1670 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1671    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1672    else return 0.  */
1673
1674 static int
1675 detect_coding_emacs_mule (coding, detect_info)
1676      struct coding_system *coding;
1677      struct coding_detection_info *detect_info;
1678 {
1679   unsigned char *src = coding->source, *src_base = src;
1680   unsigned char *src_end = coding->source + coding->src_bytes;
1681   int multibytep = coding->src_multibyte;
1682   int consumed_chars = 0;
1683   int c;
1684   int found = 0;
1685   int incomplete;
1686
1687   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1688   /* A coding system of this category is always ASCII compatible.  */
1689   src += coding->head_ascii;
1690
1691   while (1)
1692     {
1693       incomplete = 0;
1694       ONE_MORE_BYTE (c);
1695       incomplete = 1;
1696
1697       if (c == 0x80)
1698         {
1699           /* Perhaps the start of composite character.  We simple skip
1700              it because analyzing it is too heavy for detecting.  But,
1701              at least, we check that the composite character
1702              constitues of more than 4 bytes.  */
1703           unsigned char *src_base;
1704
1705         repeat:
1706           src_base = src;
1707           do
1708             {
1709               ONE_MORE_BYTE (c);
1710             }
1711           while (c >= 0xA0);
1712
1713           if (src - src_base <= 4)
1714             break;
1715           found = CATEGORY_MASK_EMACS_MULE;
1716           if (c == 0x80)
1717             goto repeat;
1718         }
1719
1720       if (c < 0x80)
1721         {
1722           if (c < 0x20
1723               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1724             break;
1725         }
1726       else
1727         {
1728           unsigned char *src_base = src - 1;
1729
1730           do
1731             {
1732               ONE_MORE_BYTE (c);
1733             }
1734           while (c >= 0xA0);
1735           if (src - src_base != emacs_mule_bytes[*src_base])
1736             break;
1737           found = CATEGORY_MASK_EMACS_MULE;
1738         }
1739     }
1740   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1741   return 0;
1742
1743  no_more_source:
1744   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1745     {
1746       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1747       return 0;
1748     }
1749   detect_info->found |= found;
1750   return 1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1755
1756 /* Decode a character represented as a component of composition
1757    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1758    update SRC to the head of next character (or an encoded composition
1759    rule).  If SRC doesn't points a composition component, set C to -1.
1760    If SRC points an invalid byte sequence, global exit by a return
1761    value 0.  */
1762
1763 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1764   if (1)                                                        \
1765     {                                                           \
1766       int c;                                                    \
1767       int nbytes, nchars;                                       \
1768                                                                 \
1769       if (src == src_end)                                       \
1770         break;                                                  \
1771       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1772       if (c < 0)                                                \
1773         {                                                       \
1774           if (c == -2)                                          \
1775             break;                                              \
1776           goto invalid_code;                                    \
1777         }                                                       \
1778       *buf++ = c;                                               \
1779       src += nbytes;                                            \
1780       consumed_chars += nchars;                                 \
1781     }                                                           \
1782   else
1783
1784
1785 /* Decode a composition rule represented as a component of composition
1786    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1787    and increment BUF.  If SRC points an invalid byte sequence, set C
1788    to -1.  */
1789
1790 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1791   do {                                                  \
1792     int c, gref, nref;                                  \
1793                                                         \
1794     if (src >= src_end)                                 \
1795       goto invalid_code;                                \
1796     ONE_MORE_BYTE_NO_CHECK (c);                         \
1797     c -= 0x20;                                          \
1798     if (c < 0 || c >= 81)                               \
1799       goto invalid_code;                                \
1800                                                         \
1801     gref = c / 9, nref = c % 9;                         \
1802     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1803   } while (0)
1804
1805
1806 /* Decode a composition rule represented as a component of composition
1807    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1808    and increment BUF.  If SRC points an invalid byte sequence, set C
1809    to -1.  */
1810
1811 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1812   do {                                                  \
1813     int gref, nref;                                     \
1814                                                         \
1815     if (src + 1>= src_end)                              \
1816       goto invalid_code;                                \
1817     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1818     gref -= 0x20;                                       \
1819     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1820     nref -= 0x20;                                       \
1821     if (gref < 0 || gref >= 81                          \
1822         || nref < 0 || nref >= 81)                      \
1823       goto invalid_code;                                \
1824     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1825   } while (0)
1826
1827
1828 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1829   do {                                                                  \
1830     /* Emacs 21 style format.  The first three bytes at SRC are         \
1831        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1832        the byte length of this composition information, CHARS is the    \
1833        number of characters composed by this composition.  */           \
1834     enum composition_method method = c - 0xF2;                          \
1835     int *charbuf_base = charbuf;                                        \
1836     int from, to;                                                       \
1837     int consumed_chars_limit;                                           \
1838     int nbytes, nchars;                                                 \
1839                                                                         \
1840     ONE_MORE_BYTE (c);                                                  \
1841     nbytes = c - 0xA0;                                                  \
1842     if (nbytes < 3)                                                     \
1843       goto invalid_code;                                                \
1844     ONE_MORE_BYTE (c);                                                  \
1845     nchars = c - 0xA0;                                                  \
1846     from = coding->produced + char_offset;                              \
1847     to = from + nchars;                                                 \
1848     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1849     consumed_chars_limit = consumed_chars_base + nbytes;                \
1850     if (method != COMPOSITION_RELATIVE)                                 \
1851       {                                                                 \
1852         int i = 0;                                                      \
1853         while (consumed_chars < consumed_chars_limit)                   \
1854           {                                                             \
1855             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1856               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1857             else                                                        \
1858               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1859             i++;                                                        \
1860           }                                                             \
1861         if (consumed_chars < consumed_chars_limit)                      \
1862           goto invalid_code;                                            \
1863         charbuf_base[0] -= i;                                           \
1864       }                                                                 \
1865   } while (0)
1866
1867
1868 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1869   do {                                                          \
1870     /* Emacs 20 style format for relative composition.  */      \
1871     /* Store multibyte form of characters to be composed.  */   \
1872     enum composition_method method = COMPOSITION_RELATIVE;      \
1873     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1874     int *buf = components;                                      \
1875     int i, j;                                                   \
1876     int from, to;                                               \
1877                                                                 \
1878     src = src_base;                                             \
1879     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1880     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1881       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1882     if (i < 2)                                                  \
1883       goto invalid_code;                                        \
1884     from = coding->produced_char + char_offset;                 \
1885     to = from + i;                                              \
1886     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1887     for (j = 0; j < i; j++)                                     \
1888       *charbuf++ = components[j];                               \
1889   } while (0)
1890
1891
1892 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1893   do {                                                          \
1894     /* Emacs 20 style format for rule-base composition.  */     \
1895     /* Store multibyte form of characters to be composed.  */   \
1896     enum composition_method method = COMPOSITION_WITH_RULE;     \
1897     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1898     int *buf = components;                                      \
1899     int i, j;                                                   \
1900     int from, to;                                               \
1901                                                                 \
1902     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1903     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1904       {                                                         \
1905         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1906         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1907       }                                                         \
1908     if (i < 1 || (buf - components) % 2 == 0)                   \
1909       goto invalid_code;                                        \
1910     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1911       goto no_more_source;                                      \
1912     from = coding->produced_char + char_offset;                 \
1913     to = from + i;                                              \
1914     ADD_COMPOSITION_DATA (buf, from, to, method);               \
1915     for (j = 0; j < i; j++)                                     \
1916       *charbuf++ = components[j];                               \
1917     for (j = 0; j < i; j += 2)                                  \
1918       *charbuf++ = components[j];                               \
1919   } while (0)
1920
1921
1922 static void
1923 decode_coding_emacs_mule (coding)
1924      struct coding_system *coding;
1925 {
1926   unsigned char *src = coding->source + coding->consumed;
1927   unsigned char *src_end = coding->source + coding->src_bytes;
1928   unsigned char *src_base;
1929   int *charbuf = coding->charbuf;
1930   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1931   int consumed_chars = 0, consumed_chars_base;
1932   int multibytep = coding->src_multibyte;
1933   Lisp_Object attrs, eol_type, charset_list;
1934   int char_offset = coding->produced_char;
1935   int last_offset = char_offset;
1936   int last_id = charset_ascii;
1937
1938   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1939
1940   while (1)
1941     {
1942       int c;
1943
1944       src_base = src;
1945       consumed_chars_base = consumed_chars;
1946
1947       if (charbuf >= charbuf_end)
1948         break;
1949
1950       ONE_MORE_BYTE (c);
1951
1952       if (c < 0x80)
1953         {
1954           if (c == '\r')
1955             {
1956               if (EQ (eol_type, Qdos))
1957                 {
1958                   if (src == src_end)
1959                     {
1960                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1961                       goto no_more_source;
1962                     }
1963                   if (*src == '\n')
1964                     ONE_MORE_BYTE (c);
1965                 }
1966               else if (EQ (eol_type, Qmac))
1967                 c = '\n';
1968             }
1969           *charbuf++ = c;
1970           char_offset++;
1971         }
1972       else if (c == 0x80)
1973         {
1974           ONE_MORE_BYTE (c);
1975           if (c - 0xF2 >= COMPOSITION_RELATIVE
1976               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1977             DECODE_EMACS_MULE_21_COMPOSITION (c);
1978           else if (c < 0xC0)
1979             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1980           else if (c == 0xFF)
1981             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1982           else
1983             goto invalid_code;
1984         }
1985       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1986         {
1987           int nbytes, nchars;
1988           int id;
1989
1990           src = src_base;
1991           consumed_chars = consumed_chars_base;
1992           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1993           if (c < 0)
1994             {
1995               if (c == -2)
1996                 break;
1997               goto invalid_code;
1998             }
1999           if (last_id != id)
2000             {
2001               if (last_id != charset_ascii)
2002                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2003               last_id = id;
2004               last_offset = char_offset;
2005             }
2006           *charbuf++ = c;
2007           src += nbytes;
2008           consumed_chars += nchars;
2009           char_offset++;
2010         }
2011       continue;
2012
2013     invalid_code:
2014       src = src_base;
2015       consumed_chars = consumed_chars_base;
2016       ONE_MORE_BYTE (c);
2017       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2018       char_offset++;
2019       coding->errors++;
2020     }
2021
2022  no_more_source:
2023   if (last_id != charset_ascii)
2024     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2025   coding->consumed_char += consumed_chars_base;
2026   coding->consumed = src_base - coding->source;
2027   coding->charbuf_used = charbuf - coding->charbuf;
2028 }
2029
2030
2031 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2032   do {                                          \
2033     if (id < 0xA0)                              \
2034       codes[0] = id, codes[1] = 0;              \
2035     else if (id < 0xE0)                         \
2036       codes[0] = 0x9A, codes[1] = id;           \
2037     else if (id < 0xF0)                         \
2038       codes[0] = 0x9B, codes[1] = id;           \
2039     else if (id < 0xF5)                         \
2040       codes[0] = 0x9C, codes[1] = id;           \
2041     else                                        \
2042       codes[0] = 0x9D, codes[1] = id;           \
2043   } while (0);
2044
2045
2046 static int
2047 encode_coding_emacs_mule (coding)
2048      struct coding_system *coding;
2049 {
2050   int multibytep = coding->dst_multibyte;
2051   int *charbuf = coding->charbuf;
2052   int *charbuf_end = charbuf + coding->charbuf_used;
2053   unsigned char *dst = coding->destination + coding->produced;
2054   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2055   int safe_room = 8;
2056   int produced_chars = 0;
2057   Lisp_Object attrs, eol_type, charset_list;
2058   int c;
2059   int preferred_charset_id = -1;
2060
2061   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2062
2063   while (charbuf < charbuf_end)
2064     {
2065       ASSURE_DESTINATION (safe_room);
2066       c = *charbuf++;
2067
2068       if (c < 0)
2069         {
2070           /* Handle an annotation.  */
2071           switch (*charbuf)
2072             {
2073             case CODING_ANNOTATE_COMPOSITION_MASK:
2074               /* Not yet implemented.  */
2075               break;
2076             case CODING_ANNOTATE_CHARSET_MASK:
2077               preferred_charset_id = charbuf[3];
2078               if (preferred_charset_id >= 0
2079                   && NILP (Fmemq (make_number (preferred_charset_id),
2080                                   charset_list)))
2081                 preferred_charset_id = -1;
2082               break;
2083             default:
2084               abort ();
2085             }
2086           charbuf += -c - 1;
2087           continue;
2088         }
2089
2090       if (ASCII_CHAR_P (c))
2091         EMIT_ONE_ASCII_BYTE (c);
2092       else if (CHAR_BYTE8_P (c))
2093         {
2094           c = CHAR_TO_BYTE8 (c);
2095           EMIT_ONE_BYTE (c);
2096         }
2097       else
2098         {
2099           struct charset *charset;
2100           unsigned code;
2101           int dimension;
2102           int emacs_mule_id;
2103           unsigned char leading_codes[2];
2104
2105           if (preferred_charset_id >= 0)
2106             {
2107               charset = CHARSET_FROM_ID (preferred_charset_id);
2108               if (! CHAR_CHARSET_P (c, charset))
2109                 charset = char_charset (c, charset_list, NULL);
2110             }
2111           else
2112             charset = char_charset (c, charset_list, &code);
2113           if (! charset)
2114             {
2115               c = coding->default_char;
2116               if (ASCII_CHAR_P (c))
2117                 {
2118                   EMIT_ONE_ASCII_BYTE (c);
2119                   continue;
2120                 }
2121               charset = char_charset (c, charset_list, &code);
2122             }
2123           dimension = CHARSET_DIMENSION (charset);
2124           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2125           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2126           EMIT_ONE_BYTE (leading_codes[0]);
2127           if (leading_codes[1])
2128             EMIT_ONE_BYTE (leading_codes[1]);
2129           if (dimension == 1)
2130             EMIT_ONE_BYTE (code);
2131           else
2132             {
2133               EMIT_ONE_BYTE (code >> 8);
2134               EMIT_ONE_BYTE (code & 0xFF);
2135             }
2136         }
2137     }
2138   coding->result = CODING_RESULT_SUCCESS;
2139   coding->produced_char += produced_chars;
2140   coding->produced = dst - coding->destination;
2141   return 0;
2142 }
2143
2144 \f
2145 /*** 7. ISO2022 handlers ***/
2146
2147 /* The following note describes the coding system ISO2022 briefly.
2148    Since the intention of this note is to help understand the
2149    functions in this file, some parts are NOT ACCURATE or are OVERLY
2150    SIMPLIFIED.  For thorough understanding, please refer to the
2151    original document of ISO2022.  This is equivalent to the standard
2152    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2153
2154    ISO2022 provides many mechanisms to encode several character sets
2155    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2156    is encoded using bytes less than 128.  This may make the encoded
2157    text a little bit longer, but the text passes more easily through
2158    several types of gateway, some of which strip off the MSB (Most
2159    Significant Bit).
2160
2161    There are two kinds of character sets: control character sets and
2162    graphic character sets.  The former contain control characters such
2163    as `newline' and `escape' to provide control functions (control
2164    functions are also provided by escape sequences).  The latter
2165    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2166    two control character sets and many graphic character sets.
2167
2168    Graphic character sets are classified into one of the following
2169    four classes, according to the number of bytes (DIMENSION) and
2170    number of characters in one dimension (CHARS) of the set:
2171    - DIMENSION1_CHARS94
2172    - DIMENSION1_CHARS96
2173    - DIMENSION2_CHARS94
2174    - DIMENSION2_CHARS96
2175
2176    In addition, each character set is assigned an identification tag,
2177    unique for each set, called the "final character" (denoted as <F>
2178    hereafter).  The <F> of each character set is decided by ECMA(*)
2179    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2180    (0x30..0x3F are for private use only).
2181
2182    Note (*): ECMA = European Computer Manufacturers Association
2183
2184    Here are examples of graphic character sets [NAME(<F>)]:
2185         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2186         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2187         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2188         o DIMENSION2_CHARS96 -- none for the moment
2189
2190    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2191         C0 [0x00..0x1F] -- control character plane 0
2192         GL [0x20..0x7F] -- graphic character plane 0
2193         C1 [0x80..0x9F] -- control character plane 1
2194         GR [0xA0..0xFF] -- graphic character plane 1
2195
2196    A control character set is directly designated and invoked to C0 or
2197    C1 by an escape sequence.  The most common case is that:
2198    - ISO646's  control character set is designated/invoked to C0, and
2199    - ISO6429's control character set is designated/invoked to C1,
2200    and usually these designations/invocations are omitted in encoded
2201    text.  In a 7-bit environment, only C0 can be used, and a control
2202    character for C1 is encoded by an appropriate escape sequence to
2203    fit into the environment.  All control characters for C1 are
2204    defined to have corresponding escape sequences.
2205
2206    A graphic character set is at first designated to one of four
2207    graphic registers (G0 through G3), then these graphic registers are
2208    invoked to GL or GR.  These designations and invocations can be
2209    done independently.  The most common case is that G0 is invoked to
2210    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2211    these invocations and designations are omitted in encoded text.
2212    In a 7-bit environment, only GL can be used.
2213
2214    When a graphic character set of CHARS94 is invoked to GL, codes
2215    0x20 and 0x7F of the GL area work as control characters SPACE and
2216    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2217    be used.
2218
2219    There are two ways of invocation: locking-shift and single-shift.
2220    With locking-shift, the invocation lasts until the next different
2221    invocation, whereas with single-shift, the invocation affects the
2222    following character only and doesn't affect the locking-shift
2223    state.  Invocations are done by the following control characters or
2224    escape sequences:
2225
2226    ----------------------------------------------------------------------
2227    abbrev  function                  cntrl escape seq   description
2228    ----------------------------------------------------------------------
2229    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2230    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2231    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2232    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2233    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2234    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2235    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2236    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2237    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2238    ----------------------------------------------------------------------
2239    (*) These are not used by any known coding system.
2240
2241    Control characters for these functions are defined by macros
2242    ISO_CODE_XXX in `coding.h'.
2243
2244    Designations are done by the following escape sequences:
2245    ----------------------------------------------------------------------
2246    escape sequence      description
2247    ----------------------------------------------------------------------
2248    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2249    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2250    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2251    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2252    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2253    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2254    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2255    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2256    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2257    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2258    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2259    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2260    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2261    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2262    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2263    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2264    ----------------------------------------------------------------------
2265
2266    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2267    of dimension 1, chars 94, and final character <F>, etc...
2268
2269    Note (*): Although these designations are not allowed in ISO2022,
2270    Emacs accepts them on decoding, and produces them on encoding
2271    CHARS96 character sets in a coding system which is characterized as
2272    7-bit environment, non-locking-shift, and non-single-shift.
2273
2274    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2275    '(' must be omitted.  We refer to this as "short-form" hereafter.
2276
2277    Now you may notice that there are a lot of ways of encoding the
2278    same multilingual text in ISO2022.  Actually, there exist many
2279    coding systems such as Compound Text (used in X11's inter client
2280    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2281    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2282    localized platforms), and all of these are variants of ISO2022.
2283
2284    In addition to the above, Emacs handles two more kinds of escape
2285    sequences: ISO6429's direction specification and Emacs' private
2286    sequence for specifying character composition.
2287
2288    ISO6429's direction specification takes the following form:
2289         o CSI ']'      -- end of the current direction
2290         o CSI '0' ']'  -- end of the current direction
2291         o CSI '1' ']'  -- start of left-to-right text
2292         o CSI '2' ']'  -- start of right-to-left text
2293    The control character CSI (0x9B: control sequence introducer) is
2294    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2295
2296    Character composition specification takes the following form:
2297         o ESC '0' -- start relative composition
2298         o ESC '1' -- end composition
2299         o ESC '2' -- start rule-base composition (*)
2300         o ESC '3' -- start relative composition with alternate chars  (**)
2301         o ESC '4' -- start rule-base composition with alternate chars  (**)
2302   Since these are not standard escape sequences of any ISO standard,
2303   the use of them with these meanings is restricted to Emacs only.
2304
2305   (*) This form is used only in Emacs 20.7 and older versions,
2306   but newer versions can safely decode it.
2307   (**) This form is used only in Emacs 21.1 and newer versions,
2308   and older versions can't decode it.
2309
2310   Here's a list of example usages of these composition escape
2311   sequences (categorized by `enum composition_method').
2312
2313   COMPOSITION_RELATIVE:
2314         ESC 0 CHAR [ CHAR ] ESC 1
2315   COMPOSITION_WITH_RULE:
2316         ESC 2 CHAR [ RULE CHAR ] ESC 1
2317   COMPOSITION_WITH_ALTCHARS:
2318         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2319   COMPOSITION_WITH_RULE_ALTCHARS:
2320         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2321
2322 enum iso_code_class_type iso_code_class[256];
2323
2324 #define SAFE_CHARSET_P(coding, id)      \
2325   ((id) <= (coding)->max_charset_id     \
2326    && (coding)->safe_charsets[id] >= 0)
2327
2328
2329 #define SHIFT_OUT_OK(category)  \
2330   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2331
2332 static void
2333 setup_iso_safe_charsets (attrs)
2334      Lisp_Object attrs;
2335 {
2336   Lisp_Object charset_list, safe_charsets;
2337   Lisp_Object request;
2338   Lisp_Object reg_usage;
2339   Lisp_Object tail;
2340   int reg94, reg96;
2341   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2342   int max_charset_id;
2343
2344   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2345   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2346       && ! EQ (charset_list, Viso_2022_charset_list))
2347     {
2348       CODING_ATTR_CHARSET_LIST (attrs)
2349         = charset_list = Viso_2022_charset_list;
2350       ASET (attrs, coding_attr_safe_charsets, Qnil);
2351     }
2352
2353   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2354     return;
2355
2356   max_charset_id = 0;
2357   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2358     {
2359       int id = XINT (XCAR (tail));
2360       if (max_charset_id < id)
2361         max_charset_id = id;
2362     }
2363
2364   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2365                                 make_number (255));
2366   request = AREF (attrs, coding_attr_iso_request);
2367   reg_usage = AREF (attrs, coding_attr_iso_usage);
2368   reg94 = XINT (XCAR (reg_usage));
2369   reg96 = XINT (XCDR (reg_usage));
2370
2371   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2372     {
2373       Lisp_Object id;
2374       Lisp_Object reg;
2375       struct charset *charset;
2376
2377       id = XCAR (tail);
2378       charset = CHARSET_FROM_ID (XINT (id));
2379       reg = Fcdr (Fassq (id, request));
2380       if (! NILP (reg))
2381         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2382       else if (charset->iso_chars_96)
2383         {
2384           if (reg96 < 4)
2385             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2386         }
2387       else
2388         {
2389           if (reg94 < 4)
2390             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2391         }
2392     }
2393   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2394 }
2395
2396
2397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2398    Check if a text is encoded in one of ISO-2022 based codig systems.
2399    If it is, return 1, else return 0.  */
2400
2401 static int
2402 detect_coding_iso_2022 (coding, detect_info)
2403      struct coding_system *coding;
2404      struct coding_detection_info *detect_info;
2405 {
2406   unsigned char *src = coding->source, *src_base = src;
2407   unsigned char *src_end = coding->source + coding->src_bytes;
2408   int multibytep = coding->src_multibyte;
2409   int single_shifting = 0;
2410   int id;
2411   int c, c1;
2412   int consumed_chars = 0;
2413   int i;
2414   int rejected = 0;
2415   int found = 0;
2416
2417   detect_info->checked |= CATEGORY_MASK_ISO;
2418
2419   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2420     {
2421       struct coding_system *this = &(coding_categories[i]);
2422       Lisp_Object attrs, val;
2423
2424       attrs = CODING_ID_ATTRS (this->id);
2425       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2426           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2427         setup_iso_safe_charsets (attrs);
2428       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2429       this->max_charset_id = XSTRING (val)->size - 1;
2430       this->safe_charsets = (char *) XSTRING (val)->data;
2431     }
2432
2433   /* A coding system of this category is always ASCII compatible.  */
2434   src += coding->head_ascii;
2435
2436   while (rejected != CATEGORY_MASK_ISO)
2437     {
2438       ONE_MORE_BYTE (c);
2439       switch (c)
2440         {
2441         case ISO_CODE_ESC:
2442           if (inhibit_iso_escape_detection)
2443             break;
2444           single_shifting = 0;
2445           ONE_MORE_BYTE (c);
2446           if (c >= '(' && c <= '/')
2447             {
2448               /* Designation sequence for a charset of dimension 1.  */
2449               ONE_MORE_BYTE (c1);
2450               if (c1 < ' ' || c1 >= 0x80
2451                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2452                 /* Invalid designation sequence.  Just ignore.  */
2453                 break;
2454             }
2455           else if (c == '$')
2456             {
2457               /* Designation sequence for a charset of dimension 2.  */
2458               ONE_MORE_BYTE (c);
2459               if (c >= '@' && c <= 'B')
2460                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2461                 id = iso_charset_table[1][0][c];
2462               else if (c >= '(' && c <= '/')
2463                 {
2464                   ONE_MORE_BYTE (c1);
2465                   if (c1 < ' ' || c1 >= 0x80
2466                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2467                     /* Invalid designation sequence.  Just ignore.  */
2468                     break;
2469                 }
2470               else
2471                 /* Invalid designation sequence.  Just ignore it.  */
2472                 break;
2473             }
2474           else if (c == 'N' || c == 'O')
2475             {
2476               /* ESC <Fe> for SS2 or SS3.  */
2477               single_shifting = 1;
2478               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2479               break;
2480             }
2481           else if (c >= '0' && c <= '4')
2482             {
2483               /* ESC <Fp> for start/end composition.  */
2484               found |= CATEGORY_MASK_ISO;
2485               break;
2486             }
2487           else
2488             {
2489               /* Invalid escape sequence.  Just ignore it.  */
2490               break;
2491             }
2492
2493           /* We found a valid designation sequence for CHARSET.  */
2494           rejected |= CATEGORY_MASK_ISO_8BIT;
2495           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2496                               id))
2497             found |= CATEGORY_MASK_ISO_7;
2498           else
2499             rejected |= CATEGORY_MASK_ISO_7;
2500           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2501                               id))
2502             found |= CATEGORY_MASK_ISO_7_TIGHT;
2503           else
2504             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2505           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2506                               id))
2507             found |= CATEGORY_MASK_ISO_7_ELSE;
2508           else
2509             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2510           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2511                               id))
2512             found |= CATEGORY_MASK_ISO_8_ELSE;
2513           else
2514             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2515           break;
2516
2517         case ISO_CODE_SO:
2518         case ISO_CODE_SI:
2519           /* Locking shift out/in.  */
2520           if (inhibit_iso_escape_detection)
2521             break;
2522           single_shifting = 0;
2523           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2524           found |= CATEGORY_MASK_ISO_ELSE;
2525           break;
2526
2527         case ISO_CODE_CSI:
2528           /* Control sequence introducer.  */
2529           single_shifting = 0;
2530           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2531           found |= CATEGORY_MASK_ISO_8_ELSE;
2532           goto check_extra_latin;
2533
2534
2535         case ISO_CODE_SS2:
2536         case ISO_CODE_SS3:
2537           /* Single shift.   */
2538           if (inhibit_iso_escape_detection)
2539             break;
2540           single_shifting = 1;
2541           rejected |= CATEGORY_MASK_ISO_7BIT;
2542           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2543               & CODING_ISO_FLAG_SINGLE_SHIFT)
2544             found |= CATEGORY_MASK_ISO_8_1;
2545           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2546               & CODING_ISO_FLAG_SINGLE_SHIFT)
2547             found |= CATEGORY_MASK_ISO_8_2;
2548           goto check_extra_latin;
2549
2550         default:
2551           if (c < 0x80)
2552             {
2553               single_shifting = 0;
2554               break;
2555             }
2556           if (c >= 0xA0)
2557             {
2558               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2559               found |= CATEGORY_MASK_ISO_8_1;
2560               /* Check the length of succeeding codes of the range
2561                  0xA0..0FF.  If the byte length is even, we include
2562                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2563                  only when we are not single shifting.  */
2564               if (! single_shifting
2565                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2566                 {
2567                   int i = 1;
2568                   while (src < src_end)
2569                     {
2570                       ONE_MORE_BYTE (c);
2571                       if (c < 0xA0)
2572                         break;
2573                       i++;
2574                     }
2575
2576                   if (i & 1 && src < src_end)
2577                     rejected |= CATEGORY_MASK_ISO_8_2;
2578                   else
2579                     found |= CATEGORY_MASK_ISO_8_2;
2580                 }
2581               break;
2582             }
2583         check_extra_latin:
2584           single_shifting = 0;
2585           if (! VECTORP (Vlatin_extra_code_table)
2586               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2587             {
2588               rejected = CATEGORY_MASK_ISO;
2589               break;
2590             }
2591           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2592               & CODING_ISO_FLAG_LATIN_EXTRA)
2593             found |= CATEGORY_MASK_ISO_8_1;
2594           else
2595             rejected |= CATEGORY_MASK_ISO_8_1;
2596           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2597               & CODING_ISO_FLAG_LATIN_EXTRA)
2598             found |= CATEGORY_MASK_ISO_8_2;
2599           else
2600             rejected |= CATEGORY_MASK_ISO_8_2;
2601         }
2602     }
2603   detect_info->rejected |= CATEGORY_MASK_ISO;
2604   return 0;
2605
2606  no_more_source:
2607   detect_info->rejected |= rejected;
2608   detect_info->found |= (found & ~rejected);
2609   return 1;
2610 }
2611
2612
2613 /* Set designation state into CODING.  */
2614 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2615   do {                                                                  \
2616     int id, prev;                                                       \
2617                                                                         \
2618     if (final < '0' || final >= 128                                     \
2619         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2620         || !SAFE_CHARSET_P (coding, id))                                \
2621       {                                                                 \
2622         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2623         goto invalid_code;                                              \
2624       }                                                                 \
2625     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2626     if (id == charset_jisx0201_roman)                                   \
2627       {                                                                 \
2628         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2629           id = charset_ascii;                                           \
2630       }                                                                 \
2631     else if (id == charset_jisx0208_1978)                               \
2632       {                                                                 \
2633         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2634           id = charset_jisx0208;                                        \
2635       }                                                                 \
2636     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2637     /* If there was an invalid designation to REG previously, and this  \
2638        designation is ASCII to REG, we should keep this designation     \
2639        sequence.  */                                                    \
2640     if (prev == -2 && id == charset_ascii)                              \
2641       goto invalid_code;                                                \
2642   } while (0)
2643
2644
2645 #define MAYBE_FINISH_COMPOSITION()                              \
2646   do {                                                          \
2647     int i;                                                      \
2648     if (composition_state == COMPOSING_NO)                      \
2649       break;                                                    \
2650     /* It is assured that we have enough room for producing     \
2651        characters stored in the table `components'.  */         \
2652     if (charbuf + component_idx > charbuf_end)                  \
2653       goto no_more_source;                                      \
2654     composition_state = COMPOSING_NO;                           \
2655     if (method == COMPOSITION_RELATIVE                          \
2656         || method == COMPOSITION_WITH_ALTCHARS)                 \
2657       {                                                         \
2658         for (i = 0; i < component_idx; i++)                     \
2659           *charbuf++ = components[i];                           \
2660         char_offset += component_idx;                           \
2661       }                                                         \
2662     else                                                        \
2663       {                                                         \
2664         for (i = 0; i < component_idx; i += 2)                  \
2665           *charbuf++ = components[i];                           \
2666         char_offset += (component_idx / 2) + 1;                 \
2667       }                                                         \
2668   } while (0)
2669
2670
2671 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2672    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2673    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2674    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2675    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2676   */
2677
2678 #define DECODE_COMPOSITION_START(c1)                                    \
2679   do {                                                                  \
2680     if (c1 == '0'                                                       \
2681         && composition_state == COMPOSING_COMPONENT_RULE)               \
2682       {                                                                 \
2683         component_len = component_idx;                                  \
2684         composition_state = COMPOSING_CHAR;                             \
2685       }                                                                 \
2686     else                                                                \
2687       {                                                                 \
2688         unsigned char *p;                                               \
2689                                                                         \
2690         MAYBE_FINISH_COMPOSITION ();                                    \
2691         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2692           goto no_more_source;                                          \
2693         for (p = src; p < src_end - 1; p++)                             \
2694           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2695             break;                                                      \
2696         if (p == src_end - 1)                                           \
2697           {                                                             \
2698             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2699               goto invalid_code;                                        \
2700             goto no_more_source;                                        \
2701           }                                                             \
2702                                                                         \
2703         /* This is surely the start of a composition.  */               \
2704         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2705                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2706                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2707                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2708         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2709                              : COMPOSING_COMPONENT_CHAR);               \
2710         component_idx = component_len = 0;                              \
2711       }                                                                 \
2712   } while (0)
2713
2714
2715 /* Handle compositoin end sequence ESC 1.  */
2716
2717 #define DECODE_COMPOSITION_END()                                        \
2718   do {                                                                  \
2719     int nchars = (component_len > 0 ? component_idx - component_len     \
2720                   : method == COMPOSITION_RELATIVE ? component_idx      \
2721                   : (component_idx + 1) / 2);                           \
2722     int i;                                                              \
2723     int *saved_charbuf = charbuf;                                       \
2724     int from = coding->produced_char + char_offset;                     \
2725     int to = from + nchars;                                             \
2726                                                                         \
2727     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2728     if (method != COMPOSITION_RELATIVE)                                 \
2729       {                                                                 \
2730         if (component_len == 0)                                         \
2731           for (i = 0; i < component_idx; i++)                           \
2732             *charbuf++ = components[i];                                 \
2733         else                                                            \
2734           for (i = 0; i < component_len; i++)                           \
2735             *charbuf++ = components[i];                                 \
2736         *saved_charbuf = saved_charbuf - charbuf;                       \
2737       }                                                                 \
2738     if (method == COMPOSITION_WITH_RULE)                                \
2739       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2740         *charbuf++ = components[i];                                     \
2741     else                                                                \
2742       for (i = component_len; i < component_idx; i++, char_offset++)    \
2743         *charbuf++ = components[i];                                     \
2744     coding->annotated = 1;                                              \
2745     composition_state = COMPOSING_NO;                                   \
2746   } while (0)
2747
2748
2749 /* Decode a composition rule from the byte C1 (and maybe one more byte
2750    from SRC) and store one encoded composition rule in
2751    coding->cmp_data.  */
2752
2753 #define DECODE_COMPOSITION_RULE(c1)                                     \
2754   do {                                                                  \
2755     (c1) -= 32;                                                         \
2756     if (c1 < 81)                /* old format (before ver.21) */        \
2757       {                                                                 \
2758         int gref = (c1) / 9;                                            \
2759         int nref = (c1) % 9;                                            \
2760         if (gref == 4) gref = 10;                                       \
2761         if (nref == 4) nref = 10;                                       \
2762         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2763       }                                                                 \
2764     else if (c1 < 93)           /* new format (after ver.21) */         \
2765       {                                                                 \
2766         ONE_MORE_BYTE (c2);                                             \
2767         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2768       }                                                                 \
2769     else                                                                \
2770       c1 = 0;                                                           \
2771   } while (0)
2772
2773
2774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2775
2776 static void
2777 decode_coding_iso_2022 (coding)
2778      struct coding_system *coding;
2779 {
2780   unsigned char *src = coding->source + coding->consumed;
2781   unsigned char *src_end = coding->source + coding->src_bytes;
2782   unsigned char *src_base;
2783   int *charbuf = coding->charbuf;
2784   int *charbuf_end
2785     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2786   int consumed_chars = 0, consumed_chars_base;
2787   int multibytep = coding->src_multibyte;
2788   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2789   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2790   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2791   struct charset *charset;
2792   int c;
2793   /* For handling composition sequence.  */
2794 #define COMPOSING_NO                    0
2795 #define COMPOSING_CHAR                  1
2796 #define COMPOSING_RULE                  2
2797 #define COMPOSING_COMPONENT_CHAR        3
2798 #define COMPOSING_COMPONENT_RULE        4
2799
2800   int composition_state = COMPOSING_NO;
2801   enum composition_method method;
2802   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2803   int component_idx;
2804   int component_len;
2805   Lisp_Object attrs, eol_type, charset_list;
2806   int char_offset = coding->produced_char;
2807   int last_offset = char_offset;
2808   int last_id = charset_ascii;
2809
2810   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2811   setup_iso_safe_charsets (attrs);
2812
2813   while (1)
2814     {
2815       int c1, c2;
2816
2817       src_base = src;
2818       consumed_chars_base = consumed_chars;
2819
2820       if (charbuf >= charbuf_end)
2821         break;
2822
2823       ONE_MORE_BYTE (c1);
2824
2825       /* We produce at most one character.  */
2826       switch (iso_code_class [c1])
2827         {
2828         case ISO_0x20_or_0x7F:
2829           if (composition_state != COMPOSING_NO)
2830             {
2831               if (composition_state == COMPOSING_RULE
2832                   || composition_state == COMPOSING_COMPONENT_RULE)
2833                 {
2834                   DECODE_COMPOSITION_RULE (c1);
2835                   components[component_idx++] = c1;
2836                   composition_state--;
2837                   continue;
2838                 }
2839             }
2840           if (charset_id_0 < 0
2841               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2842             /* This is SPACE or DEL.  */
2843             charset = CHARSET_FROM_ID (charset_ascii);
2844           else
2845             charset = CHARSET_FROM_ID (charset_id_0);
2846           break;
2847
2848         case ISO_graphic_plane_0:
2849           if (composition_state != COMPOSING_NO)
2850             {
2851               if (composition_state == COMPOSING_RULE
2852                   || composition_state == COMPOSING_COMPONENT_RULE)
2853                 {
2854                   DECODE_COMPOSITION_RULE (c1);
2855                   components[component_idx++] = c1;
2856                   composition_state--;
2857                   continue;
2858                 }
2859             }
2860           charset = CHARSET_FROM_ID (charset_id_0);
2861           break;
2862
2863         case ISO_0xA0_or_0xFF:
2864           if (charset_id_1 < 0
2865               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2866               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2867             goto invalid_code;
2868           /* This is a graphic character, we fall down ... */
2869
2870         case ISO_graphic_plane_1:
2871           if (charset_id_1 < 0)
2872             goto invalid_code;
2873           charset = CHARSET_FROM_ID (charset_id_1);
2874           break;
2875
2876         case ISO_carriage_return:
2877           if (c1 == '\r')
2878             {
2879               if (EQ (eol_type, Qdos))
2880                 {
2881                   if (src == src_end)
2882                     {
2883                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2884                       goto no_more_source;
2885                     }
2886                   if (*src == '\n')
2887                     ONE_MORE_BYTE (c1);
2888                 }
2889               else if (EQ (eol_type, Qmac))
2890                 c1 = '\n';
2891             }
2892           /* fall through */
2893
2894         case ISO_control_0:
2895           MAYBE_FINISH_COMPOSITION ();
2896           charset = CHARSET_FROM_ID (charset_ascii);
2897           break;
2898
2899         case ISO_control_1:
2900           MAYBE_FINISH_COMPOSITION ();
2901           goto invalid_code;
2902
2903         case ISO_shift_out:
2904           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2905               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2906             goto invalid_code;
2907           CODING_ISO_INVOCATION (coding, 0) = 1;
2908           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2909           continue;
2910
2911         case ISO_shift_in:
2912           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2913             goto invalid_code;
2914           CODING_ISO_INVOCATION (coding, 0) = 0;
2915           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2916           continue;
2917
2918         case ISO_single_shift_2_7:
2919         case ISO_single_shift_2:
2920           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2921             goto invalid_code;
2922           /* SS2 is handled as an escape sequence of ESC 'N' */
2923           c1 = 'N';
2924           goto label_escape_sequence;
2925
2926         case ISO_single_shift_3:
2927           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2928             goto invalid_code;
2929           /* SS2 is handled as an escape sequence of ESC 'O' */
2930           c1 = 'O';
2931           goto label_escape_sequence;
2932
2933         case ISO_control_sequence_introducer:
2934           /* CSI is handled as an escape sequence of ESC '[' ...  */
2935           c1 = '[';
2936           goto label_escape_sequence;
2937
2938         case ISO_escape:
2939           ONE_MORE_BYTE (c1);
2940         label_escape_sequence:
2941           /* Escape sequences handled here are invocation,
2942              designation, direction specification, and character
2943              composition specification.  */
2944           switch (c1)
2945             {
2946             case '&':           /* revision of following character set */
2947               ONE_MORE_BYTE (c1);
2948               if (!(c1 >= '@' && c1 <= '~'))
2949                 goto invalid_code;
2950               ONE_MORE_BYTE (c1);
2951               if (c1 != ISO_CODE_ESC)
2952                 goto invalid_code;
2953               ONE_MORE_BYTE (c1);
2954               goto label_escape_sequence;
2955
2956             case '$':           /* designation of 2-byte character set */
2957               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2958                 goto invalid_code;
2959               ONE_MORE_BYTE (c1);
2960               if (c1 >= '@' && c1 <= 'B')
2961                 {       /* designation of JISX0208.1978, GB2312.1980,
2962                            or JISX0208.1980 */
2963                   DECODE_DESIGNATION (0, 2, 0, c1);
2964                 }
2965               else if (c1 >= 0x28 && c1 <= 0x2B)
2966                 {       /* designation of DIMENSION2_CHARS94 character set */
2967                   ONE_MORE_BYTE (c2);
2968                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2969                 }
2970               else if (c1 >= 0x2C && c1 <= 0x2F)
2971                 {       /* designation of DIMENSION2_CHARS96 character set */
2972                   ONE_MORE_BYTE (c2);
2973                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2974                 }
2975               else
2976                 goto invalid_code;
2977               /* We must update these variables now.  */
2978               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2979               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2980               continue;
2981
2982             case 'n':           /* invocation of locking-shift-2 */
2983               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2984                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2985                 goto invalid_code;
2986               CODING_ISO_INVOCATION (coding, 0) = 2;
2987               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2988               continue;
2989
2990             case 'o':           /* invocation of locking-shift-3 */
2991               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2992                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2993                 goto invalid_code;
2994               CODING_ISO_INVOCATION (coding, 0) = 3;
2995               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2996               continue;
2997
2998             case 'N':           /* invocation of single-shift-2 */
2999               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3000                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3001                 goto invalid_code;
3002               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3003               ONE_MORE_BYTE (c1);
3004               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3005                 goto invalid_code;
3006               break;
3007
3008             case 'O':           /* invocation of single-shift-3 */
3009               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3010                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3011                 goto invalid_code;
3012               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3013               ONE_MORE_BYTE (c1);
3014               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3015                 goto invalid_code;
3016               break;
3017
3018             case '0': case '2': case '3': case '4': /* start composition */
3019               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3020                 goto invalid_code;
3021               DECODE_COMPOSITION_START (c1);
3022               continue;
3023
3024             case '1':           /* end composition */
3025               if (composition_state == COMPOSING_NO)
3026                 goto invalid_code;
3027               DECODE_COMPOSITION_END ();
3028               continue;
3029
3030             case '[':           /* specification of direction */
3031               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3032                 goto invalid_code;
3033               /* For the moment, nested direction is not supported.
3034                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3035                  left-to-right, and nozero means right-to-left.  */
3036               ONE_MORE_BYTE (c1);
3037               switch (c1)
3038                 {
3039                 case ']':       /* end of the current direction */
3040                   coding->mode &= ~CODING_MODE_DIRECTION;
3041
3042                 case '0':       /* end of the current direction */
3043                 case '1':       /* start of left-to-right direction */
3044                   ONE_MORE_BYTE (c1);
3045                   if (c1 == ']')
3046                     coding->mode &= ~CODING_MODE_DIRECTION;
3047                   else
3048                     goto invalid_code;
3049                   break;
3050
3051                 case '2':       /* start of right-to-left direction */
3052                   ONE_MORE_BYTE (c1);
3053                   if (c1 == ']')
3054                     coding->mode |= CODING_MODE_DIRECTION;
3055                   else
3056                     goto invalid_code;
3057                   break;
3058
3059                 default:
3060                   goto invalid_code;
3061                 }
3062               continue;
3063
3064             case '%':
3065               ONE_MORE_BYTE (c1);
3066               if (c1 == '/')
3067                 {
3068                   /* CTEXT extended segment:
3069                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3070                      We keep these bytes as is for the moment.
3071                      They may be decoded by post-read-conversion.  */
3072                   int dim, M, L;
3073                   int size;
3074
3075                   ONE_MORE_BYTE (dim);
3076                   ONE_MORE_BYTE (M);
3077                   ONE_MORE_BYTE (L);
3078                   size = ((M - 128) * 128) + (L - 128);
3079                   if (charbuf + 8 + size > charbuf_end)
3080                     goto break_loop;
3081                   *charbuf++ = ISO_CODE_ESC;
3082                   *charbuf++ = '%';
3083                   *charbuf++ = '/';
3084                   *charbuf++ = dim;
3085                   *charbuf++ = BYTE8_TO_CHAR (M);
3086                   *charbuf++ = BYTE8_TO_CHAR (L);
3087                   while (size-- > 0)
3088                     {
3089                       ONE_MORE_BYTE (c1);
3090                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3091                     }
3092                 }
3093               else if (c1 == 'G')
3094                 {
3095                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3096                      ESC % G --UTF-8-BYTES-- ESC % @
3097                      We keep these bytes as is for the moment.
3098                      They may be decoded by post-read-conversion.  */
3099                   int *p = charbuf;
3100
3101                   if (p + 6 > charbuf_end)
3102                     goto break_loop;
3103                   *p++ = ISO_CODE_ESC;
3104                   *p++ = '%';
3105                   *p++ = 'G';
3106                   while (p < charbuf_end)
3107                     {
3108                       ONE_MORE_BYTE (c1);
3109                       if (c1 == ISO_CODE_ESC
3110                           && src + 1 < src_end
3111                           && src[0] == '%'
3112                           && src[1] == '@')
3113                         break;
3114                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3115                     }
3116                   if (p + 3 > charbuf_end)
3117                     goto break_loop;
3118                   *p++ = ISO_CODE_ESC;
3119                   *p++ = '%';
3120                   *p++ = '@';
3121                   charbuf = p;
3122                 }
3123               else
3124                 goto invalid_code;
3125               continue;
3126               break;
3127
3128             default:
3129               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3130                 goto invalid_code;
3131               if (c1 >= 0x28 && c1 <= 0x2B)
3132                 {       /* designation of DIMENSION1_CHARS94 character set */
3133                   ONE_MORE_BYTE (c2);
3134                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3135                 }
3136               else if (c1 >= 0x2C && c1 <= 0x2F)
3137                 {       /* designation of DIMENSION1_CHARS96 character set */
3138                   ONE_MORE_BYTE (c2);
3139                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3140                 }
3141               else
3142                 goto invalid_code;
3143               /* We must update these variables now.  */
3144               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3145               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3146               continue;
3147             }
3148         }
3149
3150       if (charset->id != charset_ascii
3151           && last_id != charset->id)
3152         {
3153           if (last_id != charset_ascii)
3154             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3155           last_id = charset->id;
3156           last_offset = char_offset;
3157         }
3158
3159       /* Now we know CHARSET and 1st position code C1 of a character.
3160          Produce a decoded character while getting 2nd position code
3161          C2 if necessary.  */
3162       c1 &= 0x7F;
3163       if (CHARSET_DIMENSION (charset) > 1)
3164         {
3165           ONE_MORE_BYTE (c2);
3166           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3167             /* C2 is not in a valid range.  */
3168             goto invalid_code;
3169           c1 = (c1 << 8) | (c2 & 0x7F);
3170           if (CHARSET_DIMENSION (charset) > 2)
3171             {
3172               ONE_MORE_BYTE (c2);
3173               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3174                 /* C2 is not in a valid range.  */
3175                 goto invalid_code;
3176               c1 = (c1 << 8) | (c2 & 0x7F);
3177             }
3178         }
3179
3180       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3181       if (c < 0)
3182         {
3183           MAYBE_FINISH_COMPOSITION ();
3184           for (; src_base < src; src_base++, char_offset++)
3185             {
3186               if (ASCII_BYTE_P (*src_base))
3187                 *charbuf++ = *src_base;
3188               else
3189                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3190             }
3191         }
3192       else if (composition_state == COMPOSING_NO)
3193         {
3194           *charbuf++ = c;
3195           char_offset++;
3196         }
3197       else
3198         {
3199           components[component_idx++] = c;
3200           if (method == COMPOSITION_WITH_RULE
3201               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3202                   && composition_state == COMPOSING_COMPONENT_CHAR))
3203             composition_state++;
3204         }
3205       continue;
3206
3207     invalid_code:
3208       MAYBE_FINISH_COMPOSITION ();
3209       src = src_base;
3210       consumed_chars = consumed_chars_base;
3211       ONE_MORE_BYTE (c);
3212       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3213       char_offset++;
3214       coding->errors++;
3215       continue;
3216
3217     break_loop:
3218       break;
3219     }
3220
3221  no_more_source:
3222   if (last_id != charset_ascii)
3223     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224   coding->consumed_char += consumed_chars_base;
3225   coding->consumed = src_base - coding->source;
3226   coding->charbuf_used = charbuf - coding->charbuf;
3227 }
3228
3229
3230 /* ISO2022 encoding stuff.  */
3231
3232 /*
3233    It is not enough to say just "ISO2022" on encoding, we have to
3234    specify more details.  In Emacs, each coding system of ISO2022
3235    variant has the following specifications:
3236         1. Initial designation to G0 thru G3.
3237         2. Allows short-form designation?
3238         3. ASCII should be designated to G0 before control characters?
3239         4. ASCII should be designated to G0 at end of line?
3240         5. 7-bit environment or 8-bit environment?
3241         6. Use locking-shift?
3242         7. Use Single-shift?
3243    And the following two are only for Japanese:
3244         8. Use ASCII in place of JIS0201-1976-Roman?
3245         9. Use JISX0208-1983 in place of JISX0208-1978?
3246    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3248    details.
3249 */
3250
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3253    '@', 'A', or 'B' and the coding system CODING allows, produce
3254    designation sequence of short-form.  */
3255
3256 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3257   do {                                                                  \
3258     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3259     char *intermediate_char_94 = "()*+";                                \
3260     char *intermediate_char_96 = ",-./";                                \
3261     int revision = -1;                                                  \
3262     int c;                                                              \
3263                                                                         \
3264     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3265       revision = CHARSET_ISO_REVISION (charset);                        \
3266                                                                         \
3267     if (revision >= 0)                                                  \
3268       {                                                                 \
3269         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3270         EMIT_ONE_BYTE ('@' + revision);                                 \
3271       }                                                                 \
3272     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3273     if (CHARSET_DIMENSION (charset) == 1)                               \
3274       {                                                                 \
3275         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3276           c = intermediate_char_94[reg];                                \
3277         else                                                            \
3278           c = intermediate_char_96[reg];                                \
3279         EMIT_ONE_ASCII_BYTE (c);                                        \
3280       }                                                                 \
3281     else                                                                \
3282       {                                                                 \
3283         EMIT_ONE_ASCII_BYTE ('$');                                      \
3284         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3285           {                                                             \
3286             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3287                 || reg != 0                                             \
3288                 || final_char < '@' || final_char > 'B')                \
3289               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3290           }                                                             \
3291         else                                                            \
3292           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3293       }                                                                 \
3294     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3295                                                                         \
3296     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3297   } while (0)
3298
3299
3300 /* The following two macros produce codes (control character or escape
3301    sequence) for ISO2022 single-shift functions (single-shift-2 and
3302    single-shift-3).  */
3303
3304 #define ENCODE_SINGLE_SHIFT_2                                           \
3305   do {                                                                  \
3306     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3307       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3308     else                                                                \
3309       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3310     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3311   } while (0)
3312
3313
3314 #define ENCODE_SINGLE_SHIFT_3                                           \
3315   do {                                                                  \
3316     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3317       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3318     else                                                                \
3319       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3320     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3321   } while (0)
3322
3323
3324 /* The following four macros produce codes (control character or
3325    escape sequence) for ISO2022 locking-shift functions (shift-in,
3326    shift-out, locking-shift-2, and locking-shift-3).  */
3327
3328 #define ENCODE_SHIFT_IN                                 \
3329   do {                                                  \
3330     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3331     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3332   } while (0)
3333
3334
3335 #define ENCODE_SHIFT_OUT                                \
3336   do {                                                  \
3337     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3338     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3339   } while (0)
3340
3341
3342 #define ENCODE_LOCKING_SHIFT_2                          \
3343   do {                                                  \
3344     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3345     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3346   } while (0)
3347
3348
3349 #define ENCODE_LOCKING_SHIFT_3                          \
3350   do {                                                  \
3351     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3352     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3353   } while (0)
3354
3355
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357    CHARSET and whose position-code is C1.  Designation and invocation
3358    sequences are also produced in advance if necessary.  */
3359
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3361   do {                                                                  \
3362     int id = CHARSET_ID (charset);                                      \
3363                                                                         \
3364     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3365         && id == charset_ascii)                                         \
3366       {                                                                 \
3367         id = charset_jisx0201_roman;                                    \
3368         charset = CHARSET_FROM_ID (id);                                 \
3369       }                                                                 \
3370                                                                         \
3371     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3372       {                                                                 \
3373         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3374           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3375         else                                                            \
3376           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3377         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3378         break;                                                          \
3379       }                                                                 \
3380     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3381       {                                                                 \
3382         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3383         break;                                                          \
3384       }                                                                 \
3385     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3386       {                                                                 \
3387         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3388         break;                                                          \
3389       }                                                                 \
3390     else                                                                \
3391       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3392          must invoke it, or, at first, designate it to some graphic     \
3393          register.  Then repeat the loop to actually produce the        \
3394          character.  */                                                 \
3395       dst = encode_invocation_designation (charset, coding, dst,        \
3396                                            &produced_chars);            \
3397   } while (1)
3398
3399
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401    CHARSET and whose position-codes are C1 and C2.  Designation and
3402    invocation codes are also produced in advance if necessary.  */
3403
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3405   do {                                                                  \
3406     int id = CHARSET_ID (charset);                                      \
3407                                                                         \
3408     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3409         && id == charset_jisx0208)                                      \
3410       {                                                                 \
3411         id = charset_jisx0208_1978;                                     \
3412         charset = CHARSET_FROM_ID (id);                                 \
3413       }                                                                 \
3414                                                                         \
3415     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3416       {                                                                 \
3417         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3418           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3419         else                                                            \
3420           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3421         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3422         break;                                                          \
3423       }                                                                 \
3424     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3425       {                                                                 \
3426         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3427         break;                                                          \
3428       }                                                                 \
3429     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3430       {                                                                 \
3431         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3432         break;                                                          \
3433       }                                                                 \
3434     else                                                                \
3435       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3436          must invoke it, or, at first, designate it to some graphic     \
3437          register.  Then repeat the loop to actually produce the        \
3438          character.  */                                                 \
3439       dst = encode_invocation_designation (charset, coding, dst,        \
3440                                            &produced_chars);            \
3441   } while (1)
3442
3443
3444 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3445   do {                                                                     \
3446     int code = ENCODE_CHAR ((charset),(c));                                \
3447                                                                            \
3448     if (CHARSET_DIMENSION (charset) == 1)                                  \
3449       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3450     else                                                                   \
3451       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3452   } while (0)
3453
3454
3455 /* Produce designation and invocation codes at a place pointed by DST
3456    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3457    Return new DST.  */
3458
3459 unsigned char *
3460 encode_invocation_designation (charset, coding, dst, p_nchars)
3461      struct charset *charset;
3462      struct coding_system *coding;
3463      unsigned char *dst;
3464      int *p_nchars;
3465 {
3466   int multibytep = coding->dst_multibyte;
3467   int produced_chars = *p_nchars;
3468   int reg;                      /* graphic register number */
3469   int id = CHARSET_ID (charset);
3470
3471   /* At first, check designations.  */
3472   for (reg = 0; reg < 4; reg++)
3473     if (id == CODING_ISO_DESIGNATION (coding, reg))
3474       break;
3475
3476   if (reg >= 4)
3477     {
3478       /* CHARSET is not yet designated to any graphic registers.  */
3479       /* At first check the requested designation.  */
3480       reg = CODING_ISO_REQUEST (coding, id);
3481       if (reg < 0)
3482         /* Since CHARSET requests no special designation, designate it
3483            to graphic register 0.  */
3484         reg = 0;
3485
3486       ENCODE_DESIGNATION (charset, reg, coding);
3487     }
3488
3489   if (CODING_ISO_INVOCATION (coding, 0) != reg
3490       && CODING_ISO_INVOCATION (coding, 1) != reg)
3491     {
3492       /* Since the graphic register REG is not invoked to any graphic
3493          planes, invoke it to graphic plane 0.  */
3494       switch (reg)
3495         {
3496         case 0:                 /* graphic register 0 */
3497           ENCODE_SHIFT_IN;
3498           break;
3499
3500         case 1:                 /* graphic register 1 */
3501           ENCODE_SHIFT_OUT;
3502           break;
3503
3504         case 2:                 /* graphic register 2 */
3505           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3506             ENCODE_SINGLE_SHIFT_2;
3507           else
3508             ENCODE_LOCKING_SHIFT_2;
3509           break;
3510
3511         case 3:                 /* graphic register 3 */
3512           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3513             ENCODE_SINGLE_SHIFT_3;
3514           else
3515             ENCODE_LOCKING_SHIFT_3;
3516           break;
3517         }
3518     }
3519
3520   *p_nchars = produced_chars;
3521   return dst;
3522 }
3523
3524 /* The following three macros produce codes for indicating direction
3525    of text.  */
3526 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3527   do {                                                                  \
3528     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3529       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3530     else                                                                \
3531       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3532   } while (0)
3533
3534
3535 #define ENCODE_DIRECTION_R2L()                  \
3536   do {                                          \
3537     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3538     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3539   } while (0)
3540
3541
3542 #define ENCODE_DIRECTION_L2R()                  \
3543   do {                                          \
3544     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3545     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3546   } while (0)
3547
3548
3549 /* Produce codes for designation and invocation to reset the graphic
3550    planes and registers to initial state.  */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3552   do {                                                                  \
3553     int reg;                                                            \
3554     struct charset *charset;                                            \
3555                                                                         \
3556     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3557       ENCODE_SHIFT_IN;                                                  \
3558     for (reg = 0; reg < 4; reg++)                                       \
3559       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3560           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3561               != CODING_ISO_INITIAL (coding, reg)))                     \
3562         {                                                               \
3563           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564           ENCODE_DESIGNATION (charset, reg, coding);                    \
3565         }                                                               \
3566   } while (0)
3567
3568
3569 /* Produce designation sequences of charsets in the line started from
3570    SRC to a place pointed by DST, and return updated DST.
3571
3572    If the current block ends before any end-of-line, we may fail to
3573    find all the necessary designations.  */
3574
3575 static unsigned char *
3576 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3577      struct coding_system *coding;
3578      int *charbuf, *charbuf_end;
3579      unsigned char *dst;
3580 {
3581   struct charset *charset;
3582   /* Table of charsets to be designated to each graphic register.  */
3583   int r[4];
3584   int c, found = 0, reg;
3585   int produced_chars = 0;
3586   int multibytep = coding->dst_multibyte;
3587   Lisp_Object attrs;
3588   Lisp_Object charset_list;
3589
3590   attrs = CODING_ID_ATTRS (coding->id);
3591   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592   if (EQ (charset_list, Qiso_2022))
3593     charset_list = Viso_2022_charset_list;
3594
3595   for (reg = 0; reg < 4; reg++)
3596     r[reg] = -1;
3597
3598   while (found < 4)
3599     {
3600       int id;
3601
3602       c = *charbuf++;
3603       if (c == '\n')
3604         break;
3605       charset = char_charset (c, charset_list, NULL);
3606       id = CHARSET_ID (charset);
3607       reg = CODING_ISO_REQUEST (coding, id);
3608       if (reg >= 0 && r[reg] < 0)
3609         {
3610           found++;
3611           r[reg] = id;
3612         }
3613     }
3614
3615   if (found)
3616     {
3617       for (reg = 0; reg < 4; reg++)
3618         if (r[reg] >= 0
3619             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3620           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3621     }
3622
3623   return dst;
3624 }
3625
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3627
3628 static int
3629 encode_coding_iso_2022 (coding)
3630      struct coding_system *coding;
3631 {
3632   int multibytep = coding->dst_multibyte;
3633   int *charbuf = coding->charbuf;
3634   int *charbuf_end = charbuf + coding->charbuf_used;
3635   unsigned char *dst = coding->destination + coding->produced;
3636   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3637   int safe_room = 16;
3638   int bol_designation
3639     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640        && CODING_ISO_BOL (coding));
3641   int produced_chars = 0;
3642   Lisp_Object attrs, eol_type, charset_list;
3643   int ascii_compatible;
3644   int c;
3645   int preferred_charset_id = -1;
3646
3647   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3648   setup_iso_safe_charsets (attrs);
3649   /* Charset list may have been changed.  */
3650   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3651   coding->safe_charsets
3652     = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3653
3654   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3655
3656   while (charbuf < charbuf_end)
3657     {
3658       ASSURE_DESTINATION (safe_room);
3659
3660       if (bol_designation)
3661         {
3662           unsigned char *dst_prev = dst;
3663
3664           /* We have to produce designation sequences if any now.  */
3665           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3666           bol_designation = 0;
3667           /* We are sure that designation sequences are all ASCII bytes.  */
3668           produced_chars += dst - dst_prev;
3669         }
3670
3671       c = *charbuf++;
3672
3673       if (c < 0)
3674         {
3675           /* Handle an annotation.  */
3676           switch (*charbuf)
3677             {
3678             case CODING_ANNOTATE_COMPOSITION_MASK:
3679               /* Not yet implemented.  */
3680               break;
3681             case CODING_ANNOTATE_CHARSET_MASK:
3682               preferred_charset_id = charbuf[3];
3683               if (preferred_charset_id >= 0
3684                   && NILP (Fmemq (make_number (preferred_charset_id),
3685                                   charset_list)))
3686                 preferred_charset_id = -1;
3687               break;
3688             default:
3689               abort ();
3690             }
3691           charbuf += -c - 1;
3692           continue;
3693         }
3694
3695       /* Now encode the character C.  */
3696       if (c < 0x20 || c == 0x7F)
3697         {
3698           if (c == '\n'
3699               || (c == '\r' && EQ (eol_type, Qmac)))
3700             {
3701               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3702                 ENCODE_RESET_PLANE_AND_REGISTER ();
3703               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3704                 {
3705                   int i;
3706
3707                   for (i = 0; i < 4; i++)
3708                     CODING_ISO_DESIGNATION (coding, i)
3709                       = CODING_ISO_INITIAL (coding, i);
3710                 }
3711               bol_designation
3712                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3713             }
3714           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3715             ENCODE_RESET_PLANE_AND_REGISTER ();
3716           EMIT_ONE_ASCII_BYTE (c);
3717         }
3718       else if (ASCII_CHAR_P (c))
3719         {
3720           if (ascii_compatible)
3721             EMIT_ONE_ASCII_BYTE (c);
3722           else
3723             {
3724               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3725               ENCODE_ISO_CHARACTER (charset, c);
3726             }
3727         }
3728       else if (CHAR_BYTE8_P (c))
3729         {
3730           c = CHAR_TO_BYTE8 (c);
3731           EMIT_ONE_BYTE (c);
3732         }
3733       else
3734         {
3735           struct charset *charset;
3736
3737           if (preferred_charset_id >= 0)
3738             {
3739               charset = CHARSET_FROM_ID (preferred_charset_id);
3740               if (! CHAR_CHARSET_P (c, charset))
3741                 charset = char_charset (c, charset_list, NULL);
3742             }
3743           else
3744             charset = char_charset (c, charset_list, NULL);
3745           if (!charset)
3746             {
3747               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3748                 {
3749                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3750                   charset = CHARSET_FROM_ID (charset_ascii);
3751                 }
3752               else
3753                 {
3754                   c = coding->default_char;
3755                   charset = char_charset (c, charset_list, NULL);
3756                 }
3757             }
3758           ENCODE_ISO_CHARACTER (charset, c);
3759         }
3760     }
3761
3762   if (coding->mode & CODING_MODE_LAST_BLOCK
3763       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3764     {
3765       ASSURE_DESTINATION (safe_room);
3766       ENCODE_RESET_PLANE_AND_REGISTER ();
3767     }
3768   coding->result = CODING_RESULT_SUCCESS;
3769   CODING_ISO_BOL (coding) = bol_designation;
3770   coding->produced_char += produced_chars;
3771   coding->produced = dst - coding->destination;
3772   return 0;
3773 }
3774
3775 \f
3776 /*** 8,9. SJIS and BIG5 handlers ***/
3777
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779    quite widely.  So, for the moment, Emacs supports them in the bare
3780    C code.  But, in the future, they may be supported only by CCL.  */
3781
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3784    as is.  A character of charset katakana-jisx0201 is encoded by
3785    "position-code + 0x80".  A character of charset japanese-jisx0208
3786    is encoded in 2-byte but two position-codes are divided and shifted
3787    so that it fit in the range below.
3788
3789    --- CODE RANGE of SJIS ---
3790    (character set)      (range)
3791    ASCII                0x00 .. 0x7F
3792    KATAKANA-JISX0201    0xA0 .. 0xDF
3793    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3794             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3795    -------------------------------
3796
3797 */
3798
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3801    character set and is encoded in two-byte.
3802
3803    --- CODE RANGE of BIG5 ---
3804    (character set)      (range)
3805    ASCII                0x00 .. 0x7F
3806    Big5 (1st byte)      0xA1 .. 0xFE
3807         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3808    --------------------------
3809
3810   */
3811
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813    Check if a text is encoded in SJIS.  If it is, return
3814    CATEGORY_MASK_SJIS, else return 0.  */
3815
3816 static int
3817 detect_coding_sjis (coding, detect_info)
3818      struct coding_system *coding;
3819      struct coding_detection_info *detect_info;
3820 {
3821   unsigned char *src = coding->source, *src_base = src;
3822   unsigned char *src_end = coding->source + coding->src_bytes;
3823   int multibytep = coding->src_multibyte;
3824   int consumed_chars = 0;
3825   int found = 0;
3826   int c;
3827   int incomplete;
3828
3829   detect_info->checked |= CATEGORY_MASK_SJIS;
3830   /* A coding system of this category is always ASCII compatible.  */
3831   src += coding->head_ascii;
3832
3833   while (1)
3834     {
3835       incomplete = 0;
3836       ONE_MORE_BYTE (c);
3837       incomplete = 1;
3838       if (c < 0x80)
3839         continue;
3840       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3841         {
3842           ONE_MORE_BYTE (c);
3843           if (c < 0x40 || c == 0x7F || c > 0xFC)
3844             break;
3845           found = CATEGORY_MASK_SJIS;
3846         }
3847       else if (c >= 0xA0 && c < 0xE0)
3848         found = CATEGORY_MASK_SJIS;
3849       else
3850         break;
3851     }
3852   detect_info->rejected |= CATEGORY_MASK_SJIS;
3853   return 0;
3854
3855  no_more_source:
3856   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3857     {
3858       detect_info->rejected |= CATEGORY_MASK_SJIS;
3859       return 0;
3860     }
3861   detect_info->found |= found;
3862   return 1;
3863 }
3864
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866    Check if a text is encoded in BIG5.  If it is, return
3867    CATEGORY_MASK_BIG5, else return 0.  */
3868
3869 static int
3870 detect_coding_big5 (coding, detect_info)
3871      struct coding_system *coding;
3872      struct coding_detection_info *detect_info;
3873 {
3874   unsigned char *src = coding->source, *src_base = src;
3875   unsigned char *src_end = coding->source + coding->src_bytes;
3876   int multibytep = coding->src_multibyte;
3877   int consumed_chars = 0;
3878   int found = 0;
3879   int c;
3880   int incomplete;
3881
3882   detect_info->checked |= CATEGORY_MASK_BIG5;
3883   /* A coding system of this category is always ASCII compatible.  */
3884   src += coding->head_ascii;
3885
3886   while (1)
3887     {
3888       incomplete = 0;
3889       ONE_MORE_BYTE (c);
3890       incomplete = 1;
3891       if (c < 0x80)
3892         continue;
3893       if (c >= 0xA1)
3894         {
3895           ONE_MORE_BYTE (c);
3896           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3897             return 0;
3898           found = CATEGORY_MASK_BIG5;
3899         }
3900       else
3901         break;
3902     }
3903   detect_info->rejected |= CATEGORY_MASK_BIG5;
3904   return 0;
3905
3906  no_more_source:
3907   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3908     {
3909       detect_info->rejected |= CATEGORY_MASK_BIG5;
3910       return 0;
3911     }
3912   detect_info->found |= found;
3913   return 1;
3914 }
3915
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3918
3919 static void
3920 decode_coding_sjis (coding)
3921      struct coding_system *coding;
3922 {
3923   unsigned char *src = coding->source + coding->consumed;
3924   unsigned char *src_end = coding->source + coding->src_bytes;
3925   unsigned char *src_base;
3926   int *charbuf = coding->charbuf;
3927   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3928   int consumed_chars = 0, consumed_chars_base;
3929   int multibytep = coding->src_multibyte;
3930   struct charset *charset_roman, *charset_kanji, *charset_kana;
3931   Lisp_Object attrs, eol_type, charset_list, val;
3932   int char_offset = coding->produced_char;
3933   int last_offset = char_offset;
3934   int last_id = charset_ascii;
3935
3936   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3937
3938   val = charset_list;
3939   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3940   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3941   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3942
3943   while (1)
3944     {
3945       int c, c1;
3946
3947       src_base = src;
3948       consumed_chars_base = consumed_chars;
3949
3950       if (charbuf >= charbuf_end)
3951         break;
3952
3953       ONE_MORE_BYTE (c);
3954
3955       if (c == '\r')
3956         {
3957           if (EQ (eol_type, Qdos))
3958             {
3959               if (src == src_end)
3960                 {
3961                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3962                   goto no_more_source;
3963                 }
3964               if (*src == '\n')
3965                 ONE_MORE_BYTE (c);
3966             }
3967           else if (EQ (eol_type, Qmac))
3968             c = '\n';
3969         }
3970       else
3971         {
3972           struct charset *charset;
3973
3974           if (c < 0x80)
3975             charset = charset_roman;
3976           else
3977             {
3978               if (c >= 0xF0)
3979                 goto invalid_code;
3980               if (c < 0xA0 || c >= 0xE0)
3981                 {
3982                   /* SJIS -> JISX0208 */
3983                   ONE_MORE_BYTE (c1);
3984                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3985                     goto invalid_code;
3986                   c = (c << 8) | c1;
3987                   SJIS_TO_JIS (c);
3988                   charset = charset_kanji;
3989                 }
3990               else if (c > 0xA0)
3991                 {
3992                   /* SJIS -> JISX0201-Kana */
3993                   c &= 0x7F;
3994                   charset = charset_kana;
3995                 }
3996               else
3997                 goto invalid_code;
3998             }
3999           if (charset->id != charset_ascii
4000               && last_id != charset->id)
4001             {
4002               if (last_id != charset_ascii)
4003                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4004               last_id = charset->id;
4005               last_offset = char_offset;
4006             }
4007           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4008         }
4009       *charbuf++ = c;
4010       char_offset++;
4011       continue;
4012
4013     invalid_code:
4014       src = src_base;
4015       consumed_chars = consumed_chars_base;
4016       ONE_MORE_BYTE (c);
4017       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4018       char_offset++;
4019       coding->errors++;
4020     }
4021
4022  no_more_source:
4023   if (last_id != charset_ascii)
4024     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4025   coding->consumed_char += consumed_chars_base;
4026   coding->consumed = src_base - coding->source;
4027   coding->charbuf_used = charbuf - coding->charbuf;
4028 }
4029
4030 static void
4031 decode_coding_big5 (coding)
4032      struct coding_system *coding;
4033 {
4034   unsigned char *src = coding->source + coding->consumed;
4035   unsigned char *src_end = coding->source + coding->src_bytes;
4036   unsigned char *src_base;
4037   int *charbuf = coding->charbuf;
4038   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4039   int consumed_chars = 0, consumed_chars_base;
4040   int multibytep = coding->src_multibyte;
4041   struct charset *charset_roman, *charset_big5;
4042   Lisp_Object attrs, eol_type, charset_list, val;
4043   int char_offset = coding->produced_char;
4044   int last_offset = char_offset;
4045   int last_id = charset_ascii;
4046
4047   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4048   val = charset_list;
4049   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4050   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4051
4052   while (1)
4053     {
4054       int c, c1;
4055
4056       src_base = src;
4057       consumed_chars_base = consumed_chars;
4058
4059       if (charbuf >= charbuf_end)
4060         break;
4061
4062       ONE_MORE_BYTE (c);
4063
4064       if (c == '\r')
4065         {
4066           if (EQ (eol_type, Qdos))
4067             {
4068               if (src == src_end)
4069                 {
4070                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4071                   goto no_more_source;
4072                 }
4073               if (*src == '\n')
4074                 ONE_MORE_BYTE (c);
4075             }
4076           else if (EQ (eol_type, Qmac))
4077             c = '\n';
4078         }
4079       else
4080         {
4081           struct charset *charset;
4082           if (c < 0x80)
4083             charset = charset_roman;
4084           else
4085             {
4086               /* BIG5 -> Big5 */
4087               if (c < 0xA1 || c > 0xFE)
4088                 goto invalid_code;
4089               ONE_MORE_BYTE (c1);
4090               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4091                 goto invalid_code;
4092               c = c << 8 | c1;
4093               charset = charset_big5;
4094             }
4095           if (charset->id != charset_ascii
4096               && last_id != charset->id)
4097             {
4098               if (last_id != charset_ascii)
4099                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4100               last_id = charset->id;
4101               last_offset = char_offset;
4102             }
4103           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4104         }
4105
4106       *charbuf++ = c;
4107       char_offset++;
4108       continue;
4109
4110     invalid_code:
4111       src = src_base;
4112       consumed_chars = consumed_chars_base;
4113       ONE_MORE_BYTE (c);
4114       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4115       char_offset++;
4116       coding->errors++;
4117     }
4118
4119  no_more_source:
4120   if (last_id != charset_ascii)
4121     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4122   coding->consumed_char += consumed_chars_base;
4123   coding->consumed = src_base - coding->source;
4124   coding->charbuf_used = charbuf - coding->charbuf;
4125 }
4126
4127 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4128    This function can encode charsets `ascii', `katakana-jisx0201',
4129    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4130    are sure that all these charsets are registered as official charset
4131    (i.e. do not have extended leading-codes).  Characters of other
4132    charsets are produced without any encoding.  If SJIS_P is 1, encode
4133    SJIS text, else encode BIG5 text.  */
4134
4135 static int
4136 encode_coding_sjis (coding)
4137      struct coding_system *coding;
4138 {
4139   int multibytep = coding->dst_multibyte;
4140   int *charbuf = coding->charbuf;
4141   int *charbuf_end = charbuf + coding->charbuf_used;
4142   unsigned char *dst = coding->destination + coding->produced;
4143   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4144   int safe_room = 4;
4145   int produced_chars = 0;
4146   Lisp_Object attrs, eol_type, charset_list, val;
4147   int ascii_compatible;
4148   struct charset *charset_roman, *charset_kanji, *charset_kana;
4149   int c;
4150
4151   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4152   val = charset_list;
4153   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4154   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4155   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4156
4157   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4158
4159   while (charbuf < charbuf_end)
4160     {
4161       ASSURE_DESTINATION (safe_room);
4162       c = *charbuf++;
4163       /* Now encode the character C.  */
4164       if (ASCII_CHAR_P (c) && ascii_compatible)
4165         EMIT_ONE_ASCII_BYTE (c);
4166       else if (CHAR_BYTE8_P (c))
4167         {
4168           c = CHAR_TO_BYTE8 (c);
4169           EMIT_ONE_BYTE (c);
4170         }
4171       else
4172         {
4173           unsigned code;
4174           struct charset *charset = char_charset (c, charset_list, &code);
4175
4176           if (!charset)
4177             {
4178               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4179                 {
4180                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4181                   charset = CHARSET_FROM_ID (charset_ascii);
4182                 }
4183               else
4184                 {
4185                   c = coding->default_char;
4186                   charset = char_charset (c, charset_list, &code);
4187                 }
4188             }
4189           if (code == CHARSET_INVALID_CODE (charset))
4190             abort ();
4191           if (charset == charset_kanji)
4192             {
4193               int c1, c2;
4194               JIS_TO_SJIS (code);
4195               c1 = code >> 8, c2 = code & 0xFF;
4196               EMIT_TWO_BYTES (c1, c2);
4197             }
4198           else if (charset == charset_kana)
4199             EMIT_ONE_BYTE (code | 0x80);
4200           else
4201             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4202         }
4203     }
4204   coding->result = CODING_RESULT_SUCCESS;
4205   coding->produced_char += produced_chars;
4206   coding->produced = dst - coding->destination;
4207   return 0;
4208 }
4209
4210 static int
4211 encode_coding_big5 (coding)
4212      struct coding_system *coding;
4213 {
4214   int multibytep = coding->dst_multibyte;
4215   int *charbuf = coding->charbuf;
4216   int *charbuf_end = charbuf + coding->charbuf_used;
4217   unsigned char *dst = coding->destination + coding->produced;
4218   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4219   int safe_room = 4;
4220   int produced_chars = 0;
4221   Lisp_Object attrs, eol_type, charset_list, val;
4222   int ascii_compatible;
4223   struct charset *charset_roman, *charset_big5;
4224   int c;
4225
4226   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4227   val = charset_list;
4228   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4229   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4230   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4231
4232   while (charbuf < charbuf_end)
4233     {
4234       ASSURE_DESTINATION (safe_room);
4235       c = *charbuf++;
4236       /* Now encode the character C.  */
4237       if (ASCII_CHAR_P (c) && ascii_compatible)
4238         EMIT_ONE_ASCII_BYTE (c);
4239       else if (CHAR_BYTE8_P (c))
4240         {
4241           c = CHAR_TO_BYTE8 (c);
4242           EMIT_ONE_BYTE (c);
4243         }
4244       else
4245         {
4246           unsigned code;
4247           struct charset *charset = char_charset (c, charset_list, &code);
4248
4249           if (! charset)
4250             {
4251               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4252                 {
4253                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4254                   charset = CHARSET_FROM_ID (charset_ascii);
4255                 }
4256               else
4257                 {
4258                   c = coding->default_char;
4259                   charset = char_charset (c, charset_list, &code);
4260                 }
4261             }
4262           if (code == CHARSET_INVALID_CODE (charset))
4263             abort ();
4264           if (charset == charset_big5)
4265             {
4266               int c1, c2;
4267
4268               c1 = code >> 8, c2 = code & 0xFF;
4269               EMIT_TWO_BYTES (c1, c2);
4270             }
4271           else
4272             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4273         }
4274     }
4275   coding->result = CODING_RESULT_SUCCESS;
4276   coding->produced_char += produced_chars;
4277   coding->produced = dst - coding->destination;
4278   return 0;
4279 }
4280
4281 \f
4282 /*** 10. CCL handlers ***/
4283
4284 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4285    Check if a text is encoded in a coding system of which
4286    encoder/decoder are written in CCL program.  If it is, return
4287    CATEGORY_MASK_CCL, else return 0.  */
4288
4289 static int
4290 detect_coding_ccl (coding, detect_info)
4291      struct coding_system *coding;
4292      struct coding_detection_info *detect_info;
4293 {
4294   unsigned char *src = coding->source, *src_base = src;
4295   unsigned char *src_end = coding->source + coding->src_bytes;
4296   int multibytep = coding->src_multibyte;
4297   int consumed_chars = 0;
4298   int found = 0;
4299   unsigned char *valids = CODING_CCL_VALIDS (coding);
4300   int head_ascii = coding->head_ascii;
4301   Lisp_Object attrs;
4302
4303   detect_info->checked |= CATEGORY_MASK_CCL;
4304
4305   coding = &coding_categories[coding_category_ccl];
4306   attrs = CODING_ID_ATTRS (coding->id);
4307   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4308     src += head_ascii;
4309
4310   while (1)
4311     {
4312       int c;
4313       ONE_MORE_BYTE (c);
4314       if (! valids[c])
4315         break;
4316       if ((valids[c] > 1))
4317         found = CATEGORY_MASK_CCL;
4318     }
4319   detect_info->rejected |= CATEGORY_MASK_CCL;
4320   return 0;
4321
4322  no_more_source:
4323   detect_info->found |= found;
4324   return 1;
4325 }
4326
4327 static void
4328 decode_coding_ccl (coding)
4329      struct coding_system *coding;
4330 {
4331   const unsigned char *src = coding->source + coding->consumed;
4332   unsigned char *src_end = coding->source + coding->src_bytes;
4333   int *charbuf = coding->charbuf;
4334   int *charbuf_end = charbuf + coding->charbuf_size;
4335   int consumed_chars = 0;
4336   int multibytep = coding->src_multibyte;
4337   struct ccl_program ccl;
4338   int source_charbuf[1024];
4339   int source_byteidx[1024];
4340   Lisp_Object attrs, eol_type, charset_list;
4341
4342   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4343   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4344
4345   while (src < src_end)
4346     {
4347       const unsigned char *p = src;
4348       int *source, *source_end;
4349       int i = 0;
4350
4351       if (multibytep)
4352         while (i < 1024 && p < src_end)
4353           {
4354             source_byteidx[i] = p - src;
4355             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4356           }
4357       else
4358         while (i < 1024 && p < src_end)
4359           source_charbuf[i++] = *p++;
4360
4361       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4362         ccl.last_block = 1;
4363
4364       source = source_charbuf;
4365       source_end = source + i;
4366       while (source < source_end)
4367         {
4368           ccl_driver (&ccl, source, charbuf,
4369                       source_end - source, charbuf_end - charbuf,
4370                       charset_list);
4371           source += ccl.consumed;
4372           charbuf += ccl.produced;
4373           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4374             break;
4375         }
4376       if (source < source_end)
4377         src += source_byteidx[source - source_charbuf];
4378       else
4379         src = p;
4380       consumed_chars += source - source_charbuf;
4381
4382       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4383           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4384         break;
4385     }
4386
4387   switch (ccl.status)
4388     {
4389     case CCL_STAT_SUSPEND_BY_SRC:
4390       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4391       break;
4392     case CCL_STAT_SUSPEND_BY_DST:
4393       break;
4394     case CCL_STAT_QUIT:
4395     case CCL_STAT_INVALID_CMD:
4396       coding->result = CODING_RESULT_INTERRUPT;
4397       break;
4398     default:
4399       coding->result = CODING_RESULT_SUCCESS;
4400       break;
4401     }
4402   coding->consumed_char += consumed_chars;
4403   coding->consumed = src - coding->source;
4404   coding->charbuf_used = charbuf - coding->charbuf;
4405 }
4406
4407 static int
4408 encode_coding_ccl (coding)
4409      struct coding_system *coding;
4410 {
4411   struct ccl_program ccl;
4412   int multibytep = coding->dst_multibyte;
4413   int *charbuf = coding->charbuf;
4414   int *charbuf_end = charbuf + coding->charbuf_used;
4415   unsigned char *dst = coding->destination + coding->produced;
4416   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4417   unsigned char *adjusted_dst_end = dst_end - 1;
4418   int destination_charbuf[1024];
4419   int i, produced_chars = 0;
4420   Lisp_Object attrs, eol_type, charset_list;
4421
4422   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4423   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4424
4425   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4426   ccl.dst_multibyte = coding->dst_multibyte;
4427
4428   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4429     {
4430       int dst_bytes = dst_end - dst;
4431       if (dst_bytes > 1024)
4432         dst_bytes = 1024;
4433
4434       ccl_driver (&ccl, charbuf, destination_charbuf,
4435                   charbuf_end - charbuf, dst_bytes, charset_list);
4436       charbuf += ccl.consumed;
4437       if (multibytep)
4438         for (i = 0; i < ccl.produced; i++)
4439           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4440       else
4441         {
4442           for (i = 0; i < ccl.produced; i++)
4443             *dst++ = destination_charbuf[i] & 0xFF;
4444           produced_chars += ccl.produced;
4445         }
4446     }
4447
4448   switch (ccl.status)
4449     {
4450     case CCL_STAT_SUSPEND_BY_SRC:
4451       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4452       break;
4453     case CCL_STAT_SUSPEND_BY_DST:
4454       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4455       break;
4456     case CCL_STAT_QUIT:
4457     case CCL_STAT_INVALID_CMD:
4458       coding->result = CODING_RESULT_INTERRUPT;
4459       break;
4460     default:
4461       coding->result = CODING_RESULT_SUCCESS;
4462       break;
4463     }
4464
4465   coding->produced_char += produced_chars;
4466   coding->produced = dst - coding->destination;
4467   return 0;
4468 }
4469
4470
4471 \f
4472 /*** 10, 11. no-conversion handlers ***/
4473
4474 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4475
4476 static void
4477 decode_coding_raw_text (coding)
4478      struct coding_system *coding;
4479 {
4480   coding->chars_at_source = 1;
4481   coding->consumed_char = 0;
4482   coding->consumed = 0;
4483   coding->result = CODING_RESULT_SUCCESS;
4484 }
4485
4486 static int
4487 encode_coding_raw_text (coding)
4488      struct coding_system *coding;
4489 {
4490   int multibytep = coding->dst_multibyte;
4491   int *charbuf = coding->charbuf;
4492   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4493   unsigned char *dst = coding->destination + coding->produced;
4494   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4495   int produced_chars = 0;
4496   int c;
4497
4498   if (multibytep)
4499     {
4500       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4501
4502       if (coding->src_multibyte)
4503         while (charbuf < charbuf_end)
4504           {
4505             ASSURE_DESTINATION (safe_room);
4506             c = *charbuf++;
4507             if (ASCII_CHAR_P (c))
4508               EMIT_ONE_ASCII_BYTE (c);
4509             else if (CHAR_BYTE8_P (c))
4510               {
4511                 c = CHAR_TO_BYTE8 (c);
4512                 EMIT_ONE_BYTE (c);
4513               }
4514             else
4515               {
4516                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4517
4518                 CHAR_STRING_ADVANCE (c, p1);
4519                 while (p0 < p1)
4520                   {
4521                     EMIT_ONE_BYTE (*p0);
4522                     p0++;
4523                   }
4524               }
4525           }
4526       else
4527         while (charbuf < charbuf_end)
4528           {
4529             ASSURE_DESTINATION (safe_room);
4530             c = *charbuf++;
4531             EMIT_ONE_BYTE (c);
4532           }
4533     }
4534   else
4535     {
4536       if (coding->src_multibyte)
4537         {
4538           int safe_room = MAX_MULTIBYTE_LENGTH;
4539
4540           while (charbuf < charbuf_end)
4541             {
4542               ASSURE_DESTINATION (safe_room);
4543               c = *charbuf++;
4544               if (ASCII_CHAR_P (c))
4545                 *dst++ = c;
4546               else if (CHAR_BYTE8_P (c))
4547                 *dst++ = CHAR_TO_BYTE8 (c);
4548               else
4549                 CHAR_STRING_ADVANCE (c, dst);
4550               produced_chars++;
4551             }
4552         }
4553       else
4554         {
4555           ASSURE_DESTINATION (charbuf_end - charbuf);
4556           while (charbuf < charbuf_end && dst < dst_end)
4557             *dst++ = *charbuf++;
4558           produced_chars = dst - (coding->destination + coding->dst_bytes);
4559         }
4560     }
4561   coding->result = CODING_RESULT_SUCCESS;
4562   coding->produced_char += produced_chars;
4563   coding->produced = dst - coding->destination;
4564   return 0;
4565 }
4566
4567 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4568    Check if a text is encoded in a charset-based coding system.  If it
4569    is, return 1, else return 0.  */
4570
4571 static int
4572 detect_coding_charset (coding, detect_info)
4573      struct coding_system *coding;
4574      struct coding_detection_info *detect_info;
4575 {
4576   unsigned char *src = coding->source, *src_base = src;
4577   unsigned char *src_end = coding->source + coding->src_bytes;
4578   int multibytep = coding->src_multibyte;
4579   int consumed_chars = 0;
4580   Lisp_Object attrs, valids;
4581   int found = 0;
4582
4583   detect_info->checked |= CATEGORY_MASK_CHARSET;
4584
4585   coding = &coding_categories[coding_category_charset];
4586   attrs = CODING_ID_ATTRS (coding->id);
4587   valids = AREF (attrs, coding_attr_charset_valids);
4588
4589   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4590     src += coding->head_ascii;
4591
4592   while (1)
4593     {
4594       int c;
4595
4596       ONE_MORE_BYTE (c);
4597       if (NILP (AREF (valids, c)))
4598         break;
4599       if (c >= 0x80)
4600         found = CATEGORY_MASK_CHARSET;
4601     }
4602   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4603   return 0;
4604
4605  no_more_source:
4606   detect_info->found |= found;
4607   return 1;
4608 }
4609
4610 static void
4611 decode_coding_charset (coding)
4612      struct coding_system *coding;
4613 {
4614   unsigned char *src = coding->source + coding->consumed;
4615   unsigned char *src_end = coding->source + coding->src_bytes;
4616   unsigned char *src_base;
4617   int *charbuf = coding->charbuf;
4618   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4619   int consumed_chars = 0, consumed_chars_base;
4620   int multibytep = coding->src_multibyte;
4621   Lisp_Object attrs, eol_type, charset_list, valids;
4622   int char_offset = coding->produced_char;
4623   int last_offset = char_offset;
4624   int last_id = charset_ascii;
4625
4626   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4627   valids = AREF (attrs, coding_attr_charset_valids);
4628
4629   while (1)
4630     {
4631       int c;
4632
4633       src_base = src;
4634       consumed_chars_base = consumed_chars;
4635
4636       if (charbuf >= charbuf_end)
4637         break;
4638
4639       ONE_MORE_BYTE (c);
4640       if (c == '\r')
4641         {
4642           /* Here we assume that no charset maps '\r' to something
4643              else.  */
4644           if (EQ (eol_type, Qdos))
4645             {
4646               if (src == src_end)
4647                 {
4648                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4649                   goto no_more_source;
4650                 }
4651               if (*src == '\n')
4652                 ONE_MORE_BYTE (c);
4653             }
4654           else if (EQ (eol_type, Qmac))
4655             c = '\n';
4656         }
4657       else
4658         {
4659           Lisp_Object val;
4660           struct charset *charset;
4661           int dim;
4662           int len = 1;
4663           unsigned code = c;
4664
4665           val = AREF (valids, c);
4666           if (NILP (val))
4667             goto invalid_code;
4668           if (INTEGERP (val))
4669             {
4670               charset = CHARSET_FROM_ID (XFASTINT (val));
4671               dim = CHARSET_DIMENSION (charset);
4672               while (len < dim)
4673                 {
4674                   ONE_MORE_BYTE (c);
4675                   code = (code << 8) | c;
4676                   len++;
4677                 }
4678               CODING_DECODE_CHAR (coding, src, src_base, src_end,
4679                                   charset, code, c);
4680             }
4681           else
4682             {
4683               /* VAL is a list of charset IDs.  It is assured that the
4684                  list is sorted by charset dimensions (smaller one
4685                  comes first).  */
4686               while (CONSP (val))
4687                 {
4688                   charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4689                   dim = CHARSET_DIMENSION (charset);
4690                   while (len < dim)
4691                     {
4692                       ONE_MORE_BYTE (c);
4693                       code = (code << 8) | c;
4694                       len++;
4695                     }
4696                   CODING_DECODE_CHAR (coding, src, src_base,
4697                                       src_end, charset, code, c);
4698                   if (c >= 0)
4699                     break;
4700                   val = XCDR (val);
4701                 }
4702             }
4703           if (c < 0)
4704             goto invalid_code;
4705           if (charset->id != charset_ascii
4706               && last_id != charset->id)
4707             {
4708               if (last_id != charset_ascii)
4709                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4710               last_id = charset->id;
4711               last_offset = char_offset;
4712             }
4713         }
4714       *charbuf++ = c;
4715       char_offset++;
4716       continue;
4717
4718     invalid_code:
4719       src = src_base;
4720       consumed_chars = consumed_chars_base;
4721       ONE_MORE_BYTE (c);
4722       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4723       char_offset++;
4724       coding->errors++;
4725     }
4726
4727  no_more_source:
4728   if (last_id != charset_ascii)
4729     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4730   coding->consumed_char += consumed_chars_base;
4731   coding->consumed = src_base - coding->source;
4732   coding->charbuf_used = charbuf - coding->charbuf;
4733 }
4734
4735 static int
4736 encode_coding_charset (coding)
4737      struct coding_system *coding;
4738 {
4739   int multibytep = coding->dst_multibyte;
4740   int *charbuf = coding->charbuf;
4741   int *charbuf_end = charbuf + coding->charbuf_used;
4742   unsigned char *dst = coding->destination + coding->produced;
4743   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4744   int safe_room = MAX_MULTIBYTE_LENGTH;
4745   int produced_chars = 0;
4746   Lisp_Object attrs, eol_type, charset_list;
4747   int ascii_compatible;
4748   int c;
4749
4750   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4751   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4752
4753   while (charbuf < charbuf_end)
4754     {
4755       struct charset *charset;
4756       unsigned code;
4757
4758       ASSURE_DESTINATION (safe_room);
4759       c = *charbuf++;
4760       if (ascii_compatible && ASCII_CHAR_P (c))
4761         EMIT_ONE_ASCII_BYTE (c);
4762       else if (CHAR_BYTE8_P (c))
4763         {
4764           c = CHAR_TO_BYTE8 (c);
4765           EMIT_ONE_BYTE (c);
4766         }
4767       else
4768         {
4769           charset = char_charset (c, charset_list, &code);
4770           if (charset)
4771             {
4772               if (CHARSET_DIMENSION (charset) == 1)
4773                 EMIT_ONE_BYTE (code);
4774               else if (CHARSET_DIMENSION (charset) == 2)
4775                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4776               else if (CHARSET_DIMENSION (charset) == 3)
4777                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4778               else
4779                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4780                                  (code >> 8) & 0xFF, code & 0xFF);
4781             }
4782           else
4783             {
4784               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4785                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4786               else
4787                 c = coding->default_char;
4788               EMIT_ONE_BYTE (c);
4789             }
4790         }
4791     }
4792
4793   coding->result = CODING_RESULT_SUCCESS;
4794   coding->produced_char += produced_chars;
4795   coding->produced = dst - coding->destination;
4796   return 0;
4797 }
4798
4799 \f
4800 /*** 7. C library functions ***/
4801
4802 /* Setup coding context CODING from information about CODING_SYSTEM.
4803    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4804    CODING_SYSTEM is invalid, signal an error.  */
4805
4806 void
4807 setup_coding_system (coding_system, coding)
4808      Lisp_Object coding_system;
4809      struct coding_system *coding;
4810 {
4811   Lisp_Object attrs;
4812   Lisp_Object eol_type;
4813   Lisp_Object coding_type;
4814   Lisp_Object val;
4815
4816   if (NILP (coding_system))
4817     coding_system = Qno_conversion;
4818
4819   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4820
4821   attrs = CODING_ID_ATTRS (coding->id);
4822   eol_type = CODING_ID_EOL_TYPE (coding->id);
4823
4824   coding->mode = 0;
4825   coding->head_ascii = -1;
4826   coding->common_flags
4827     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4828   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4829     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4830   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4831     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4832
4833   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4834   coding->max_charset_id = XSTRING (val)->size - 1;
4835   coding->safe_charsets = (char *) XSTRING (val)->data;
4836   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4837
4838   coding_type = CODING_ATTR_TYPE (attrs);
4839   if (EQ (coding_type, Qundecided))
4840     {
4841       coding->detector = NULL;
4842       coding->decoder = decode_coding_raw_text;
4843       coding->encoder = encode_coding_raw_text;
4844       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4845     }
4846   else if (EQ (coding_type, Qiso_2022))
4847     {
4848       int i;
4849       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4850
4851       /* Invoke graphic register 0 to plane 0.  */
4852       CODING_ISO_INVOCATION (coding, 0) = 0;
4853       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4854       CODING_ISO_INVOCATION (coding, 1)
4855         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4856       /* Setup the initial status of designation.  */
4857       for (i = 0; i < 4; i++)
4858         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4859       /* Not single shifting initially.  */
4860       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4861       /* Beginning of buffer should also be regarded as bol. */
4862       CODING_ISO_BOL (coding) = 1;
4863       coding->detector = detect_coding_iso_2022;
4864       coding->decoder = decode_coding_iso_2022;
4865       coding->encoder = encode_coding_iso_2022;
4866       if (flags & CODING_ISO_FLAG_SAFE)
4867         coding->mode |= CODING_MODE_SAFE_ENCODING;
4868       coding->common_flags
4869         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4870             | CODING_REQUIRE_FLUSHING_MASK);
4871       if (flags & CODING_ISO_FLAG_COMPOSITION)
4872         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4873       if (flags & CODING_ISO_FLAG_DESIGNATION)
4874         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4875       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4876         {
4877           setup_iso_safe_charsets (attrs);
4878           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4879           coding->max_charset_id = XSTRING (val)->size - 1;
4880           coding->safe_charsets = (char *) XSTRING (val)->data;
4881         }
4882       CODING_ISO_FLAGS (coding) = flags;
4883     }
4884   else if (EQ (coding_type, Qcharset))
4885     {
4886       coding->detector = detect_coding_charset;
4887       coding->decoder = decode_coding_charset;
4888       coding->encoder = encode_coding_charset;
4889       coding->common_flags
4890         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4891     }
4892   else if (EQ (coding_type, Qutf_8))
4893     {
4894       coding->detector = detect_coding_utf_8;
4895       coding->decoder = decode_coding_utf_8;
4896       coding->encoder = encode_coding_utf_8;
4897       coding->common_flags
4898         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4899     }
4900   else if (EQ (coding_type, Qutf_16))
4901     {
4902       val = AREF (attrs, coding_attr_utf_16_bom);
4903       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4904                                     : EQ (val, Qt) ? utf_16_with_bom
4905                                     : utf_16_without_bom);
4906       val = AREF (attrs, coding_attr_utf_16_endian);
4907       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4908                                        : utf_16_little_endian);
4909       CODING_UTF_16_SURROGATE (coding) = 0;
4910       coding->detector = detect_coding_utf_16;
4911       coding->decoder = decode_coding_utf_16;
4912       coding->encoder = encode_coding_utf_16;
4913       coding->common_flags
4914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4915       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4916         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4917     }
4918   else if (EQ (coding_type, Qccl))
4919     {
4920       coding->detector = detect_coding_ccl;
4921       coding->decoder = decode_coding_ccl;
4922       coding->encoder = encode_coding_ccl;
4923       coding->common_flags
4924         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4925             | CODING_REQUIRE_FLUSHING_MASK);
4926     }
4927   else if (EQ (coding_type, Qemacs_mule))
4928     {
4929       coding->detector = detect_coding_emacs_mule;
4930       coding->decoder = decode_coding_emacs_mule;
4931       coding->encoder = encode_coding_emacs_mule;
4932       coding->common_flags
4933         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4934       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4935           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4936         {
4937           Lisp_Object tail, safe_charsets;
4938           int max_charset_id = 0;
4939
4940           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4941                tail = XCDR (tail))
4942             if (max_charset_id < XFASTINT (XCAR (tail)))
4943               max_charset_id = XFASTINT (XCAR (tail));
4944           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4945                                         make_number (255));
4946           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4947                tail = XCDR (tail))
4948             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4949           coding->max_charset_id = max_charset_id;
4950           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4951         }
4952     }
4953   else if (EQ (coding_type, Qshift_jis))
4954     {
4955       coding->detector = detect_coding_sjis;
4956       coding->decoder = decode_coding_sjis;
4957       coding->encoder = encode_coding_sjis;
4958       coding->common_flags
4959         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4960     }
4961   else if (EQ (coding_type, Qbig5))
4962     {
4963       coding->detector = detect_coding_big5;
4964       coding->decoder = decode_coding_big5;
4965       coding->encoder = encode_coding_big5;
4966       coding->common_flags
4967         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4968     }
4969   else                          /* EQ (coding_type, Qraw_text) */
4970     {
4971       coding->detector = NULL;
4972       coding->decoder = decode_coding_raw_text;
4973       coding->encoder = encode_coding_raw_text;
4974       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4975     }
4976
4977   return;
4978 }
4979
4980 /* Return raw-text or one of its subsidiaries that has the same
4981    eol_type as CODING-SYSTEM.  */
4982
4983 Lisp_Object
4984 raw_text_coding_system (coding_system)
4985      Lisp_Object coding_system;
4986 {
4987   Lisp_Object spec, attrs;
4988   Lisp_Object eol_type, raw_text_eol_type;
4989
4990   if (NILP (coding_system))
4991     return Qraw_text;
4992   spec = CODING_SYSTEM_SPEC (coding_system);
4993   attrs = AREF (spec, 0);
4994
4995   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4996     return coding_system;
4997
4998   eol_type = AREF (spec, 2);
4999   if (VECTORP (eol_type))
5000     return Qraw_text;
5001   spec = CODING_SYSTEM_SPEC (Qraw_text);
5002   raw_text_eol_type = AREF (spec, 2);
5003   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5004           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5005           : AREF (raw_text_eol_type, 2));
5006 }
5007
5008
5009 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5010    does, return one of the subsidiary that has the same eol-spec as
5011    PARENT.  Otherwise, return CODING_SYSTEM.  */
5012
5013 Lisp_Object
5014 coding_inherit_eol_type (coding_system, parent)
5015      Lisp_Object coding_system, parent;
5016 {
5017   Lisp_Object spec, attrs, eol_type;
5018
5019   if (NILP (coding_system))
5020     coding_system = Qraw_text;
5021   spec = CODING_SYSTEM_SPEC (coding_system);
5022   attrs = AREF (spec, 0);
5023   eol_type = AREF (spec, 2);
5024   if (VECTORP (eol_type)
5025       && ! NILP (parent))
5026     {
5027       Lisp_Object parent_spec;
5028       Lisp_Object parent_eol_type;
5029
5030       parent_spec
5031         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5032       parent_eol_type = AREF (parent_spec, 2);
5033       if (EQ (parent_eol_type, Qunix))
5034         coding_system = AREF (eol_type, 0);
5035       else if (EQ (parent_eol_type, Qdos))
5036         coding_system = AREF (eol_type, 1);
5037       else if (EQ (parent_eol_type, Qmac))
5038         coding_system = AREF (eol_type, 2);
5039     }
5040   return coding_system;
5041 }
5042
5043 /* Emacs has a mechanism to automatically detect a coding system if it
5044    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5045    it's impossible to distinguish some coding systems accurately
5046    because they use the same range of codes.  So, at first, coding
5047    systems are categorized into 7, those are:
5048
5049    o coding-category-emacs-mule
5050
5051         The category for a coding system which has the same code range
5052         as Emacs' internal format.  Assigned the coding-system (Lisp
5053         symbol) `emacs-mule' by default.
5054
5055    o coding-category-sjis
5056
5057         The category for a coding system which has the same code range
5058         as SJIS.  Assigned the coding-system (Lisp
5059         symbol) `japanese-shift-jis' by default.
5060
5061    o coding-category-iso-7
5062
5063         The category for a coding system which has the same code range
5064         as ISO2022 of 7-bit environment.  This doesn't use any locking
5065         shift and single shift functions.  This can encode/decode all
5066         charsets.  Assigned the coding-system (Lisp symbol)
5067         `iso-2022-7bit' by default.
5068
5069    o coding-category-iso-7-tight
5070
5071         Same as coding-category-iso-7 except that this can
5072         encode/decode only the specified charsets.
5073
5074    o coding-category-iso-8-1
5075
5076         The category for a coding system which has the same code range
5077         as ISO2022 of 8-bit environment and graphic plane 1 used only
5078         for DIMENSION1 charset.  This doesn't use any locking shift
5079         and single shift functions.  Assigned the coding-system (Lisp
5080         symbol) `iso-latin-1' by default.
5081
5082    o coding-category-iso-8-2
5083
5084         The category for a coding system which has the same code range
5085         as ISO2022 of 8-bit environment and graphic plane 1 used only
5086         for DIMENSION2 charset.  This doesn't use any locking shift
5087         and single shift functions.  Assigned the coding-system (Lisp
5088         symbol) `japanese-iso-8bit' by default.
5089
5090    o coding-category-iso-7-else
5091
5092         The category for a coding system which has the same code range
5093         as ISO2022 of 7-bit environemnt but uses locking shift or
5094         single shift functions.  Assigned the coding-system (Lisp
5095         symbol) `iso-2022-7bit-lock' by default.
5096
5097    o coding-category-iso-8-else
5098
5099         The category for a coding system which has the same code range
5100         as ISO2022 of 8-bit environemnt but uses locking shift or
5101         single shift functions.  Assigned the coding-system (Lisp
5102         symbol) `iso-2022-8bit-ss2' by default.
5103
5104    o coding-category-big5
5105
5106         The category for a coding system which has the same code range
5107         as BIG5.  Assigned the coding-system (Lisp symbol)
5108         `cn-big5' by default.
5109
5110    o coding-category-utf-8
5111
5112         The category for a coding system which has the same code range
5113         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5114         symbol) `utf-8' by default.
5115
5116    o coding-category-utf-16-be
5117
5118         The category for a coding system in which a text has an
5119         Unicode signature (cf. Unicode Standard) in the order of BIG
5120         endian at the head.  Assigned the coding-system (Lisp symbol)
5121         `utf-16-be' by default.
5122
5123    o coding-category-utf-16-le
5124
5125         The category for a coding system in which a text has an
5126         Unicode signature (cf. Unicode Standard) in the order of
5127         LITTLE endian at the head.  Assigned the coding-system (Lisp
5128         symbol) `utf-16-le' by default.
5129
5130    o coding-category-ccl
5131
5132         The category for a coding system of which encoder/decoder is
5133         written in CCL programs.  The default value is nil, i.e., no
5134         coding system is assigned.
5135
5136    o coding-category-binary
5137
5138         The category for a coding system not categorized in any of the
5139         above.  Assigned the coding-system (Lisp symbol)
5140         `no-conversion' by default.
5141
5142    Each of them is a Lisp symbol and the value is an actual
5143    `coding-system's (this is also a Lisp symbol) assigned by a user.
5144    What Emacs does actually is to detect a category of coding system.
5145    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5146    decide only one possible category, it selects a category of the
5147    highest priority.  Priorities of categories are also specified by a
5148    user in a Lisp variable `coding-category-list'.
5149
5150 */
5151
5152 #define EOL_SEEN_NONE   0
5153 #define EOL_SEEN_LF     1
5154 #define EOL_SEEN_CR     2
5155 #define EOL_SEEN_CRLF   4
5156
5157 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5158    SOURCE is encoded.  If CATEGORY is one of
5159    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5160    two-byte, else they are encoded by one-byte.
5161
5162    Return one of EOL_SEEN_XXX.  */
5163
5164 #define MAX_EOL_CHECK_COUNT 3
5165
5166 static int
5167 detect_eol (source, src_bytes, category)
5168      unsigned char *source;
5169      EMACS_INT src_bytes;
5170      enum coding_category category;
5171 {
5172   unsigned char *src = source, *src_end = src + src_bytes;
5173   unsigned char c;
5174   int total  = 0;
5175   int eol_seen = EOL_SEEN_NONE;
5176
5177   if ((1 << category) & CATEGORY_MASK_UTF_16)
5178     {
5179       int msb, lsb;
5180
5181       msb = category == (coding_category_utf_16_le
5182                          | coding_category_utf_16_le_nosig);
5183       lsb = 1 - msb;
5184
5185       while (src + 1 < src_end)
5186         {
5187           c = src[lsb];
5188           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5189             {
5190               int this_eol;
5191
5192               if (c == '\n')
5193                 this_eol = EOL_SEEN_LF;
5194               else if (src + 3 >= src_end
5195                        || src[msb + 2] != 0
5196                        || src[lsb + 2] != '\n')
5197                 this_eol = EOL_SEEN_CR;
5198               else
5199                 this_eol = EOL_SEEN_CRLF;
5200
5201               if (eol_seen == EOL_SEEN_NONE)
5202                 /* This is the first end-of-line.  */
5203                 eol_seen = this_eol;
5204               else if (eol_seen != this_eol)
5205                 {
5206                   /* The found type is different from what found before.  */
5207                   eol_seen = EOL_SEEN_LF;
5208                   break;
5209                 }
5210               if (++total == MAX_EOL_CHECK_COUNT)
5211                 break;
5212             }
5213           src += 2;
5214         }
5215     }
5216   else
5217     {
5218       while (src < src_end)
5219         {
5220           c = *src++;
5221           if (c == '\n' || c == '\r')
5222             {
5223               int this_eol;
5224
5225               if (c == '\n')
5226                 this_eol = EOL_SEEN_LF;
5227               else if (src >= src_end || *src != '\n')
5228                 this_eol = EOL_SEEN_CR;
5229               else
5230                 this_eol = EOL_SEEN_CRLF, src++;
5231
5232               if (eol_seen == EOL_SEEN_NONE)
5233                 /* This is the first end-of-line.  */
5234                 eol_seen = this_eol;
5235               else if (eol_seen != this_eol)
5236                 {
5237                   /* The found type is different from what found before.  */
5238                   eol_seen = EOL_SEEN_LF;
5239                   break;
5240                 }
5241               if (++total == MAX_EOL_CHECK_COUNT)
5242                 break;
5243             }
5244         }
5245     }
5246   return eol_seen;
5247 }
5248
5249
5250 static void
5251 adjust_coding_eol_type (coding, eol_seen)
5252      struct coding_system *coding;
5253      int eol_seen;
5254 {
5255   Lisp_Object eol_type;
5256
5257   eol_type = CODING_ID_EOL_TYPE (coding->id);
5258   if (eol_seen & EOL_SEEN_LF)
5259     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5260   else if (eol_seen & EOL_SEEN_CRLF)
5261     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5262   else if (eol_seen & EOL_SEEN_CR)
5263     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5264 }
5265
5266 /* Detect how a text specified in CODING is encoded.  If a coding
5267    system is detected, update fields of CODING by the detected coding
5268    system.  */
5269
5270 void
5271 detect_coding (coding)
5272      struct coding_system *coding;
5273 {
5274   unsigned char *src, *src_end;
5275   Lisp_Object attrs, coding_type;
5276
5277   coding->consumed = coding->consumed_char = 0;
5278   coding->produced = coding->produced_char = 0;
5279   coding_set_source (coding);
5280
5281   src_end = coding->source + coding->src_bytes;
5282
5283   /* If we have not yet decided the text encoding type, detect it
5284      now.  */
5285   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5286     {
5287       int c, i;
5288
5289       for (src = coding->source; src < src_end; src++)
5290         {
5291           c = *src;
5292           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5293                                         || c == ISO_CODE_SI
5294                                         || c == ISO_CODE_SO)))
5295             break;
5296         }
5297       coding->head_ascii = src - (coding->source + coding->consumed);
5298
5299       if (coding->head_ascii < coding->src_bytes)
5300         {
5301           struct coding_detection_info detect_info;
5302           enum coding_category category;
5303           struct coding_system *this;
5304
5305           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5306           for (i = 0; i < coding_category_raw_text; i++)
5307             {
5308               category = coding_priorities[i];
5309               this = coding_categories + category;
5310               if (this->id < 0)
5311                 {
5312                   /* No coding system of this category is defined.  */
5313                   detect_info.rejected |= (1 << category);
5314                 }
5315               else if (category >= coding_category_raw_text)
5316                 continue;
5317               else if (detect_info.checked & (1 << category))
5318                 {
5319                   if (detect_info.found & (1 << category))
5320                     break;
5321                 }
5322               else if ((*(this->detector)) (coding, &detect_info)
5323                        && detect_info.found & (1 << category))
5324                 break;
5325             }
5326           if (i < coding_category_raw_text)
5327             setup_coding_system (CODING_ID_NAME (this->id), coding);
5328           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5329             setup_coding_system (Qraw_text, coding);
5330           else if (detect_info.rejected)
5331             for (i = 0; i < coding_category_raw_text; i++)
5332               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5333                 {
5334                   this = coding_categories + coding_priorities[i];
5335                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5336                   break;
5337                 }
5338         }
5339     }
5340   else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5341     {
5342       Lisp_Object coding_systems;
5343       struct coding_detection_info detect_info;
5344
5345       coding_systems
5346         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5347       detect_info.found = detect_info.rejected = 0;
5348       if (CONSP (coding_systems)
5349           && detect_coding_utf_16 (coding, &detect_info)
5350           && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5351                                    | CATEGORY_MASK_UTF_16_BE)))
5352         {
5353           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5354             setup_coding_system (XCAR (coding_systems), coding);
5355           else
5356             setup_coding_system (XCDR (coding_systems), coding);
5357         }
5358     }
5359
5360   attrs = CODING_ID_ATTRS (coding->id);
5361   coding_type = CODING_ATTR_TYPE (attrs);
5362
5363   /* If we have not yet decided the EOL type, detect it now.  But, the
5364      detection is impossible for a CCL based coding system, in which
5365      case, we detct the EOL type after decoding.  */
5366   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5367       && ! EQ (coding_type, Qccl))
5368     {
5369       int eol_seen = detect_eol (coding->source, coding->src_bytes,
5370                                  XINT (CODING_ATTR_CATEGORY (attrs)));
5371
5372       if (eol_seen != EOL_SEEN_NONE)
5373         adjust_coding_eol_type (coding, eol_seen);
5374     }
5375 }
5376
5377
5378 static void
5379 decode_eol (coding)
5380      struct coding_system *coding;
5381 {
5382   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5383     {
5384       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5385       unsigned char *pend = p + coding->produced;
5386       int eol_seen = EOL_SEEN_NONE;
5387
5388       for (; p < pend; p++)
5389         {
5390           if (*p == '\n')
5391             eol_seen |= EOL_SEEN_LF;
5392           else if (*p == '\r')
5393             {
5394               if (p + 1 < pend && *(p + 1) == '\n')
5395                 {
5396                   eol_seen |= EOL_SEEN_CRLF;
5397                   p++;
5398                 }
5399               else
5400                 eol_seen |= EOL_SEEN_CR;
5401             }
5402         }
5403       if (eol_seen != EOL_SEEN_NONE)
5404         adjust_coding_eol_type (coding, eol_seen);
5405     }
5406
5407   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5408     {
5409       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5410       unsigned char *pend = p + coding->produced;
5411
5412       for (; p < pend; p++)
5413         if (*p == '\r')
5414           *p = '\n';
5415     }
5416   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5417     {
5418       unsigned char *p, *pbeg, *pend;
5419       Lisp_Object undo_list;
5420
5421       move_gap_both (coding->dst_pos + coding->produced_char,
5422                      coding->dst_pos_byte + coding->produced);
5423       undo_list = current_buffer->undo_list;
5424       current_buffer->undo_list = Qt;
5425       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5426       current_buffer->undo_list = undo_list;
5427       pbeg = GPT_ADDR;
5428       pend = pbeg + coding->produced;
5429
5430       for (p = pend - 1; p >= pbeg; p--)
5431         if (*p == '\r')
5432           {
5433             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5434             pend--;
5435           }
5436       coding->produced_char -= coding->produced - (pend - pbeg);
5437       coding->produced = pend - pbeg;
5438       insert_from_gap (coding->produced_char, coding->produced);
5439     }
5440 }
5441
5442 static void
5443 translate_chars (coding, table)
5444      struct coding_system *coding;
5445      Lisp_Object table;
5446 {
5447   int *charbuf = coding->charbuf;
5448   int *charbuf_end = charbuf + coding->charbuf_used;
5449   int c;
5450
5451   if (coding->chars_at_source)
5452     return;
5453
5454   while (charbuf < charbuf_end)
5455     {
5456       c = *charbuf;
5457       if (c < 0)
5458         charbuf += c;
5459       else
5460         *charbuf++ = translate_char (table, c);
5461     }
5462 }
5463
5464 static int
5465 produce_chars (coding)
5466      struct coding_system *coding;
5467 {
5468   unsigned char *dst = coding->destination + coding->produced;
5469   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5470   int produced;
5471   int produced_chars = 0;
5472
5473   if (! coding->chars_at_source)
5474     {
5475       /* Characters are in coding->charbuf.  */
5476       int *buf = coding->charbuf;
5477       int *buf_end = buf + coding->charbuf_used;
5478       unsigned char *adjusted_dst_end;
5479
5480       if (BUFFERP (coding->src_object)
5481           && EQ (coding->src_object, coding->dst_object))
5482         dst_end = coding->source + coding->consumed;
5483       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5484
5485       while (buf < buf_end)
5486         {
5487           int c = *buf++;
5488
5489           if (dst >= adjusted_dst_end)
5490             {
5491               dst = alloc_destination (coding,
5492                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5493                                        dst);
5494               dst_end = coding->destination + coding->dst_bytes;
5495               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5496             }
5497           if (c >= 0)
5498             {
5499               if (coding->dst_multibyte
5500                   || ! CHAR_BYTE8_P (c))
5501                 CHAR_STRING_ADVANCE (c, dst);
5502               else
5503                 *dst++ = CHAR_TO_BYTE8 (c);
5504               produced_chars++;
5505             }
5506           else
5507             /* This is an annotation datum.  (-C) is the length of
5508                it.  */
5509             buf += -c - 1;
5510         }
5511     }
5512   else
5513     {
5514       unsigned char *src = coding->source;
5515       unsigned char *src_end = src + coding->src_bytes;
5516       Lisp_Object eol_type;
5517
5518       eol_type = CODING_ID_EOL_TYPE (coding->id);
5519
5520       if (coding->src_multibyte != coding->dst_multibyte)
5521         {
5522           if (coding->src_multibyte)
5523             {
5524               int multibytep = 1;
5525               int consumed_chars;
5526
5527               while (1)
5528                 {
5529                   unsigned char *src_base = src;
5530                   int c;
5531
5532                   ONE_MORE_BYTE (c);
5533                   if (c == '\r')
5534                     {
5535                       if (EQ (eol_type, Qdos))
5536                         {
5537                           if (src == src_end)
5538                             {
5539                               coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5540                               goto no_more_source;
5541                             }
5542                           if (*src == '\n')
5543                             c = *src++;
5544                         }
5545                       else if (EQ (eol_type, Qmac))
5546                         c = '\n';
5547                     }
5548                   if (dst == dst_end)
5549                     {
5550                       coding->consumed = src - coding->source;
5551
5552                     if (EQ (coding->src_object, coding->dst_object))
5553                       dst_end = src;
5554                     if (dst == dst_end)
5555                       {
5556                         dst = alloc_destination (coding, src_end - src + 1,
5557                                                  dst);
5558                         dst_end = coding->destination + coding->dst_bytes;
5559                         coding_set_source (coding);
5560                         src = coding->source + coding->consumed;
5561                         src_end = coding->source + coding->src_bytes;
5562                       }
5563                     }
5564                   *dst++ = c;
5565                   produced_chars++;
5566                 }
5567             no_more_source:
5568               ;
5569             }
5570           else
5571             while (src < src_end)
5572               {
5573                 int multibytep = 1;
5574                 int c = *src++;
5575
5576                 if (c == '\r')
5577                   {
5578                     if (EQ (eol_type, Qdos))
5579                       {
5580                         if (src < src_end
5581                             && *src == '\n')
5582                           c = *src++;
5583                       }
5584                     else if (EQ (eol_type, Qmac))
5585                       c = '\n';
5586                   }
5587                 if (dst >= dst_end - 1)
5588                   {
5589                     coding->consumed = src - coding->source;
5590
5591                     if (EQ (coding->src_object, coding->dst_object))
5592                       dst_end = src;
5593                     if (dst >= dst_end - 1)
5594                       {
5595                         dst = alloc_destination (coding, src_end - src + 2,
5596                                                  dst);
5597                         dst_end = coding->destination + coding->dst_bytes;
5598                         coding_set_source (coding);
5599                         src = coding->source + coding->consumed;
5600                         src_end = coding->source + coding->src_bytes;
5601                       }
5602                   }
5603                 EMIT_ONE_BYTE (c);
5604               }
5605         }
5606       else
5607         {
5608           if (!EQ (coding->src_object, coding->dst_object))
5609             {
5610               int require = coding->src_bytes - coding->dst_bytes;
5611
5612               if (require > 0)
5613                 {
5614                   EMACS_INT offset = src - coding->source;
5615
5616                   dst = alloc_destination (coding, require, dst);
5617                   coding_set_source (coding);
5618                   src = coding->source + offset;
5619                   src_end = coding->source + coding->src_bytes;
5620                 }
5621             }
5622           produced_chars = coding->src_chars;
5623           while (src < src_end)
5624             {
5625               int c = *src++;
5626
5627               if (c == '\r')
5628                 {
5629                   if (EQ (eol_type, Qdos))
5630                     {
5631                       if (src < src_end
5632                           && *src == '\n')
5633                         c = *src++;
5634                       produced_chars--;
5635                     }
5636                   else if (EQ (eol_type, Qmac))
5637                     c = '\n';
5638                 }
5639               *dst++ = c;
5640             }
5641         }
5642       coding->consumed = coding->src_bytes;
5643       coding->consumed_char = coding->src_chars;
5644     }
5645
5646   produced = dst - (coding->destination + coding->produced);
5647   if (BUFFERP (coding->dst_object))
5648     insert_from_gap (produced_chars, produced);
5649   coding->produced += produced;
5650   coding->produced_char += produced_chars;
5651   return produced_chars;
5652 }
5653
5654 /* Compose text in CODING->object according to the annotation data at
5655    CHARBUF.  CHARBUF is an array:
5656      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5657  */
5658
5659 static INLINE void
5660 produce_composition (coding, charbuf)
5661      struct coding_system *coding;
5662      int *charbuf;
5663 {
5664   int len;
5665   EMACS_INT from, to;
5666   enum composition_method method;
5667   Lisp_Object components;
5668
5669   len = -charbuf[0];
5670   from = coding->dst_pos + charbuf[2];
5671   to = coding->dst_pos + charbuf[3];
5672   method = (enum composition_method) (charbuf[4]);
5673
5674   if (method == COMPOSITION_RELATIVE)
5675     components = Qnil;
5676   else
5677     {
5678       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5679       int i;
5680
5681       len -= 5;
5682       charbuf += 5;
5683       for (i = 0; i < len; i++)
5684         args[i] = make_number (charbuf[i]);
5685       components = (method == COMPOSITION_WITH_ALTCHARS
5686                     ? Fstring (len, args) : Fvector (len, args));
5687     }
5688   compose_text (from, to, components, Qnil, coding->dst_object);
5689 }
5690
5691
5692 /* Put `charset' property on text in CODING->object according to
5693    the annotation data at CHARBUF.  CHARBUF is an array:
5694      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5695  */
5696
5697 static INLINE void
5698 produce_charset (coding, charbuf)
5699      struct coding_system *coding;
5700      int *charbuf;
5701 {
5702   EMACS_INT from = coding->dst_pos + charbuf[2];
5703   EMACS_INT to = coding->dst_pos + charbuf[3];
5704   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5705
5706   Fput_text_property (make_number (from), make_number (to),
5707                       Qcharset, CHARSET_NAME (charset),
5708                       coding->dst_object);
5709 }
5710
5711
5712 #define CHARBUF_SIZE 0x4000
5713
5714 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5715   do {                                                                  \
5716     int size = CHARBUF_SIZE;;                                           \
5717                                                                         \
5718     coding->charbuf = NULL;                                             \
5719     while (size > 1024)                                                 \
5720       {                                                                 \
5721         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5722         if (coding->charbuf)                                            \
5723           break;                                                        \
5724         size >>= 1;                                                     \
5725       }                                                                 \
5726     if (! coding->charbuf)                                              \
5727       {                                                                 \
5728         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5729         return coding->result;                                          \
5730       }                                                                 \
5731     coding->charbuf_size = size;                                        \
5732   } while (0)
5733
5734
5735 static void
5736 produce_annotation (coding)
5737      struct coding_system *coding;
5738 {
5739   int *charbuf = coding->charbuf;
5740   int *charbuf_end = charbuf + coding->charbuf_used;
5741
5742   if (NILP (coding->dst_object))
5743     return;
5744
5745   while (charbuf < charbuf_end)
5746     {
5747       if (*charbuf >= 0)
5748         charbuf++;
5749       else
5750         {
5751           int len = -*charbuf;
5752           switch (charbuf[1])
5753             {
5754             case CODING_ANNOTATE_COMPOSITION_MASK:
5755               produce_composition (coding, charbuf);
5756               break;
5757             case CODING_ANNOTATE_CHARSET_MASK:
5758               produce_charset (coding, charbuf);
5759               break;
5760             default:
5761               abort ();
5762             }
5763           charbuf += len;
5764         }
5765     }
5766 }
5767
5768 /* Decode the data at CODING->src_object into CODING->dst_object.
5769    CODING->src_object is a buffer, a string, or nil.
5770    CODING->dst_object is a buffer.
5771
5772    If CODING->src_object is a buffer, it must be the current buffer.
5773    In this case, if CODING->src_pos is positive, it is a position of
5774    the source text in the buffer, otherwise, the source text is in the
5775    gap area of the buffer, and CODING->src_pos specifies the offset of
5776    the text from GPT (which must be the same as PT).  If this is the
5777    same buffer as CODING->dst_object, CODING->src_pos must be
5778    negative.
5779
5780    If CODING->src_object is a string, CODING->src_pos in an index to
5781    that string.
5782
5783    If CODING->src_object is nil, CODING->source must already point to
5784    the non-relocatable memory area.  In this case, CODING->src_pos is
5785    an offset from CODING->source.
5786
5787    The decoded data is inserted at the current point of the buffer
5788    CODING->dst_object.
5789 */
5790
5791 static int
5792 decode_coding (coding)
5793      struct coding_system *coding;
5794 {
5795   Lisp_Object attrs;
5796
5797   if (BUFFERP (coding->src_object)
5798       && coding->src_pos > 0
5799       && coding->src_pos < GPT
5800       && coding->src_pos + coding->src_chars > GPT)
5801     move_gap_both (coding->src_pos, coding->src_pos_byte);
5802
5803   if (BUFFERP (coding->dst_object))
5804     {
5805       if (current_buffer != XBUFFER (coding->dst_object))
5806         set_buffer_internal (XBUFFER (coding->dst_object));
5807       if (GPT != PT)
5808         move_gap_both (PT, PT_BYTE);
5809     }
5810
5811   coding->consumed = coding->consumed_char = 0;
5812   coding->produced = coding->produced_char = 0;
5813   coding->chars_at_source = 0;
5814   coding->result = CODING_RESULT_SUCCESS;
5815   coding->errors = 0;
5816
5817   ALLOC_CONVERSION_WORK_AREA (coding);
5818
5819   attrs = CODING_ID_ATTRS (coding->id);
5820
5821   do
5822     {
5823       coding_set_source (coding);
5824       coding->annotated = 0;
5825       (*(coding->decoder)) (coding);
5826       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5827         translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5828       else if (!NILP (Vstandard_translation_table_for_decode))
5829         translate_chars (coding, Vstandard_translation_table_for_decode);
5830       coding_set_destination (coding);
5831       produce_chars (coding);
5832       if (coding->annotated)
5833         produce_annotation (coding);
5834     }
5835   while (coding->consumed < coding->src_bytes
5836          && ! coding->result);
5837
5838   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5839       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5840       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5841     decode_eol (coding);
5842
5843   coding->carryover_bytes = 0;
5844   if (coding->consumed < coding->src_bytes)
5845     {
5846       int nbytes = coding->src_bytes - coding->consumed;
5847       unsigned char *src;
5848
5849       coding_set_source (coding);
5850       coding_set_destination (coding);
5851       src = coding->source + coding->consumed;
5852
5853       if (coding->mode & CODING_MODE_LAST_BLOCK)
5854         {
5855           /* Flush out unprocessed data as binary chars.  We are sure
5856              that the number of data is less than the size of
5857              coding->charbuf.  */
5858           while (nbytes-- > 0)
5859             {
5860               int c = *src++;
5861
5862               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5863             }
5864           produce_chars (coding);
5865         }
5866       else
5867         {
5868           /* Record unprocessed bytes in coding->carryover.  We are
5869              sure that the number of data is less than the size of
5870              coding->carryover.  */
5871           unsigned char *p = coding->carryover;
5872
5873           coding->carryover_bytes = nbytes;
5874           while (nbytes-- > 0)
5875             *p++ = *src++;
5876         }
5877       coding->consumed = coding->src_bytes;
5878     }
5879
5880   return coding->result;
5881 }
5882
5883
5884 /* Extract an annotation datum from a composition starting at POS and
5885    ending before LIMIT of CODING->src_object (buffer or string), store
5886    the data in BUF, set *STOP to a starting position of the next
5887    composition (if any) or to LIMIT, and return the address of the
5888    next element of BUF.
5889
5890    If such an annotation is not found, set *STOP to a starting
5891    position of a composition after POS (if any) or to LIMIT, and
5892    return BUF.  */
5893
5894 static INLINE int *
5895 handle_composition_annotation (pos, limit, coding, buf, stop)
5896      EMACS_INT pos, limit;
5897      struct coding_system *coding;
5898      int *buf;
5899      EMACS_INT *stop;
5900 {
5901   EMACS_INT start, end;
5902   Lisp_Object prop;
5903
5904   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5905       || end > limit)
5906     *stop = limit;
5907   else if (start > pos)
5908     *stop = start;
5909   else
5910     {
5911       if (start == pos)
5912         {
5913           /* We found a composition.  Store the corresponding
5914              annotation data in BUF.  */
5915           int *head = buf;
5916           enum composition_method method = COMPOSITION_METHOD (prop);
5917           int nchars = COMPOSITION_LENGTH (prop);
5918
5919           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5920           if (method != COMPOSITION_RELATIVE)
5921             {
5922               Lisp_Object components;
5923               int len, i, i_byte;
5924
5925               components = COMPOSITION_COMPONENTS (prop);
5926               if (VECTORP (components))
5927                 {
5928                   len = XVECTOR (components)->size;
5929                   for (i = 0; i < len; i++)
5930                     *buf++ = XINT (AREF (components, i));
5931                 }
5932               else if (STRINGP (components))
5933                 {
5934                   len = XSTRING (components)->size;
5935                   i = i_byte = 0;
5936                   while (i < len)
5937                     {
5938                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5939                       buf++;
5940                     }
5941                 }
5942               else if (INTEGERP (components))
5943                 {
5944                   len = 1;
5945                   *buf++ = XINT (components);
5946                 }
5947               else if (CONSP (components))
5948                 {
5949                   for (len = 0; CONSP (components);
5950                        len++, components = XCDR (components))
5951                     *buf++ = XINT (XCAR (components));
5952                 }
5953               else
5954                 abort ();
5955               *head -= len;
5956             }
5957         }
5958
5959       if (find_composition (end, limit, &start, &end, &prop,
5960                             coding->src_object)
5961           && end <= limit)
5962         *stop = start;
5963       else
5964         *stop = limit;
5965     }
5966   return buf;
5967 }
5968
5969
5970 /* Extract an annotation datum from a text property `charset' at POS of
5971    CODING->src_object (buffer of string), store the data in BUF, set
5972    *STOP to the position where the value of `charset' property changes
5973    (limiting by LIMIT), and return the address of the next element of
5974    BUF.
5975
5976    If the property value is nil, set *STOP to the position where the
5977    property value is non-nil (limiting by LIMIT), and return BUF.  */
5978
5979 static INLINE int *
5980 handle_charset_annotation (pos, limit, coding, buf, stop)
5981      EMACS_INT pos, limit;
5982      struct coding_system *coding;
5983      int *buf;
5984      EMACS_INT *stop;
5985 {
5986   Lisp_Object val, next;
5987   int id;
5988
5989   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5990   if (! NILP (val) && CHARSETP (val))
5991     id = XINT (CHARSET_SYMBOL_ID (val));
5992   else
5993     id = -1;
5994   ADD_CHARSET_DATA (buf, 0, 0, id);
5995   next = Fnext_single_property_change (make_number (pos), Qcharset,
5996                                        coding->src_object,
5997                                        make_number (limit));
5998   *stop = XINT (next);
5999   return buf;
6000 }
6001
6002
6003 static void
6004 consume_chars (coding)
6005      struct coding_system *coding;
6006 {
6007   int *buf = coding->charbuf;
6008   int *buf_end = coding->charbuf + coding->charbuf_size;
6009   const unsigned char *src = coding->source + coding->consumed;
6010   const unsigned char *src_end = coding->source + coding->src_bytes;
6011   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6012   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6013   int multibytep = coding->src_multibyte;
6014   Lisp_Object eol_type;
6015   int c;
6016   EMACS_INT stop, stop_composition, stop_charset;
6017
6018   eol_type = CODING_ID_EOL_TYPE (coding->id);
6019   if (VECTORP (eol_type))
6020     eol_type = Qunix;
6021
6022   /* Note: composition handling is not yet implemented.  */
6023   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6024
6025   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6026     stop = stop_composition = pos;
6027   else
6028     stop = stop_composition = end_pos;
6029   if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6030     stop = stop_charset = pos;
6031   else
6032     stop_charset = end_pos;
6033
6034   /* Compensate for CRLF and annotation.  */
6035   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6036   while (buf < buf_end)
6037     {
6038       if (pos == stop)
6039         {
6040           if (pos == end_pos)
6041             break;
6042           if (pos == stop_composition)
6043             buf = handle_composition_annotation (pos, end_pos, coding,
6044                                                  buf, &stop_composition);
6045           if (pos == stop_charset)
6046             buf = handle_charset_annotation (pos, end_pos, coding,
6047                                              buf, &stop_charset);
6048           stop = (stop_composition < stop_charset
6049                   ? stop_composition : stop_charset);
6050         }
6051
6052       if (! multibytep)
6053         {
6054           EMACS_INT bytes;
6055
6056           if (! CODING_FOR_UNIBYTE (coding)
6057               && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6058             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6059           else
6060             c = *src++, pos++;
6061         }
6062       else
6063         c = STRING_CHAR_ADVANCE (src), pos++;
6064       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6065         c = '\n';
6066       if (! EQ (eol_type, Qunix))
6067         {
6068           if (c == '\n')
6069             {
6070               if (EQ (eol_type, Qdos))
6071                 *buf++ = '\r';
6072               else
6073                 c = '\r';
6074             }
6075         }
6076       *buf++ = c;
6077     }
6078
6079   coding->consumed = src - coding->source;
6080   coding->consumed_char = pos - coding->src_pos;
6081   coding->charbuf_used = buf - coding->charbuf;
6082   coding->chars_at_source = 0;
6083 }
6084
6085
6086 /* Encode the text at CODING->src_object into CODING->dst_object.
6087    CODING->src_object is a buffer or a string.
6088    CODING->dst_object is a buffer or nil.
6089
6090    If CODING->src_object is a buffer, it must be the current buffer.
6091    In this case, if CODING->src_pos is positive, it is a position of
6092    the source text in the buffer, otherwise. the source text is in the
6093    gap area of the buffer, and coding->src_pos specifies the offset of
6094    the text from GPT (which must be the same as PT).  If this is the
6095    same buffer as CODING->dst_object, CODING->src_pos must be
6096    negative and CODING should not have `pre-write-conversion'.
6097
6098    If CODING->src_object is a string, CODING should not have
6099    `pre-write-conversion'.
6100
6101    If CODING->dst_object is a buffer, the encoded data is inserted at
6102    the current point of that buffer.
6103
6104    If CODING->dst_object is nil, the encoded data is placed at the
6105    memory area specified by CODING->destination.  */
6106
6107 static int
6108 encode_coding (coding)
6109      struct coding_system *coding;
6110 {
6111   Lisp_Object attrs;
6112
6113   attrs = CODING_ID_ATTRS (coding->id);
6114
6115   if (BUFFERP (coding->dst_object))
6116     {
6117       set_buffer_internal (XBUFFER (coding->dst_object));
6118       coding->dst_multibyte
6119         = ! NILP (current_buffer->enable_multibyte_characters);
6120     }
6121
6122   coding->consumed = coding->consumed_char = 0;
6123   coding->produced = coding->produced_char = 0;
6124   coding->result = CODING_RESULT_SUCCESS;
6125   coding->errors = 0;
6126
6127   ALLOC_CONVERSION_WORK_AREA (coding);
6128
6129   do {
6130     coding_set_source (coding);
6131     consume_chars (coding);
6132
6133     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6134       translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6135     else if (!NILP (Vstandard_translation_table_for_encode))
6136       translate_chars (coding, Vstandard_translation_table_for_encode);
6137
6138     coding_set_destination (coding);
6139     (*(coding->encoder)) (coding);
6140   } while (coding->consumed_char < coding->src_chars);
6141
6142   if (BUFFERP (coding->dst_object))
6143     insert_from_gap (coding->produced_char, coding->produced);
6144
6145   return (coding->result);
6146 }
6147
6148
6149 /* Stack of working buffers used in code conversion.  An nil element
6150    means that the code conversion of that level is not using a working
6151    buffer.  */
6152 Lisp_Object Vcode_conversion_work_buf_list;
6153
6154 /* A working buffer used by the top level conversion.  */
6155 Lisp_Object Vcode_conversion_reused_work_buf;
6156
6157
6158 /* Return a working buffer that can be freely used by the following
6159    code conversion.  MULTIBYTEP specifies the multibyteness of the
6160    buffer.  */
6161
6162 Lisp_Object
6163 make_conversion_work_buffer (multibytep, depth)
6164      int multibytep, depth;
6165 {
6166   struct buffer *current = current_buffer;
6167   Lisp_Object buf, name;
6168
6169   if (depth == 0)
6170     {
6171       if (NILP (Vcode_conversion_reused_work_buf))
6172         Vcode_conversion_reused_work_buf
6173           = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6174       buf = Vcode_conversion_reused_work_buf;
6175     }
6176   else
6177     {
6178       if (depth < 0)
6179         {
6180           name = build_string (" *code-converting-work*");
6181           name = Fgenerate_new_buffer_name (name, Qnil);
6182         }
6183       else
6184         {
6185           char str[128];
6186
6187           sprintf (str, " *code-converting-work*<%d>", depth);
6188           name = build_string (str);
6189         }
6190       buf = Fget_buffer_create (name);
6191     }
6192   set_buffer_internal (XBUFFER (buf));
6193   current_buffer->undo_list = Qt;
6194   Ferase_buffer ();
6195   Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6196   set_buffer_internal (current);
6197   return buf;
6198 }
6199
6200 static Lisp_Object
6201 code_conversion_restore (buffer)
6202      Lisp_Object buffer;
6203 {
6204   Lisp_Object workbuf;
6205
6206   workbuf = XCAR (Vcode_conversion_work_buf_list);
6207   if (! NILP (workbuf)
6208       && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6209       && ! NILP (Fbuffer_live_p (workbuf)))
6210     Fkill_buffer (workbuf);
6211   Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6212   set_buffer_internal (XBUFFER (buffer));
6213   return Qnil;
6214 }
6215
6216 static Lisp_Object
6217 code_conversion_save (buffer, with_work_buf, multibyte)
6218      Lisp_Object buffer;
6219      int with_work_buf, multibyte;
6220 {
6221   Lisp_Object workbuf;
6222
6223   if (with_work_buf)
6224     {
6225       int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6226
6227       workbuf = make_conversion_work_buffer (multibyte, depth);
6228     }
6229   else
6230     workbuf = Qnil;
6231   Vcode_conversion_work_buf_list
6232     = Fcons (workbuf, Vcode_conversion_work_buf_list);
6233   record_unwind_protect (code_conversion_restore, buffer);
6234   return workbuf;
6235 }
6236
6237 int
6238 decode_coding_gap (coding, chars, bytes)
6239      struct coding_system *coding;
6240      EMACS_INT chars, bytes;
6241 {
6242   int count = specpdl_ptr - specpdl;
6243   Lisp_Object attrs;
6244   Lisp_Object buffer;
6245
6246   buffer = Fcurrent_buffer ();
6247   code_conversion_save (buffer, 0, 0);
6248
6249   coding->src_object = buffer;
6250   coding->src_chars = chars;
6251   coding->src_bytes = bytes;
6252   coding->src_pos = -chars;
6253   coding->src_pos_byte = -bytes;
6254   coding->src_multibyte = chars < bytes;
6255   coding->dst_object = buffer;
6256   coding->dst_pos = PT;
6257   coding->dst_pos_byte = PT_BYTE;
6258   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6259   coding->mode |= CODING_MODE_LAST_BLOCK;
6260
6261   if (CODING_REQUIRE_DETECTION (coding))
6262     detect_coding (coding);
6263
6264   decode_coding (coding);
6265
6266   attrs = CODING_ID_ATTRS (coding->id);
6267   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6268     {
6269       struct gcpro gcpro1;
6270       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6271       Lisp_Object val;
6272
6273       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6274       GCPRO1 (buffer);
6275       val = call1 (CODING_ATTR_POST_READ (attrs),
6276                    make_number (coding->produced_char));
6277       UNGCPRO;
6278       CHECK_NATNUM (val);
6279       coding->produced_char += Z - prev_Z;
6280       coding->produced += Z_BYTE - prev_Z_BYTE;
6281     }
6282
6283   unbind_to (count, Qnil);
6284   return coding->result;
6285 }
6286
6287 int
6288 encode_coding_gap (coding, chars, bytes)
6289      struct coding_system *coding;
6290      EMACS_INT chars, bytes;
6291 {
6292   int count = specpdl_ptr - specpdl;
6293   Lisp_Object buffer;
6294
6295   buffer = Fcurrent_buffer ();
6296   code_conversion_save (buffer, 0, 0);
6297
6298   coding->src_object = buffer;
6299   coding->src_chars = chars;
6300   coding->src_bytes = bytes;
6301   coding->src_pos = -chars;
6302   coding->src_pos_byte = -bytes;
6303   coding->src_multibyte = chars < bytes;
6304   coding->dst_object = coding->src_object;
6305   coding->dst_pos = PT;
6306   coding->dst_pos_byte = PT_BYTE;
6307
6308   encode_coding (coding);
6309
6310   unbind_to (count, Qnil);
6311   return coding->result;
6312 }
6313
6314
6315 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6316    SRC_OBJECT into DST_OBJECT by coding context CODING.
6317
6318    SRC_OBJECT is a buffer, a string, or Qnil.
6319
6320    If it is a buffer, the text is at point of the buffer.  FROM and TO
6321    are positions in the buffer.
6322
6323    If it is a string, the text is at the beginning of the string.
6324    FROM and TO are indices to the string.
6325
6326    If it is nil, the text is at coding->source.  FROM and TO are
6327    indices to coding->source.
6328
6329    DST_OBJECT is a buffer, Qt, or Qnil.
6330
6331    If it is a buffer, the decoded text is inserted at point of the
6332    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6333    is deleted.
6334
6335    If it is Qt, a string is made from the decoded text, and
6336    set in CODING->dst_object.
6337
6338    If it is Qnil, the decoded text is stored at CODING->destination.
6339    The caller must allocate CODING->dst_bytes bytes at
6340    CODING->destination by xmalloc.  If the decoded text is longer than
6341    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6342  */
6343
6344 void
6345 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6346                       dst_object)
6347      struct coding_system *coding;
6348      Lisp_Object src_object;
6349      EMACS_INT from, from_byte, to, to_byte;
6350      Lisp_Object dst_object;
6351 {
6352   int count = specpdl_ptr - specpdl;
6353   unsigned char *destination;
6354   EMACS_INT dst_bytes;
6355   EMACS_INT chars = to - from;
6356   EMACS_INT bytes = to_byte - from_byte;
6357   Lisp_Object attrs;
6358   Lisp_Object buffer;
6359   int saved_pt = -1, saved_pt_byte;
6360
6361   buffer = Fcurrent_buffer ();
6362
6363   if (NILP (dst_object))
6364     {
6365       destination = coding->destination;
6366       dst_bytes = coding->dst_bytes;
6367     }
6368
6369   coding->src_object = src_object;
6370   coding->src_chars = chars;
6371   coding->src_bytes = bytes;
6372   coding->src_multibyte = chars < bytes;
6373
6374   if (STRINGP (src_object))
6375     {
6376       coding->src_pos = from;
6377       coding->src_pos_byte = from_byte;
6378     }
6379   else if (BUFFERP (src_object))
6380     {
6381       set_buffer_internal (XBUFFER (src_object));
6382       if (from != GPT)
6383         move_gap_both (from, from_byte);
6384       if (EQ (src_object, dst_object))
6385         {
6386           saved_pt = PT, saved_pt_byte = PT_BYTE;
6387           TEMP_SET_PT_BOTH (from, from_byte);
6388           del_range_both (from, from_byte, to, to_byte, 1);
6389           coding->src_pos = -chars;
6390           coding->src_pos_byte = -bytes;
6391         }
6392       else
6393         {
6394           coding->src_pos = from;
6395           coding->src_pos_byte = from_byte;
6396         }
6397     }
6398
6399   if (CODING_REQUIRE_DETECTION (coding))
6400     detect_coding (coding);
6401   attrs = CODING_ID_ATTRS (coding->id);
6402
6403   if (EQ (dst_object, Qt)
6404       || (! NILP (CODING_ATTR_POST_READ (attrs))
6405           && NILP (dst_object)))
6406     {
6407       coding->dst_object = code_conversion_save (buffer, 1, 1);
6408       coding->dst_pos = BEG;
6409       coding->dst_pos_byte = BEG_BYTE;
6410       coding->dst_multibyte = 1;
6411     }
6412   else if (BUFFERP (dst_object))
6413     {
6414       code_conversion_save (buffer, 0, 0);
6415       coding->dst_object = dst_object;
6416       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6417       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6418       coding->dst_multibyte
6419         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6420     }
6421   else
6422     {
6423       code_conversion_save (buffer, 0, 0);
6424       coding->dst_object = Qnil;
6425       coding->dst_multibyte = 1;
6426     }
6427
6428   decode_coding (coding);
6429
6430   if (BUFFERP (coding->dst_object))
6431     set_buffer_internal (XBUFFER (coding->dst_object));
6432
6433   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6434     {
6435       struct gcpro gcpro1, gcpro2;
6436       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6437       Lisp_Object val;
6438
6439       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6440       GCPRO2 (coding->src_object, coding->dst_object);
6441       val = call1 (CODING_ATTR_POST_READ (attrs),
6442                    make_number (coding->produced_char));
6443       UNGCPRO;
6444       CHECK_NATNUM (val);
6445       coding->produced_char += Z - prev_Z;
6446       coding->produced += Z_BYTE - prev_Z_BYTE;
6447     }
6448
6449   if (EQ (dst_object, Qt))
6450     {
6451       coding->dst_object = Fbuffer_string ();
6452     }
6453   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6454     {
6455       set_buffer_internal (XBUFFER (coding->dst_object));
6456       if (dst_bytes < coding->produced)
6457         {
6458           destination
6459             = (unsigned char *) xrealloc (destination, coding->produced);
6460           if (! destination)
6461             {
6462               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6463               unbind_to (count, Qnil);
6464               return;
6465             }
6466           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6467             move_gap_both (BEGV, BEGV_BYTE);
6468           bcopy (BEGV_ADDR, destination, coding->produced);
6469           coding->destination = destination;
6470         }
6471     }
6472
6473   if (saved_pt >= 0)
6474     {
6475       /* This is the case of:
6476          (BUFFERP (src_object) && EQ (src_object, dst_object))
6477          As we have moved PT while replacing the original buffer
6478          contents, we must recover it now.  */
6479       set_buffer_internal (XBUFFER (src_object));
6480       if (saved_pt < from)
6481         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6482       else if (saved_pt < from + chars)
6483         TEMP_SET_PT_BOTH (from, from_byte);
6484       else if (! NILP (current_buffer->enable_multibyte_characters))
6485         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6486                           saved_pt_byte + (coding->produced - bytes));
6487       else
6488         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6489                           saved_pt_byte + (coding->produced - bytes));
6490     }
6491
6492   unbind_to (count, Qnil);
6493 }
6494
6495
6496 void
6497 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6498                       dst_object)
6499      struct coding_system *coding;
6500      Lisp_Object src_object;
6501      EMACS_INT from, from_byte, to, to_byte;
6502      Lisp_Object dst_object;
6503 {
6504   int count = specpdl_ptr - specpdl;
6505   EMACS_INT chars = to - from;
6506   EMACS_INT bytes = to_byte - from_byte;
6507   Lisp_Object attrs;
6508   Lisp_Object buffer;
6509   int saved_pt = -1, saved_pt_byte;
6510
6511   buffer = Fcurrent_buffer ();
6512
6513   coding->src_object = src_object;
6514   coding->src_chars = chars;
6515   coding->src_bytes = bytes;
6516   coding->src_multibyte = chars < bytes;
6517
6518   attrs = CODING_ID_ATTRS (coding->id);
6519
6520   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6521     {
6522       coding->src_object = code_conversion_save (buffer, 1,
6523                                                  coding->src_multibyte);
6524       set_buffer_internal (XBUFFER (coding->src_object));
6525       if (STRINGP (src_object))
6526         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6527       else if (BUFFERP (src_object))
6528         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6529       else
6530         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6531
6532       if (EQ (src_object, dst_object))
6533         {
6534           set_buffer_internal (XBUFFER (src_object));
6535           saved_pt = PT, saved_pt_byte = PT_BYTE;
6536           del_range_both (from, from_byte, to, to_byte, 1);
6537           set_buffer_internal (XBUFFER (coding->src_object));
6538         }
6539
6540       call2 (CODING_ATTR_PRE_WRITE (attrs),
6541              make_number (BEG), make_number (Z));
6542       coding->src_object = Fcurrent_buffer ();
6543       if (BEG != GPT)
6544         move_gap_both (BEG, BEG_BYTE);
6545       coding->src_chars = Z - BEG;
6546       coding->src_bytes = Z_BYTE - BEG_BYTE;
6547       coding->src_pos = BEG;
6548       coding->src_pos_byte = BEG_BYTE;
6549       coding->src_multibyte = Z < Z_BYTE;
6550     }
6551   else if (STRINGP (src_object))
6552     {
6553       code_conversion_save (buffer, 0, 0);
6554       coding->src_pos = from;
6555       coding->src_pos_byte = from_byte;
6556     }
6557   else if (BUFFERP (src_object))
6558     {
6559       code_conversion_save (buffer, 0, 0);
6560       set_buffer_internal (XBUFFER (src_object));
6561       if (EQ (src_object, dst_object))
6562         {
6563           saved_pt = PT, saved_pt_byte = PT_BYTE;
6564           coding->src_object = del_range_1 (from, to, 1, 1);
6565           coding->src_pos = 0;
6566           coding->src_pos_byte = 0;
6567         }
6568       else
6569         {
6570           if (from < GPT && to >= GPT)
6571             move_gap_both (from, from_byte);
6572           coding->src_pos = from;
6573           coding->src_pos_byte = from_byte;
6574         }
6575     }
6576   else
6577     code_conversion_save (buffer, 0, 0);
6578
6579   if (BUFFERP (dst_object))
6580     {
6581       coding->dst_object = dst_object;
6582       if (EQ (src_object, dst_object))
6583         {
6584           coding->dst_pos = from;
6585           coding->dst_pos_byte = from_byte;
6586         }
6587       else
6588         {
6589           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6590           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6591         }
6592       coding->dst_multibyte
6593         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6594     }
6595   else if (EQ (dst_object, Qt))
6596     {
6597       coding->dst_object = Qnil;
6598       coding->dst_bytes = coding->src_chars;
6599       if (coding->dst_bytes == 0)
6600         coding->dst_bytes = 1;
6601       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6602       coding->dst_multibyte = 0;
6603     }
6604   else
6605     {
6606       coding->dst_object = Qnil;
6607       coding->dst_multibyte = 0;
6608     }
6609
6610   encode_coding (coding);
6611
6612   if (EQ (dst_object, Qt))
6613     {
6614       if (BUFFERP (coding->dst_object))
6615         coding->dst_object = Fbuffer_string ();
6616       else
6617         {
6618           coding->dst_object
6619             = make_unibyte_string ((char *) coding->destination,
6620                                    coding->produced);
6621           xfree (coding->destination);
6622         }
6623     }
6624
6625   if (saved_pt >= 0)
6626     {
6627       /* This is the case of:
6628          (BUFFERP (src_object) && EQ (src_object, dst_object))
6629          As we have moved PT while replacing the original buffer
6630          contents, we must recover it now.  */
6631       set_buffer_internal (XBUFFER (src_object));
6632       if (saved_pt < from)
6633         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6634       else if (saved_pt < from + chars)
6635         TEMP_SET_PT_BOTH (from, from_byte);
6636       else if (! NILP (current_buffer->enable_multibyte_characters))
6637         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6638                           saved_pt_byte + (coding->produced - bytes));
6639       else
6640         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6641                           saved_pt_byte + (coding->produced - bytes));
6642     }
6643
6644   unbind_to (count, Qnil);
6645 }
6646
6647
6648 Lisp_Object
6649 preferred_coding_system ()
6650 {
6651   int id = coding_categories[coding_priorities[0]].id;
6652
6653   return CODING_ID_NAME (id);
6654 }
6655
6656 \f
6657 #ifdef emacs
6658 /*** 8. Emacs Lisp library functions ***/
6659
6660 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6661        doc: /* Return t if OBJECT is nil or a coding-system.
6662 See the documentation of `define-coding-system' for information
6663 about coding-system objects.  */)
6664      (obj)
6665      Lisp_Object obj;
6666 {
6667   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6668 }
6669
6670 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6671        Sread_non_nil_coding_system, 1, 1, 0,
6672        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6673      (prompt)
6674      Lisp_Object prompt;
6675 {
6676   Lisp_Object val;
6677   do
6678     {
6679       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6680                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6681     }
6682   while (XSTRING (val)->size == 0);
6683   return (Fintern (val, Qnil));
6684 }
6685
6686 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6687        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6688 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6689      (prompt, default_coding_system)
6690      Lisp_Object prompt, default_coding_system;
6691 {
6692   Lisp_Object val;
6693   if (SYMBOLP (default_coding_system))
6694     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6695   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6696                           Qt, Qnil, Qcoding_system_history,
6697                           default_coding_system, Qnil);
6698   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6699 }
6700
6701 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6702        1, 1, 0,
6703        doc: /* Check validity of CODING-SYSTEM.
6704 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6705   (coding_system)
6706      Lisp_Object coding_system;
6707 {
6708   CHECK_SYMBOL (coding_system);
6709   if (!NILP (Fcoding_system_p (coding_system)))
6710     return coding_system;
6711   while (1)
6712     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6713 }
6714
6715 \f
6716 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6717    HIGHEST is nonzero, return the coding system of the highest
6718    priority among the detected coding systems.  Otherwize return a
6719    list of detected coding systems sorted by their priorities.  If
6720    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6721    multibyte form but contains only ASCII and eight-bit chars.
6722    Otherwise, the bytes are raw bytes.
6723
6724    CODING-SYSTEM controls the detection as below:
6725
6726    If it is nil, detect both text-format and eol-format.  If the
6727    text-format part of CODING-SYSTEM is already specified
6728    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6729    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6730    detect only text-format.  */
6731
6732 Lisp_Object
6733 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6734      unsigned char *src;
6735      int src_bytes, highest;
6736      int multibytep;
6737      Lisp_Object coding_system;
6738 {
6739   unsigned char *src_end = src + src_bytes;
6740   Lisp_Object attrs, eol_type;
6741   Lisp_Object val;
6742   struct coding_system coding;
6743   int id;
6744   struct coding_detection_info detect_info;
6745
6746   if (NILP (coding_system))
6747     coding_system = Qundecided;
6748   setup_coding_system (coding_system, &coding);
6749   attrs = CODING_ID_ATTRS (coding.id);
6750   eol_type = CODING_ID_EOL_TYPE (coding.id);
6751   coding_system = CODING_ATTR_BASE_NAME (attrs);
6752
6753   coding.source = src;
6754   coding.src_bytes = src_bytes;
6755   coding.src_multibyte = multibytep;
6756   coding.consumed = 0;
6757   coding.mode |= CODING_MODE_LAST_BLOCK;
6758
6759   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6760
6761   /* At first, detect text-format if necessary.  */
6762   if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6763     {
6764       enum coding_category category;
6765       struct coding_system *this;
6766       int c, i;
6767
6768       for (; src < src_end; src++)
6769         {
6770           c = *src;
6771           if (c & 0x80
6772               || (c < 0x20 && (c == ISO_CODE_ESC
6773                                || c == ISO_CODE_SI
6774                                || c == ISO_CODE_SO)))
6775             break;
6776         }
6777       coding.head_ascii = src - coding.source;
6778
6779       if (src < src_end)
6780         for (i = 0; i < coding_category_raw_text; i++)
6781           {
6782             category = coding_priorities[i];
6783             this = coding_categories + category;
6784
6785             if (this->id < 0)
6786               {
6787                 /* No coding system of this category is defined.  */
6788                 detect_info.rejected |= (1 << category);
6789               }
6790             else if (category >= coding_category_raw_text)
6791               continue;
6792             else if (detect_info.checked & (1 << category))
6793               {
6794                 if (highest
6795                     && (detect_info.found & (1 << category)))
6796                   break;
6797               }
6798             else
6799               {
6800                 if ((*(this->detector)) (&coding, &detect_info)
6801                     && highest
6802                     && (detect_info.found & (1 << category)))
6803                   break;
6804               }
6805           }
6806
6807
6808       if (detect_info.rejected == CATEGORY_MASK_ANY)
6809         {
6810           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6811           id = coding_categories[coding_category_raw_text].id;
6812           val = Fcons (make_number (id), Qnil);
6813         }
6814       else if (! detect_info.rejected && ! detect_info.found)
6815         {
6816           detect_info.found = CATEGORY_MASK_ANY;
6817           id = coding_categories[coding_category_undecided].id;
6818           val = Fcons (make_number (id), Qnil);
6819         }
6820       else if (highest)
6821         {
6822           if (detect_info.found)
6823             {
6824               detect_info.found = 1 << category;
6825               val = Fcons (make_number (this->id), Qnil);
6826             }
6827           else
6828             for (i = 0; i < coding_category_raw_text; i++)
6829               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6830                 {
6831                   detect_info.found = 1 << coding_priorities[i];
6832                   id = coding_categories[coding_priorities[i]].id;
6833                   val = Fcons (make_number (id), Qnil);
6834                   break;
6835                 }
6836         }
6837       else
6838         {
6839           int mask = detect_info.rejected | detect_info.found;
6840           int found = 0;
6841           val = Qnil;
6842
6843           for (i = coding_category_raw_text - 1; i >= 0; i--)
6844             {
6845               category = coding_priorities[i];
6846               if (! (mask & (1 << category)))
6847                 {
6848                   found |= 1 << category;
6849                   id = coding_categories[category].id;
6850                   val = Fcons (make_number (id), val);
6851                 }
6852             }
6853           for (i = coding_category_raw_text - 1; i >= 0; i--)
6854             {
6855               category = coding_priorities[i];
6856               if (detect_info.found & (1 << category))
6857                 {
6858                   id = coding_categories[category].id;
6859                   val = Fcons (make_number (id), val);
6860                 }
6861             }
6862           detect_info.found |= found;
6863         }
6864     }
6865   else
6866     {
6867       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6868       val = Fcons (make_number (coding.id), Qnil);
6869     }
6870
6871   /* Then, detect eol-format if necessary.  */
6872   {
6873     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6874     Lisp_Object tail;
6875
6876     if (VECTORP (eol_type))
6877       {
6878         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6879           normal_eol = detect_eol (coding.source, src_bytes,
6880                                    coding_category_raw_text);
6881         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6882                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
6883           utf_16_be_eol = detect_eol (coding.source, src_bytes,
6884                                       coding_category_utf_16_be);
6885         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6886                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
6887           utf_16_le_eol = detect_eol (coding.source, src_bytes,
6888                                       coding_category_utf_16_le);
6889       }
6890     else
6891       {
6892         if (EQ (eol_type, Qunix))
6893           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6894         else if (EQ (eol_type, Qdos))
6895           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6896         else
6897           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6898       }
6899
6900     for (tail = val; CONSP (tail); tail = XCDR (tail))
6901       {
6902         enum coding_category category;
6903         int this_eol;
6904
6905         id = XINT (XCAR (tail));
6906         attrs = CODING_ID_ATTRS (id);
6907         category = XINT (CODING_ATTR_CATEGORY (attrs));
6908         eol_type = CODING_ID_EOL_TYPE (id);
6909         if (VECTORP (eol_type))
6910           {
6911             if (category == coding_category_utf_16_be
6912                 || category == coding_category_utf_16_be_nosig)
6913               this_eol = utf_16_be_eol;
6914             else if (category == coding_category_utf_16_le
6915                      || category == coding_category_utf_16_le_nosig)
6916               this_eol = utf_16_le_eol;
6917             else
6918               this_eol = normal_eol;
6919
6920             if (this_eol == EOL_SEEN_LF)
6921               XSETCAR (tail, AREF (eol_type, 0));
6922             else if (this_eol == EOL_SEEN_CRLF)
6923               XSETCAR (tail, AREF (eol_type, 1));
6924             else if (this_eol == EOL_SEEN_CR)
6925               XSETCAR (tail, AREF (eol_type, 2));
6926             else
6927               XSETCAR (tail, CODING_ID_NAME (id));
6928           }
6929         else
6930           XSETCAR (tail, CODING_ID_NAME (id));
6931       }
6932   }
6933
6934   return (highest ? XCAR (val) : val);
6935 }
6936
6937
6938 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6939        2, 3, 0,
6940        doc: /* Detect coding system of the text in the region between START and END.
6941 Return a list of possible coding systems ordered by priority.
6942
6943 If only ASCII characters are found, it returns a list of single element
6944 `undecided' or its subsidiary coding system according to a detected
6945 end-of-line format.
6946
6947 If optional argument HIGHEST is non-nil, return the coding system of
6948 highest priority.  */)
6949      (start, end, highest)
6950      Lisp_Object start, end, highest;
6951 {
6952   int from, to;
6953   int from_byte, to_byte;
6954
6955   CHECK_NUMBER_COERCE_MARKER (start);
6956   CHECK_NUMBER_COERCE_MARKER (end);
6957
6958   validate_region (&start, &end);
6959   from = XINT (start), to = XINT (end);
6960   from_byte = CHAR_TO_BYTE (from);
6961   to_byte = CHAR_TO_BYTE (to);
6962
6963   if (from < GPT && to >= GPT)
6964     move_gap_both (to, to_byte);
6965
6966   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6967                                to_byte - from_byte,
6968                                !NILP (highest),
6969                                !NILP (current_buffer
6970                                       ->enable_multibyte_characters),
6971                                Qnil);
6972 }
6973
6974 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6975        1, 2, 0,
6976        doc: /* Detect coding system of the text in STRING.
6977 Return a list of possible coding systems ordered by priority.
6978
6979 If only ASCII characters are found, it returns a list of single element
6980 `undecided' or its subsidiary coding system according to a detected
6981 end-of-line format.
6982
6983 If optional argument HIGHEST is non-nil, return the coding system of
6984 highest priority.  */)
6985      (string, highest)
6986      Lisp_Object string, highest;
6987 {
6988   CHECK_STRING (string);
6989
6990   return detect_coding_system (XSTRING (string)->data,
6991                                STRING_BYTES (XSTRING (string)),
6992                                !NILP (highest),
6993                                STRING_MULTIBYTE (string),
6994                                Qnil);
6995 }
6996
6997
6998 static INLINE int
6999 char_encodable_p (c, attrs)
7000      int c;
7001      Lisp_Object attrs;
7002 {
7003   Lisp_Object tail;
7004   struct charset *charset;
7005
7006   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7007        CONSP (tail); tail = XCDR (tail))
7008     {
7009       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7010       if (CHAR_CHARSET_P (c, charset))
7011         break;
7012     }
7013   return (! NILP (tail));
7014 }
7015
7016
7017 /* Return a list of coding systems that safely encode the text between
7018    START and END.  If EXCLUDE is non-nil, it is a list of coding
7019    systems not to check.  The returned list doesn't contain any such
7020    coding systems.  In any case, if the text contains only ASCII or is
7021    unibyte, return t.  */
7022
7023 DEFUN ("find-coding-systems-region-internal",
7024        Ffind_coding_systems_region_internal,
7025        Sfind_coding_systems_region_internal, 2, 3, 0,
7026        doc: /* Internal use only.  */)
7027      (start, end, exclude)
7028      Lisp_Object start, end, exclude;
7029 {
7030   Lisp_Object coding_attrs_list, safe_codings;
7031   EMACS_INT start_byte, end_byte;
7032   const unsigned char *p, *pbeg, *pend;
7033   int c;
7034   Lisp_Object tail, elt;
7035
7036   if (STRINGP (start))
7037     {
7038       if (!STRING_MULTIBYTE (start)
7039           || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
7040         return Qt;
7041       start_byte = 0;
7042       end_byte = STRING_BYTES (XSTRING (start));
7043     }
7044   else
7045     {
7046       CHECK_NUMBER_COERCE_MARKER (start);
7047       CHECK_NUMBER_COERCE_MARKER (end);
7048       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7049         args_out_of_range (start, end);
7050       if (NILP (current_buffer->enable_multibyte_characters))
7051         return Qt;
7052       start_byte = CHAR_TO_BYTE (XINT (start));
7053       end_byte = CHAR_TO_BYTE (XINT (end));
7054       if (XINT (end) - XINT (start) == end_byte - start_byte)
7055         return Qt;
7056
7057       if (XINT (start) < GPT && XINT (end) > GPT)
7058         {
7059           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7060             move_gap_both (XINT (start), start_byte);
7061           else
7062             move_gap_both (XINT (end), end_byte);
7063         }
7064     }
7065
7066   coding_attrs_list = Qnil;
7067   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7068     if (NILP (exclude)
7069         || NILP (Fmemq (XCAR (tail), exclude)))
7070       {
7071         Lisp_Object attrs;
7072
7073         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7074         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7075             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7076           coding_attrs_list = Fcons (attrs, coding_attrs_list);
7077       }
7078
7079   if (STRINGP (start))
7080     p = pbeg = XSTRING (start)->data;
7081   else
7082     p = pbeg = BYTE_POS_ADDR (start_byte);
7083   pend = p + (end_byte - start_byte);
7084
7085   while (p < pend && ASCII_BYTE_P (*p)) p++;
7086   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7087
7088   while (p < pend)
7089     {
7090       if (ASCII_BYTE_P (*p))
7091         p++;
7092       else
7093         {
7094           c = STRING_CHAR_ADVANCE (p);
7095
7096           charset_map_loaded = 0;
7097           for (tail = coding_attrs_list; CONSP (tail);)
7098             {
7099               elt = XCAR (tail);
7100               if (NILP (elt))
7101                 tail = XCDR (tail);
7102               else if (char_encodable_p (c, elt))
7103                 tail = XCDR (tail);
7104               else if (CONSP (XCDR (tail)))
7105                 {
7106                   XSETCAR (tail, XCAR (XCDR (tail)));
7107                   XSETCDR (tail, XCDR (XCDR (tail)));
7108                 }
7109               else
7110                 {
7111                   XSETCAR (tail, Qnil);
7112                   tail = XCDR (tail);
7113                 }
7114             }
7115           if (charset_map_loaded)
7116             {
7117               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7118
7119               if (STRINGP (start))
7120                 pbeg = XSTRING (start)->data;
7121               else
7122                 pbeg = BYTE_POS_ADDR (start_byte);
7123               p = pbeg + p_offset;
7124               pend = pbeg + pend_offset;
7125             }
7126         }
7127     }
7128
7129   safe_codings = Qnil;
7130   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7131     if (! NILP (XCAR (tail)))
7132       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7133
7134   return safe_codings;
7135 }
7136
7137
7138 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7139        Scheck_coding_systems_region, 3, 3, 0,
7140        doc: /* Check if the region is encodable by coding systems.
7141
7142 START and END are buffer positions specifying the region.
7143 CODING-SYSTEM-LIST is a list of coding systems to check.
7144
7145 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7146 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7147 whole region, POS0, POS1, ... are buffer positions where non-encodable
7148 characters are found.
7149
7150 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7151 value is nil.
7152
7153 START may be a string.  In that case, check if the string is
7154 encodable, and the value contains indices to the string instead of
7155 buffer positions.  END is ignored.  */)
7156      (start, end, coding_system_list)
7157      Lisp_Object start, end, coding_system_list;
7158 {
7159   Lisp_Object list;
7160   EMACS_INT start_byte, end_byte;
7161   int pos;
7162   const unsigned char *p, *pbeg, *pend;
7163   int c;
7164   Lisp_Object tail, elt;
7165
7166   if (STRINGP (start))
7167     {
7168       if (!STRING_MULTIBYTE (start)
7169           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7170         return Qnil;
7171       start_byte = 0;
7172       end_byte = STRING_BYTES (XSTRING (start));
7173       pos = 0;
7174     }
7175   else
7176     {
7177       CHECK_NUMBER_COERCE_MARKER (start);
7178       CHECK_NUMBER_COERCE_MARKER (end);
7179       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7180         args_out_of_range (start, end);
7181       if (NILP (current_buffer->enable_multibyte_characters))
7182         return Qnil;
7183       start_byte = CHAR_TO_BYTE (XINT (start));
7184       end_byte = CHAR_TO_BYTE (XINT (end));
7185       if (XINT (end) - XINT (start) == end_byte - start_byte)
7186         return Qt;
7187
7188       if (XINT (start) < GPT && XINT (end) > GPT)
7189         {
7190           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7191             move_gap_both (XINT (start), start_byte);
7192           else
7193             move_gap_both (XINT (end), end_byte);
7194         }
7195       pos = XINT (start);
7196     }
7197
7198   list = Qnil;
7199   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7200     {
7201       elt = XCAR (tail);
7202       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7203                                        Qnil)),
7204                     list);
7205     }
7206
7207   if (STRINGP (start))
7208     p = pbeg = XSTRING (start)->data;
7209   else
7210     p = pbeg = BYTE_POS_ADDR (start_byte);
7211   pend = p + (end_byte - start_byte);
7212
7213   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7214   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7215
7216   while (p < pend)
7217     {
7218       if (ASCII_BYTE_P (*p))
7219         p++;
7220       else
7221         {
7222           c = STRING_CHAR_ADVANCE (p);
7223
7224           charset_map_loaded = 0;
7225           for (tail = list; CONSP (tail); tail = XCDR (tail))
7226             {
7227               elt = XCDR (XCAR (tail));
7228               if (! char_encodable_p (c, XCAR (elt)))
7229                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7230             }
7231           if (charset_map_loaded)
7232             {
7233               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7234
7235               if (STRINGP (start))
7236                 pbeg = XSTRING (start)->data;
7237               else
7238                 pbeg = BYTE_POS_ADDR (start_byte);
7239               p = pbeg + p_offset;
7240               pend = pbeg + pend_offset;
7241             }
7242         }
7243       pos++;
7244     }
7245
7246   tail = list;
7247   list = Qnil;
7248   for (; CONSP (tail); tail = XCDR (tail))
7249     {
7250       elt = XCAR (tail);
7251       if (CONSP (XCDR (XCDR (elt))))
7252         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7253                       list);
7254     }
7255
7256   return list;
7257 }
7258
7259
7260
7261 Lisp_Object
7262 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7263      Lisp_Object start, end, coding_system, dst_object;
7264      int encodep, norecord;
7265 {
7266   struct coding_system coding;
7267   EMACS_INT from, from_byte, to, to_byte;
7268   Lisp_Object src_object;
7269
7270   CHECK_NUMBER_COERCE_MARKER (start);
7271   CHECK_NUMBER_COERCE_MARKER (end);
7272   if (NILP (coding_system))
7273     coding_system = Qno_conversion;
7274   else
7275     CHECK_CODING_SYSTEM (coding_system);
7276   src_object = Fcurrent_buffer ();
7277   if (NILP (dst_object))
7278     dst_object = src_object;
7279   else if (! EQ (dst_object, Qt))
7280     CHECK_BUFFER (dst_object);
7281
7282   validate_region (&start, &end);
7283   from = XFASTINT (start);
7284   from_byte = CHAR_TO_BYTE (from);
7285   to = XFASTINT (end);
7286   to_byte = CHAR_TO_BYTE (to);
7287
7288   setup_coding_system (coding_system, &coding);
7289   coding.mode |= CODING_MODE_LAST_BLOCK;
7290
7291   if (encodep)
7292     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7293                           dst_object);
7294   else
7295     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7296                           dst_object);
7297   if (! norecord)
7298     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7299
7300   if (coding.result != CODING_RESULT_SUCCESS)
7301     error ("Code conversion error: %d", coding.result);
7302
7303   return (BUFFERP (dst_object)
7304           ? make_number (coding.produced_char)
7305           : coding.dst_object);
7306 }
7307
7308
7309 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7310        3, 4, "r\nzCoding system: ",
7311        doc: /* Decode the current region from the specified coding system.
7312 When called from a program, takes four arguments:
7313         START, END, CODING-SYSTEM, and DESTINATION.
7314 START and END are buffer positions.
7315
7316 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7317 If nil, the region between START and END is replace by the decoded text.
7318 If buffer, the decoded text is inserted in the buffer.
7319 If t, the decoded text is returned.
7320
7321 This function sets `last-coding-system-used' to the precise coding system
7322 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7323 not fully specified.)
7324 It returns the length of the decoded text.  */)
7325      (start, end, coding_system, destination)
7326      Lisp_Object start, end, coding_system, destination;
7327 {
7328   return code_convert_region (start, end, coding_system, destination, 0, 0);
7329 }
7330
7331 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7332        3, 4, "r\nzCoding system: ",
7333        doc: /* Encode the current region by specified coding system.
7334 When called from a program, takes three arguments:
7335 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7336
7337 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7338 If nil, the region between START and END is replace by the encoded text.
7339 If buffer, the encoded text is inserted in the buffer.
7340 If t, the encoded text is returned.
7341
7342 This function sets `last-coding-system-used' to the precise coding system
7343 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7344 not fully specified.)
7345 It returns the length of the encoded text.  */)
7346   (start, end, coding_system, destination)
7347      Lisp_Object start, end, coding_system, destination;
7348 {
7349   return code_convert_region (start, end, coding_system, destination, 1, 0);
7350 }
7351
7352 Lisp_Object
7353 code_convert_string (string, coding_system, dst_object,
7354                      encodep, nocopy, norecord)
7355      Lisp_Object string, coding_system, dst_object;
7356      int encodep, nocopy, norecord;
7357 {
7358   struct coding_system coding;
7359   EMACS_INT chars, bytes;
7360
7361   CHECK_STRING (string);
7362   if (NILP (coding_system))
7363     {
7364       if (! norecord)
7365         Vlast_coding_system_used = Qno_conversion;
7366       if (NILP (dst_object))
7367         return (nocopy ? Fcopy_sequence (string) : string);
7368     }
7369
7370   if (NILP (coding_system))
7371     coding_system = Qno_conversion;
7372   else
7373     CHECK_CODING_SYSTEM (coding_system);
7374   if (NILP (dst_object))
7375     dst_object = Qt;
7376   else if (! EQ (dst_object, Qt))
7377     CHECK_BUFFER (dst_object);
7378
7379   setup_coding_system (coding_system, &coding);
7380   coding.mode |= CODING_MODE_LAST_BLOCK;
7381   chars = XSTRING (string)->size;
7382   bytes = STRING_BYTES (XSTRING (string));
7383   if (encodep)
7384     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7385   else
7386     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7387   if (! norecord)
7388     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7389
7390   if (coding.result != CODING_RESULT_SUCCESS)
7391     error ("Code conversion error: %d", coding.result);
7392
7393   return (BUFFERP (dst_object)
7394           ? make_number (coding.produced_char)
7395           : coding.dst_object);
7396 }
7397
7398
7399 /* Encode or decode STRING according to CODING_SYSTEM.
7400    Do not set Vlast_coding_system_used.
7401
7402    This function is called only from macros DECODE_FILE and
7403    ENCODE_FILE, thus we ignore character composition.  */
7404
7405 Lisp_Object
7406 code_convert_string_norecord (string, coding_system, encodep)
7407      Lisp_Object string, coding_system;
7408      int encodep;
7409 {
7410   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7411 }
7412
7413
7414 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7415        2, 4, 0,
7416        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7417
7418 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7419 if the decoding operation is trivial.
7420
7421 Optional fourth arg BUFFER non-nil meant that the decoded text is
7422 inserted in BUFFER instead of returned as a string.  In this case,
7423 the return value is BUFFER.
7424
7425 This function sets `last-coding-system-used' to the precise coding system
7426 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7427 not fully specified.  */)
7428   (string, coding_system, nocopy, buffer)
7429      Lisp_Object string, coding_system, nocopy, buffer;
7430 {
7431   return code_convert_string (string, coding_system, buffer,
7432                               0, ! NILP (nocopy), 0);
7433 }
7434
7435 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7436        2, 4, 0,
7437        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7438
7439 Optional third arg NOCOPY non-nil means it is OK to return STRING
7440 itself if the encoding operation is trivial.
7441
7442 Optional fourth arg BUFFER non-nil meant that the encoded text is
7443 inserted in BUFFER instead of returned as a string.  In this case,
7444 the return value is BUFFER.
7445
7446 This function sets `last-coding-system-used' to the precise coding system
7447 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7448 not fully specified.)  */)
7449      (string, coding_system, nocopy, buffer)
7450      Lisp_Object string, coding_system, nocopy, buffer;
7451 {
7452   return code_convert_string (string, coding_system, buffer,
7453                               1, ! NILP (nocopy), 1);
7454 }
7455
7456 \f
7457 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7458        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7459 Return the corresponding character.  */)
7460      (code)
7461      Lisp_Object code;
7462 {
7463   Lisp_Object spec, attrs, val;
7464   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7465   int c;
7466
7467   CHECK_NATNUM (code);
7468   c = XFASTINT (code);
7469   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7470   attrs = AREF (spec, 0);
7471
7472   if (ASCII_BYTE_P (c)
7473       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7474     return code;
7475
7476   val = CODING_ATTR_CHARSET_LIST (attrs);
7477   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7478   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7479   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7480
7481   if (c <= 0x7F)
7482     charset = charset_roman;
7483   else if (c >= 0xA0 && c < 0xDF)
7484     {
7485       charset = charset_kana;
7486       c -= 0x80;
7487     }
7488   else
7489     {
7490       int s1 = c >> 8, s2 = c & 0xFF;
7491
7492       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7493           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7494         error ("Invalid code: %d", code);
7495       SJIS_TO_JIS (c);
7496       charset = charset_kanji;
7497     }
7498   c = DECODE_CHAR (charset, c);
7499   if (c < 0)
7500     error ("Invalid code: %d", code);
7501   return make_number (c);
7502 }
7503
7504
7505 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7506        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7507 Return the corresponding code in SJIS.  */)
7508      (ch)
7509     Lisp_Object ch;
7510 {
7511   Lisp_Object spec, attrs, charset_list;
7512   int c;
7513   struct charset *charset;
7514   unsigned code;
7515
7516   CHECK_CHARACTER (ch);
7517   c = XFASTINT (ch);
7518   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7519   attrs = AREF (spec, 0);
7520
7521   if (ASCII_CHAR_P (c)
7522       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7523     return ch;
7524
7525   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7526   charset = char_charset (c, charset_list, &code);
7527   if (code == CHARSET_INVALID_CODE (charset))
7528     error ("Can't encode by shift_jis encoding: %d", c);
7529   JIS_TO_SJIS (code);
7530
7531   return make_number (code);
7532 }
7533
7534 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7535        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7536 Return the corresponding character.  */)
7537      (code)
7538      Lisp_Object code;
7539 {
7540   Lisp_Object spec, attrs, val;
7541   struct charset *charset_roman, *charset_big5, *charset;
7542   int c;
7543
7544   CHECK_NATNUM (code);
7545   c = XFASTINT (code);
7546   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7547   attrs = AREF (spec, 0);
7548
7549   if (ASCII_BYTE_P (c)
7550       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7551     return code;
7552
7553   val = CODING_ATTR_CHARSET_LIST (attrs);
7554   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7555   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7556
7557   if (c <= 0x7F)
7558     charset = charset_roman;
7559   else
7560     {
7561       int b1 = c >> 8, b2 = c & 0x7F;
7562       if (b1 < 0xA1 || b1 > 0xFE
7563           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7564         error ("Invalid code: %d", code);
7565       charset = charset_big5;
7566     }
7567   c = DECODE_CHAR (charset, (unsigned )c);
7568   if (c < 0)
7569     error ("Invalid code: %d", code);
7570   return make_number (c);
7571 }
7572
7573 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7574        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7575 Return the corresponding character code in Big5.  */)
7576      (ch)
7577      Lisp_Object ch;
7578 {
7579   Lisp_Object spec, attrs, charset_list;
7580   struct charset *charset;
7581   int c;
7582   unsigned code;
7583
7584   CHECK_CHARACTER (ch);
7585   c = XFASTINT (ch);
7586   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7587   attrs = AREF (spec, 0);
7588   if (ASCII_CHAR_P (c)
7589       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7590     return ch;
7591
7592   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7593   charset = char_charset (c, charset_list, &code);
7594   if (code == CHARSET_INVALID_CODE (charset))
7595     error ("Can't encode by Big5 encoding: %d", c);
7596
7597   return make_number (code);
7598 }
7599
7600 \f
7601 DEFUN ("set-terminal-coding-system-internal",
7602        Fset_terminal_coding_system_internal,
7603        Sset_terminal_coding_system_internal, 1, 1, 0,
7604        doc: /* Internal use only.  */)
7605      (coding_system)
7606      Lisp_Object coding_system;
7607 {
7608   CHECK_SYMBOL (coding_system);
7609   setup_coding_system (Fcheck_coding_system (coding_system),
7610                         &terminal_coding);
7611
7612   /* We had better not send unsafe characters to terminal.  */
7613   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7614   /* Characer composition should be disabled.  */
7615   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7616   terminal_coding.src_multibyte = 1;
7617   terminal_coding.dst_multibyte = 0;
7618   return Qnil;
7619 }
7620
7621 DEFUN ("set-safe-terminal-coding-system-internal",
7622        Fset_safe_terminal_coding_system_internal,
7623        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7624        doc: /* Internal use only.  */)
7625      (coding_system)
7626      Lisp_Object coding_system;
7627 {
7628   CHECK_SYMBOL (coding_system);
7629   setup_coding_system (Fcheck_coding_system (coding_system),
7630                        &safe_terminal_coding);
7631   /* Characer composition should be disabled.  */
7632   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7633   safe_terminal_coding.src_multibyte = 1;
7634   safe_terminal_coding.dst_multibyte = 0;
7635   return Qnil;
7636 }
7637
7638 DEFUN ("terminal-coding-system",
7639        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7640        doc: /* Return coding system specified for terminal output.  */)
7641      ()
7642 {
7643   return CODING_ID_NAME (terminal_coding.id);
7644 }
7645
7646 DEFUN ("set-keyboard-coding-system-internal",
7647        Fset_keyboard_coding_system_internal,
7648        Sset_keyboard_coding_system_internal, 1, 1, 0,
7649        doc: /* Internal use only.  */)
7650      (coding_system)
7651      Lisp_Object coding_system;
7652 {
7653   CHECK_SYMBOL (coding_system);
7654   setup_coding_system (Fcheck_coding_system (coding_system),
7655                        &keyboard_coding);
7656   /* Characer composition should be disabled.  */
7657   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7658   return Qnil;
7659 }
7660
7661 DEFUN ("keyboard-coding-system",
7662        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7663        doc: /* Return coding system specified for decoding keyboard input.  */)
7664      ()
7665 {
7666   return CODING_ID_NAME (keyboard_coding.id);
7667 }
7668
7669 \f
7670 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7671        Sfind_operation_coding_system,  1, MANY, 0,
7672        doc: /* Choose a coding system for an operation based on the target name.
7673 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7674 DECODING-SYSTEM is the coding system to use for decoding
7675 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7676 for encoding (in case OPERATION does encoding).
7677
7678 The first argument OPERATION specifies an I/O primitive:
7679   For file I/O, `insert-file-contents' or `write-region'.
7680   For process I/O, `call-process', `call-process-region', or `start-process'.
7681   For network I/O, `open-network-stream'.
7682
7683 The remaining arguments should be the same arguments that were passed
7684 to the primitive.  Depending on which primitive, one of those arguments
7685 is selected as the TARGET.  For example, if OPERATION does file I/O,
7686 whichever argument specifies the file name is TARGET.
7687
7688 TARGET has a meaning which depends on OPERATION:
7689   For file I/O, TARGET is a file name.
7690   For process I/O, TARGET is a process name.
7691   For network I/O, TARGET is a service name or a port number
7692
7693 This function looks up what specified for TARGET in,
7694 `file-coding-system-alist', `process-coding-system-alist',
7695 or `network-coding-system-alist' depending on OPERATION.
7696 They may specify a coding system, a cons of coding systems,
7697 or a function symbol to call.
7698 In the last case, we call the function with one argument,
7699 which is a list of all the arguments given to this function.
7700
7701 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7702      (nargs, args)
7703      int nargs;
7704      Lisp_Object *args;
7705 {
7706   Lisp_Object operation, target_idx, target, val;
7707   register Lisp_Object chain;
7708
7709   if (nargs < 2)
7710     error ("Too few arguments");
7711   operation = args[0];
7712   if (!SYMBOLP (operation)
7713       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7714     error ("Invalid first arguement");
7715   if (nargs < 1 + XINT (target_idx))
7716     error ("Too few arguments for operation: %s",
7717            XSYMBOL (operation)->name->data);
7718   target = args[XINT (target_idx) + 1];
7719   if (!(STRINGP (target)
7720         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7721     error ("Invalid %dth argument", XINT (target_idx) + 1);
7722
7723   chain = ((EQ (operation, Qinsert_file_contents)
7724             || EQ (operation, Qwrite_region))
7725            ? Vfile_coding_system_alist
7726            : (EQ (operation, Qopen_network_stream)
7727               ? Vnetwork_coding_system_alist
7728               : Vprocess_coding_system_alist));
7729   if (NILP (chain))
7730     return Qnil;
7731
7732   for (; CONSP (chain); chain = XCDR (chain))
7733     {
7734       Lisp_Object elt;
7735
7736       elt = XCAR (chain);
7737       if (CONSP (elt)
7738           && ((STRINGP (target)
7739                && STRINGP (XCAR (elt))
7740                && fast_string_match (XCAR (elt), target) >= 0)
7741               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7742         {
7743           val = XCDR (elt);
7744           /* Here, if VAL is both a valid coding system and a valid
7745              function symbol, we return VAL as a coding system.  */
7746           if (CONSP (val))
7747             return val;
7748           if (! SYMBOLP (val))
7749             return Qnil;
7750           if (! NILP (Fcoding_system_p (val)))
7751             return Fcons (val, val);
7752           if (! NILP (Ffboundp (val)))
7753             {
7754               val = call1 (val, Flist (nargs, args));
7755               if (CONSP (val))
7756                 return val;
7757               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7758                 return Fcons (val, val);
7759             }
7760           return Qnil;
7761         }
7762     }
7763   return Qnil;
7764 }
7765
7766 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7767        Sset_coding_system_priority, 0, MANY, 0,
7768        doc: /* Assign higher priority to the coding systems given as arguments.
7769 If multiple coding systems belongs to the same category,
7770 all but the first one are ignored.  */)
7771      (nargs, args)
7772      int nargs;
7773      Lisp_Object *args;
7774 {
7775   int i, j;
7776   int changed[coding_category_max];
7777   enum coding_category priorities[coding_category_max];
7778
7779   bzero (changed, sizeof changed);
7780
7781   for (i = j = 0; i < nargs; i++)
7782     {
7783       enum coding_category category;
7784       Lisp_Object spec, attrs;
7785
7786       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7787       attrs = AREF (spec, 0);
7788       category = XINT (CODING_ATTR_CATEGORY (attrs));
7789       if (changed[category])
7790         /* Ignore this coding system because a coding system of the
7791            same category already had a higher priority.  */
7792         continue;
7793       changed[category] = 1;
7794       priorities[j++] = category;
7795       if (coding_categories[category].id >= 0
7796           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7797         setup_coding_system (args[i], &coding_categories[category]);
7798       Fset (AREF (Vcoding_category_table, category), args[i]);
7799     }
7800
7801   /* Now we have decided top J priorities.  Reflect the order of the
7802      original priorities to the remaining priorities.  */
7803
7804   for (i = j, j = 0; i < coding_category_max; i++, j++)
7805     {
7806       while (j < coding_category_max
7807              && changed[coding_priorities[j]])
7808         j++;
7809       if (j == coding_category_max)
7810         abort ();
7811       priorities[i] = coding_priorities[j];
7812     }
7813
7814   bcopy (priorities, coding_priorities, sizeof priorities);
7815
7816   /* Update `coding-category-list'.  */
7817   Vcoding_category_list = Qnil;
7818   for (i = coding_category_max - 1; i >= 0; i--)
7819     Vcoding_category_list
7820       = Fcons (AREF (Vcoding_category_table, priorities[i]),
7821                Vcoding_category_list);
7822
7823   return Qnil;
7824 }
7825
7826 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7827        Scoding_system_priority_list, 0, 1, 0,
7828        doc: /* Return a list of coding systems ordered by their priorities.
7829 HIGHESTP non-nil means just return the highest priority one.  */)
7830      (highestp)
7831      Lisp_Object highestp;
7832 {
7833   int i;
7834   Lisp_Object val;
7835
7836   for (i = 0, val = Qnil; i < coding_category_max; i++)
7837     {
7838       enum coding_category category = coding_priorities[i];
7839       int id = coding_categories[category].id;
7840       Lisp_Object attrs;
7841
7842       if (id < 0)
7843         continue;
7844       attrs = CODING_ID_ATTRS (id);
7845       if (! NILP (highestp))
7846         return CODING_ATTR_BASE_NAME (attrs);
7847       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7848     }
7849   return Fnreverse (val);
7850 }
7851
7852 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7853
7854 static Lisp_Object
7855 make_subsidiaries (base)
7856      Lisp_Object base;
7857 {
7858   Lisp_Object subsidiaries;
7859   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7860   char *buf = (char *) alloca (base_name_len + 6);
7861   int i;
7862
7863   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7864   subsidiaries = Fmake_vector (make_number (3), Qnil);
7865   for (i = 0; i < 3; i++)
7866     {
7867       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7868       ASET (subsidiaries, i, intern (buf));
7869     }
7870   return subsidiaries;
7871 }
7872
7873
7874 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7875        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7876        doc: /* For internal use only.
7877 usage: (define-coding-system-internal ...)  */)
7878      (nargs, args)
7879      int nargs;
7880      Lisp_Object *args;
7881 {
7882   Lisp_Object name;
7883   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7884   Lisp_Object attrs;            /* Vector of attributes.  */
7885   Lisp_Object eol_type;
7886   Lisp_Object aliases;
7887   Lisp_Object coding_type, charset_list, safe_charsets;
7888   enum coding_category category;
7889   Lisp_Object tail, val;
7890   int max_charset_id = 0;
7891   int i;
7892
7893   if (nargs < coding_arg_max)
7894     goto short_args;
7895
7896   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7897
7898   name = args[coding_arg_name];
7899   CHECK_SYMBOL (name);
7900   CODING_ATTR_BASE_NAME (attrs) = name;
7901
7902   val = args[coding_arg_mnemonic];
7903   if (! STRINGP (val))
7904     CHECK_CHARACTER (val);
7905   CODING_ATTR_MNEMONIC (attrs) = val;
7906
7907   coding_type = args[coding_arg_coding_type];
7908   CHECK_SYMBOL (coding_type);
7909   CODING_ATTR_TYPE (attrs) = coding_type;
7910
7911   charset_list = args[coding_arg_charset_list];
7912   if (SYMBOLP (charset_list))
7913     {
7914       if (EQ (charset_list, Qiso_2022))
7915         {
7916           if (! EQ (coding_type, Qiso_2022))
7917             error ("Invalid charset-list");
7918           charset_list = Viso_2022_charset_list;
7919         }
7920       else if (EQ (charset_list, Qemacs_mule))
7921         {
7922           if (! EQ (coding_type, Qemacs_mule))
7923             error ("Invalid charset-list");
7924           charset_list = Vemacs_mule_charset_list;
7925         }
7926       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7927         if (max_charset_id < XFASTINT (XCAR (tail)))
7928           max_charset_id = XFASTINT (XCAR (tail));
7929     }
7930   else
7931     {
7932       charset_list = Fcopy_sequence (charset_list);
7933       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7934         {
7935           struct charset *charset;
7936
7937           val = Fcar (tail);
7938           CHECK_CHARSET_GET_CHARSET (val, charset);
7939           if (EQ (coding_type, Qiso_2022)
7940               ? CHARSET_ISO_FINAL (charset) < 0
7941               : EQ (coding_type, Qemacs_mule)
7942               ? CHARSET_EMACS_MULE_ID (charset) < 0
7943               : 0)
7944             error ("Can't handle charset `%s'",
7945                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7946
7947           XCAR (tail) = make_number (charset->id);
7948           if (max_charset_id < charset->id)
7949             max_charset_id = charset->id;
7950         }
7951     }
7952   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7953
7954   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7955                                 make_number (255));
7956   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7957     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7958   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7959
7960   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7961
7962   val = args[coding_arg_decode_translation_table];
7963   if (! NILP (val))
7964     CHECK_CHAR_TABLE (val);
7965   CODING_ATTR_DECODE_TBL (attrs) = val;
7966
7967   val = args[coding_arg_encode_translation_table];
7968   if (! NILP (val))
7969     CHECK_CHAR_TABLE (val);
7970   CODING_ATTR_ENCODE_TBL (attrs) = val;
7971
7972   val = args[coding_arg_post_read_conversion];
7973   CHECK_SYMBOL (val);
7974   CODING_ATTR_POST_READ (attrs) = val;
7975
7976   val = args[coding_arg_pre_write_conversion];
7977   CHECK_SYMBOL (val);
7978   CODING_ATTR_PRE_WRITE (attrs) = val;
7979
7980   val = args[coding_arg_default_char];
7981   if (NILP (val))
7982     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7983   else
7984     {
7985       CHECK_CHARACTER (val);
7986       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7987     }
7988
7989   val = args[coding_arg_plist];
7990   CHECK_LIST (val);
7991   CODING_ATTR_PLIST (attrs) = val;
7992
7993   if (EQ (coding_type, Qcharset))
7994     {
7995       Lisp_Object list;
7996       /* Generate a lisp vector of 256 elements.  Each element is nil,
7997          integer, or a list of charset IDs.
7998
7999          If Nth element is nil, the byte code N is invalid in this
8000          coding system.
8001
8002          If Nth element is a number NUM, N is the first byte of a
8003          charset whose ID is NUM.
8004
8005          If Nth element is a list of charset IDs, N is the first byte
8006          of one of them.  The list is sorted by dimensions of the
8007          charsets.  A charset of smaller dimension comes firtst.
8008       */
8009       for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8010         {
8011           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8012
8013           if (charset->method == CHARSET_METHOD_SUPERSET)
8014             {
8015               val = CHARSET_SUPERSET (charset);
8016               for (; CONSP (val); val = XCDR (val))
8017                 list = Fcons (XCAR (XCAR (val)), list);
8018             }
8019           else
8020             list = Fcons (XCAR (tail), list);
8021         }
8022
8023       val = Fmake_vector (make_number (256), Qnil);
8024
8025       for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8026         {
8027           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8028           int dim = CHARSET_DIMENSION (charset);
8029           int idx = (dim - 1) * 4;
8030
8031           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8032             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8033
8034           for (i = charset->code_space[idx];
8035                i <= charset->code_space[idx + 1]; i++)
8036             {
8037               Lisp_Object tmp, tmp2;
8038               int dim2;
8039
8040               tmp = AREF (val, i);
8041               if (NILP (tmp))
8042                 tmp = XCAR (tail);
8043               else if (NUMBERP (tmp))
8044                 {
8045                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8046                   if (dim < dim2)
8047                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8048                   else
8049                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8050                 }
8051               else
8052                 {
8053                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8054                     {
8055                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8056                       if (dim < dim2)
8057                         break;
8058                     }
8059                   if (NILP (tmp2))
8060                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8061                   else
8062                     {
8063                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8064                       XSETCAR (tmp2, XCAR (tail));
8065                     }
8066                 }
8067               ASET (val, i, tmp);
8068             }
8069         }
8070       ASET (attrs, coding_attr_charset_valids, val);
8071       category = coding_category_charset;
8072     }
8073   else if (EQ (coding_type, Qccl))
8074     {
8075       Lisp_Object valids;
8076
8077       if (nargs < coding_arg_ccl_max)
8078         goto short_args;
8079
8080       val = args[coding_arg_ccl_decoder];
8081       CHECK_CCL_PROGRAM (val);
8082       if (VECTORP (val))
8083         val = Fcopy_sequence (val);
8084       ASET (attrs, coding_attr_ccl_decoder, val);
8085
8086       val = args[coding_arg_ccl_encoder];
8087       CHECK_CCL_PROGRAM (val);
8088       if (VECTORP (val))
8089         val = Fcopy_sequence (val);
8090       ASET (attrs, coding_attr_ccl_encoder, val);
8091
8092       val = args[coding_arg_ccl_valids];
8093       valids = Fmake_string (make_number (256), make_number (0));
8094       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8095         {
8096           int from, to;
8097
8098           val = Fcar (tail);
8099           if (INTEGERP (val))
8100             {
8101               from = to = XINT (val);
8102               if (from < 0 || from > 255)
8103                 args_out_of_range_3 (val, make_number (0), make_number (255));
8104             }
8105           else
8106             {
8107               CHECK_CONS (val);
8108               CHECK_NUMBER (XCAR (val));
8109               CHECK_NUMBER (XCDR (val));
8110               from = XINT (XCAR (val));
8111               if (from < 0 || from > 255)
8112                 args_out_of_range_3 (XCAR (val),
8113                                      make_number (0), make_number (255));
8114               to = XINT (XCDR (val));
8115               if (to < from || to > 255)
8116                 args_out_of_range_3 (XCDR (val),
8117                                      XCAR (val), make_number (255));
8118             }
8119           for (i = from; i <= to; i++)
8120             XSTRING (valids)->data[i] = 1;
8121         }
8122       ASET (attrs, coding_attr_ccl_valids, valids);
8123
8124       category = coding_category_ccl;
8125     }
8126   else if (EQ (coding_type, Qutf_16))
8127     {
8128       Lisp_Object bom, endian;
8129
8130       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8131
8132       if (nargs < coding_arg_utf16_max)
8133         goto short_args;
8134
8135       bom = args[coding_arg_utf16_bom];
8136       if (! NILP (bom) && ! EQ (bom, Qt))
8137         {
8138           CHECK_CONS (bom);
8139           CHECK_CODING_SYSTEM (XCAR (bom));
8140           CHECK_CODING_SYSTEM (XCDR (bom));
8141         }
8142       ASET (attrs, coding_attr_utf_16_bom, bom);
8143
8144       endian = args[coding_arg_utf16_endian];
8145       CHECK_SYMBOL (endian);
8146       if (NILP (endian))
8147         endian = Qbig;
8148       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8149         error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
8150       ASET (attrs, coding_attr_utf_16_endian, endian);
8151
8152       category = (CONSP (bom)
8153                   ? coding_category_utf_16_auto
8154                   : NILP (bom)
8155                   ? (EQ (endian, Qbig)
8156                      ? coding_category_utf_16_be_nosig
8157                      : coding_category_utf_16_le_nosig)
8158                   : (EQ (endian, Qbig)
8159                      ? coding_category_utf_16_be
8160                      : coding_category_utf_16_le));
8161     }
8162   else if (EQ (coding_type, Qiso_2022))
8163     {
8164       Lisp_Object initial, reg_usage, request, flags;
8165       int i;
8166
8167       if (nargs < coding_arg_iso2022_max)
8168         goto short_args;
8169
8170       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8171       CHECK_VECTOR (initial);
8172       for (i = 0; i < 4; i++)
8173         {
8174           val = Faref (initial, make_number (i));
8175           if (! NILP (val))
8176             {
8177               struct charset *charset;
8178
8179               CHECK_CHARSET_GET_CHARSET (val, charset);
8180               ASET (initial, i, make_number (CHARSET_ID (charset)));
8181               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8182                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8183             }
8184           else
8185             ASET (initial, i, make_number (-1));
8186         }
8187
8188       reg_usage = args[coding_arg_iso2022_reg_usage];
8189       CHECK_CONS (reg_usage);
8190       CHECK_NATNUM (XCAR (reg_usage));
8191       CHECK_NATNUM (XCDR (reg_usage));
8192
8193       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8194       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8195         {
8196           int id;
8197
8198           val = Fcar (tail);
8199           CHECK_CONS (val);
8200           CHECK_CHARSET_GET_ID (XCAR (val), id);
8201           CHECK_NATNUM (XCDR (val));
8202           if (XINT (XCDR (val)) >= 4)
8203             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8204           XCAR (val) = make_number (id);
8205         }
8206
8207       flags = args[coding_arg_iso2022_flags];
8208       CHECK_NATNUM (flags);
8209       i = XINT (flags);
8210       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8211         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8212
8213       ASET (attrs, coding_attr_iso_initial, initial);
8214       ASET (attrs, coding_attr_iso_usage, reg_usage);
8215       ASET (attrs, coding_attr_iso_request, request);
8216       ASET (attrs, coding_attr_iso_flags, flags);
8217       setup_iso_safe_charsets (attrs);
8218
8219       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8220         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8221                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8222                     ? coding_category_iso_7_else
8223                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8224                     ? coding_category_iso_7
8225                     : coding_category_iso_7_tight);
8226       else
8227         {
8228           int id = XINT (AREF (initial, 1));
8229
8230           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8231                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8232                        || id < 0)
8233                       ? coding_category_iso_8_else
8234                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8235                       ? coding_category_iso_8_1
8236                       : coding_category_iso_8_2);
8237         }
8238       if (category != coding_category_iso_8_1
8239           && category != coding_category_iso_8_2)
8240         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8241     }
8242   else if (EQ (coding_type, Qemacs_mule))
8243     {
8244       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8245         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8246       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8247       category = coding_category_emacs_mule;
8248     }
8249   else if (EQ (coding_type, Qshift_jis))
8250     {
8251
8252       struct charset *charset;
8253
8254       if (XINT (Flength (charset_list)) != 3)
8255         error ("There should be just three charsets");
8256
8257       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8258       if (CHARSET_DIMENSION (charset) != 1)
8259         error ("Dimension of charset %s is not one",
8260                XSYMBOL (CHARSET_NAME (charset))->name->data);
8261       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8262         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8263
8264       charset_list = XCDR (charset_list);
8265       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8266       if (CHARSET_DIMENSION (charset) != 1)
8267         error ("Dimension of charset %s is not one",
8268                XSYMBOL (CHARSET_NAME (charset))->name->data);
8269
8270       charset_list = XCDR (charset_list);
8271       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8272       if (CHARSET_DIMENSION (charset) != 2)
8273         error ("Dimension of charset %s is not two",
8274                XSYMBOL (CHARSET_NAME (charset))->name->data);
8275
8276       category = coding_category_sjis;
8277       Vsjis_coding_system = name;
8278     }
8279   else if (EQ (coding_type, Qbig5))
8280     {
8281       struct charset *charset;
8282
8283       if (XINT (Flength (charset_list)) != 2)
8284         error ("There should be just two charsets");
8285
8286       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8287       if (CHARSET_DIMENSION (charset) != 1)
8288         error ("Dimension of charset %s is not one",
8289                XSYMBOL (CHARSET_NAME (charset))->name->data);
8290       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8291         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8292
8293       charset_list = XCDR (charset_list);
8294       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8295       if (CHARSET_DIMENSION (charset) != 2)
8296         error ("Dimension of charset %s is not two",
8297                XSYMBOL (CHARSET_NAME (charset))->name->data);
8298
8299       category = coding_category_big5;
8300       Vbig5_coding_system = name;
8301     }
8302   else if (EQ (coding_type, Qraw_text))
8303     {
8304       category = coding_category_raw_text;
8305       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8306     }
8307   else if (EQ (coding_type, Qutf_8))
8308     {
8309       category = coding_category_utf_8;
8310       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8311     }
8312   else if (EQ (coding_type, Qundecided))
8313     category = coding_category_undecided;
8314   else
8315     error ("Invalid coding system type: %s",
8316            XSYMBOL (coding_type)->name->data);
8317
8318   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8319   CODING_ATTR_PLIST (attrs)
8320     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8321                                 CODING_ATTR_PLIST (attrs)));
8322
8323   eol_type = args[coding_arg_eol_type];
8324   if (! NILP (eol_type)
8325       && ! EQ (eol_type, Qunix)
8326       && ! EQ (eol_type, Qdos)
8327       && ! EQ (eol_type, Qmac))
8328     error ("Invalid eol-type");
8329
8330   aliases = Fcons (name, Qnil);
8331
8332   if (NILP (eol_type))
8333     {
8334       eol_type = make_subsidiaries (name);
8335       for (i = 0; i < 3; i++)
8336         {
8337           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8338
8339           this_name = AREF (eol_type, i);
8340           this_aliases = Fcons (this_name, Qnil);
8341           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8342           this_spec = Fmake_vector (make_number (3), attrs);
8343           ASET (this_spec, 1, this_aliases);
8344           ASET (this_spec, 2, this_eol_type);
8345           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8346           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8347           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8348                                         Vcoding_system_alist);
8349         }
8350     }
8351
8352   spec_vec = Fmake_vector (make_number (3), attrs);
8353   ASET (spec_vec, 1, aliases);
8354   ASET (spec_vec, 2, eol_type);
8355
8356   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8357   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8358   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8359                                 Vcoding_system_alist);
8360
8361   {
8362     int id = coding_categories[category].id;
8363
8364     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8365       setup_coding_system (name, &coding_categories[category]);
8366   }
8367
8368   return Qnil;
8369
8370  short_args:
8371   return Fsignal (Qwrong_number_of_arguments,
8372                   Fcons (intern ("define-coding-system-internal"),
8373                          make_number (nargs)));
8374 }
8375
8376 /* Fixme: should this record the alias relationships for
8377    diagnostics?  Should it update coding-system-list?  */
8378 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8379        Sdefine_coding_system_alias, 2, 2, 0,
8380        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8381      (alias, coding_system)
8382      Lisp_Object alias, coding_system;
8383 {
8384   Lisp_Object spec, aliases, eol_type;
8385
8386   CHECK_SYMBOL (alias);
8387   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8388   aliases = AREF (spec, 1);
8389   while (!NILP (XCDR (aliases)))
8390     aliases = XCDR (aliases);
8391   XCDR (aliases) = Fcons (alias, Qnil);
8392
8393   eol_type = AREF (spec, 2);
8394   if (VECTORP (eol_type))
8395     {
8396       Lisp_Object subsidiaries;
8397       int i;
8398
8399       subsidiaries = make_subsidiaries (alias);
8400       for (i = 0; i < 3; i++)
8401         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8402                                      AREF (eol_type, i));
8403
8404       ASET (spec, 2, subsidiaries);
8405     }
8406
8407   Fputhash (alias, spec, Vcoding_system_hash_table);
8408   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8409                                 Vcoding_system_alist);
8410
8411   return Qnil;
8412 }
8413
8414 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8415        1, 1, 0,
8416        doc: /* Return the base of CODING-SYSTEM.
8417 Any alias or subsidiary coding system is not a base coding system.  */)
8418   (coding_system)
8419      Lisp_Object coding_system;
8420 {
8421   Lisp_Object spec, attrs;
8422
8423   if (NILP (coding_system))
8424     return (Qno_conversion);
8425   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8426   attrs = AREF (spec, 0);
8427   return CODING_ATTR_BASE_NAME (attrs);
8428 }
8429
8430 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8431        1, 1, 0,
8432        doc: "Return the property list of CODING-SYSTEM.")
8433      (coding_system)
8434      Lisp_Object coding_system;
8435 {
8436   Lisp_Object spec, attrs;
8437
8438   if (NILP (coding_system))
8439     coding_system = Qno_conversion;
8440   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8441   attrs = AREF (spec, 0);
8442   return CODING_ATTR_PLIST (attrs);
8443 }
8444
8445
8446 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8447        1, 1, 0,
8448        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8449      (coding_system)
8450      Lisp_Object coding_system;
8451 {
8452   Lisp_Object spec;
8453
8454   if (NILP (coding_system))
8455     coding_system = Qno_conversion;
8456   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8457   return AREF (spec, 1);
8458 }
8459
8460 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8461        Scoding_system_eol_type, 1, 1, 0,
8462        doc: /* Return eol-type of CODING-SYSTEM.
8463 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8464
8465 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8466 and CR respectively.
8467
8468 A vector value indicates that a format of end-of-line should be
8469 detected automatically.  Nth element of the vector is the subsidiary
8470 coding system whose eol-type is N.  */)
8471      (coding_system)
8472      Lisp_Object coding_system;
8473 {
8474   Lisp_Object spec, eol_type;
8475   int n;
8476
8477   if (NILP (coding_system))
8478     coding_system = Qno_conversion;
8479   if (! CODING_SYSTEM_P (coding_system))
8480     return Qnil;
8481   spec = CODING_SYSTEM_SPEC (coding_system);
8482   eol_type = AREF (spec, 2);
8483   if (VECTORP (eol_type))
8484     return Fcopy_sequence (eol_type);
8485   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8486   return make_number (n);
8487 }
8488
8489 #endif /* emacs */
8490
8491 \f
8492 /*** 9. Post-amble ***/
8493
8494 void
8495 init_coding_once ()
8496 {
8497   int i;
8498
8499   for (i = 0; i < coding_category_max; i++)
8500     {
8501       coding_categories[i].id = -1;
8502       coding_priorities[i] = i;
8503     }
8504
8505   /* ISO2022 specific initialize routine.  */
8506   for (i = 0; i < 0x20; i++)
8507     iso_code_class[i] = ISO_control_0;
8508   for (i = 0x21; i < 0x7F; i++)
8509     iso_code_class[i] = ISO_graphic_plane_0;
8510   for (i = 0x80; i < 0xA0; i++)
8511     iso_code_class[i] = ISO_control_1;
8512   for (i = 0xA1; i < 0xFF; i++)
8513     iso_code_class[i] = ISO_graphic_plane_1;
8514   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8515   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8516   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8517   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8518   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8519   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8520   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8521   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8522   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8523   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8524
8525   inhibit_pre_post_conversion = 0;
8526
8527   for (i = 0; i < 256; i++)
8528     {
8529       emacs_mule_bytes[i] = 1;
8530     }
8531   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8532   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8533   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8534   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8535 }
8536
8537 #ifdef emacs
8538
8539 void
8540 syms_of_coding ()
8541 {
8542   staticpro (&Vcoding_system_hash_table);
8543   Vcoding_system_hash_table = Fmakehash (Qeq);
8544
8545   staticpro (&Vsjis_coding_system);
8546   Vsjis_coding_system = Qnil;
8547
8548   staticpro (&Vbig5_coding_system);
8549   Vbig5_coding_system = Qnil;
8550
8551   staticpro (&Vcode_conversion_work_buf_list);
8552   Vcode_conversion_work_buf_list = Qnil;
8553
8554   staticpro (&Vcode_conversion_reused_work_buf);
8555   Vcode_conversion_reused_work_buf = Qnil;
8556
8557   DEFSYM (Qcharset, "charset");
8558   DEFSYM (Qtarget_idx, "target-idx");
8559   DEFSYM (Qcoding_system_history, "coding-system-history");
8560   Fset (Qcoding_system_history, Qnil);
8561
8562   /* Target FILENAME is the first argument.  */
8563   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8564   /* Target FILENAME is the third argument.  */
8565   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8566
8567   DEFSYM (Qcall_process, "call-process");
8568   /* Target PROGRAM is the first argument.  */
8569   Fput (Qcall_process, Qtarget_idx, make_number (0));
8570
8571   DEFSYM (Qcall_process_region, "call-process-region");
8572   /* Target PROGRAM is the third argument.  */
8573   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8574
8575   DEFSYM (Qstart_process, "start-process");
8576   /* Target PROGRAM is the third argument.  */
8577   Fput (Qstart_process, Qtarget_idx, make_number (2));
8578
8579   DEFSYM (Qopen_network_stream, "open-network-stream");
8580   /* Target SERVICE is the fourth argument.  */
8581   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8582
8583   DEFSYM (Qcoding_system, "coding-system");
8584   DEFSYM (Qcoding_aliases, "coding-aliases");
8585
8586   DEFSYM (Qeol_type, "eol-type");
8587   DEFSYM (Qunix, "unix");
8588   DEFSYM (Qdos, "dos");
8589
8590   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8591   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8592   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8593   DEFSYM (Qdefault_char, "default-char");
8594   DEFSYM (Qundecided, "undecided");
8595   DEFSYM (Qno_conversion, "no-conversion");
8596   DEFSYM (Qraw_text, "raw-text");
8597
8598   DEFSYM (Qiso_2022, "iso-2022");
8599
8600   DEFSYM (Qutf_8, "utf-8");
8601
8602   DEFSYM (Qutf_16, "utf-16");
8603   DEFSYM (Qbig, "big");
8604   DEFSYM (Qlittle, "little");
8605
8606   DEFSYM (Qshift_jis, "shift-jis");
8607   DEFSYM (Qbig5, "big5");
8608
8609   DEFSYM (Qcoding_system_p, "coding-system-p");
8610
8611   DEFSYM (Qcoding_system_error, "coding-system-error");
8612   Fput (Qcoding_system_error, Qerror_conditions,
8613         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8614   Fput (Qcoding_system_error, Qerror_message,
8615         build_string ("Invalid coding system"));
8616
8617   /* Intern this now in case it isn't already done.
8618      Setting this variable twice is harmless.
8619      But don't staticpro it here--that is done in alloc.c.  */
8620   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8621
8622   DEFSYM (Qtranslation_table, "translation-table");
8623   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8624   DEFSYM (Qtranslation_table_id, "translation-table-id");
8625   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8626   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8627
8628   DEFSYM (Qvalid_codes, "valid-codes");
8629
8630   DEFSYM (Qemacs_mule, "emacs-mule");
8631
8632   DEFSYM (QCcategory, ":category");
8633
8634   Vcoding_category_table
8635     = Fmake_vector (make_number (coding_category_max), Qnil);
8636   staticpro (&Vcoding_category_table);
8637   /* Followings are target of code detection.  */
8638   ASET (Vcoding_category_table, coding_category_iso_7,
8639         intern ("coding-category-iso-7"));
8640   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8641         intern ("coding-category-iso-7-tight"));
8642   ASET (Vcoding_category_table, coding_category_iso_8_1,
8643         intern ("coding-category-iso-8-1"));
8644   ASET (Vcoding_category_table, coding_category_iso_8_2,
8645         intern ("coding-category-iso-8-2"));
8646   ASET (Vcoding_category_table, coding_category_iso_7_else,
8647         intern ("coding-category-iso-7-else"));
8648   ASET (Vcoding_category_table, coding_category_iso_8_else,
8649         intern ("coding-category-iso-8-else"));
8650   ASET (Vcoding_category_table, coding_category_utf_8,
8651         intern ("coding-category-utf-8"));
8652   ASET (Vcoding_category_table, coding_category_utf_16_be,
8653         intern ("coding-category-utf-16-be"));
8654   ASET (Vcoding_category_table, coding_category_utf_16_auto,
8655         intern ("coding-category-utf-16-auto"));
8656   ASET (Vcoding_category_table, coding_category_utf_16_le,
8657         intern ("coding-category-utf-16-le"));
8658   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8659         intern ("coding-category-utf-16-be-nosig"));
8660   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8661         intern ("coding-category-utf-16-le-nosig"));
8662   ASET (Vcoding_category_table, coding_category_charset,
8663         intern ("coding-category-charset"));
8664   ASET (Vcoding_category_table, coding_category_sjis,
8665         intern ("coding-category-sjis"));
8666   ASET (Vcoding_category_table, coding_category_big5,
8667         intern ("coding-category-big5"));
8668   ASET (Vcoding_category_table, coding_category_ccl,
8669         intern ("coding-category-ccl"));
8670   ASET (Vcoding_category_table, coding_category_emacs_mule,
8671         intern ("coding-category-emacs-mule"));
8672   /* Followings are NOT target of code detection.  */
8673   ASET (Vcoding_category_table, coding_category_raw_text,
8674         intern ("coding-category-raw-text"));
8675   ASET (Vcoding_category_table, coding_category_undecided,
8676         intern ("coding-category-undecided"));
8677
8678   defsubr (&Scoding_system_p);
8679   defsubr (&Sread_coding_system);
8680   defsubr (&Sread_non_nil_coding_system);
8681   defsubr (&Scheck_coding_system);
8682   defsubr (&Sdetect_coding_region);
8683   defsubr (&Sdetect_coding_string);
8684   defsubr (&Sfind_coding_systems_region_internal);
8685   defsubr (&Scheck_coding_systems_region);
8686   defsubr (&Sdecode_coding_region);
8687   defsubr (&Sencode_coding_region);
8688   defsubr (&Sdecode_coding_string);
8689   defsubr (&Sencode_coding_string);
8690   defsubr (&Sdecode_sjis_char);
8691   defsubr (&Sencode_sjis_char);
8692   defsubr (&Sdecode_big5_char);
8693   defsubr (&Sencode_big5_char);
8694   defsubr (&Sset_terminal_coding_system_internal);
8695   defsubr (&Sset_safe_terminal_coding_system_internal);
8696   defsubr (&Sterminal_coding_system);
8697   defsubr (&Sset_keyboard_coding_system_internal);
8698   defsubr (&Skeyboard_coding_system);
8699   defsubr (&Sfind_operation_coding_system);
8700   defsubr (&Sset_coding_system_priority);
8701   defsubr (&Sdefine_coding_system_internal);
8702   defsubr (&Sdefine_coding_system_alias);
8703   defsubr (&Scoding_system_base);
8704   defsubr (&Scoding_system_plist);
8705   defsubr (&Scoding_system_aliases);
8706   defsubr (&Scoding_system_eol_type);
8707   defsubr (&Scoding_system_priority_list);
8708
8709   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8710                doc: /* List of coding systems.
8711
8712 Do not alter the value of this variable manually.  This variable should be
8713 updated by the functions `define-coding-system' and
8714 `define-coding-system-alias'.  */);
8715   Vcoding_system_list = Qnil;
8716
8717   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8718                doc: /* Alist of coding system names.
8719 Each element is one element list of coding system name.
8720 This variable is given to `completing-read' as TABLE argument.
8721
8722 Do not alter the value of this variable manually.  This variable should be
8723 updated by the functions `make-coding-system' and
8724 `define-coding-system-alias'.  */);
8725   Vcoding_system_alist = Qnil;
8726
8727   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8728                doc: /* List of coding-categories (symbols) ordered by priority.
8729
8730 On detecting a coding system, Emacs tries code detection algorithms
8731 associated with each coding-category one by one in this order.  When
8732 one algorithm agrees with a byte sequence of source text, the coding
8733 system bound to the corresponding coding-category is selected.  */);
8734   {
8735     int i;
8736
8737     Vcoding_category_list = Qnil;
8738     for (i = coding_category_max - 1; i >= 0; i--)
8739       Vcoding_category_list
8740         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8741                  Vcoding_category_list);
8742   }
8743
8744   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8745                doc: /* Specify the coding system for read operations.
8746 It is useful to bind this variable with `let', but do not set it globally.
8747 If the value is a coding system, it is used for decoding on read operation.
8748 If not, an appropriate element is used from one of the coding system alists:
8749 There are three such tables, `file-coding-system-alist',
8750 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8751   Vcoding_system_for_read = Qnil;
8752
8753   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8754                doc: /* Specify the coding system for write operations.
8755 Programs bind this variable with `let', but you should not set it globally.
8756 If the value is a coding system, it is used for encoding of output,
8757 when writing it to a file and when sending it to a file or subprocess.
8758
8759 If this does not specify a coding system, an appropriate element
8760 is used from one of the coding system alists:
8761 There are three such tables, `file-coding-system-alist',
8762 `process-coding-system-alist', and `network-coding-system-alist'.
8763 For output to files, if the above procedure does not specify a coding system,
8764 the value of `buffer-file-coding-system' is used.  */);
8765   Vcoding_system_for_write = Qnil;
8766
8767   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8768                doc: /*
8769 Coding system used in the latest file or process I/O.  */);
8770   Vlast_coding_system_used = Qnil;
8771
8772   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8773                doc: /*
8774 *Non-nil means always inhibit code conversion of end-of-line format.
8775 See info node `Coding Systems' and info node `Text and Binary' concerning
8776 such conversion.  */);
8777   inhibit_eol_conversion = 0;
8778
8779   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8780                doc: /*
8781 Non-nil means process buffer inherits coding system of process output.
8782 Bind it to t if the process output is to be treated as if it were a file
8783 read from some filesystem.  */);
8784   inherit_process_coding_system = 0;
8785
8786   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8787                doc: /*
8788 Alist to decide a coding system to use for a file I/O operation.
8789 The format is ((PATTERN . VAL) ...),
8790 where PATTERN is a regular expression matching a file name,
8791 VAL is a coding system, a cons of coding systems, or a function symbol.
8792 If VAL is a coding system, it is used for both decoding and encoding
8793 the file contents.
8794 If VAL is a cons of coding systems, the car part is used for decoding,
8795 and the cdr part is used for encoding.
8796 If VAL is a function symbol, the function must return a coding system
8797 or a cons of coding systems which are used as above.  The function gets
8798 the arguments with which `find-operation-coding-systems' was called.
8799
8800 See also the function `find-operation-coding-system'
8801 and the variable `auto-coding-alist'.  */);
8802   Vfile_coding_system_alist = Qnil;
8803
8804   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8805                doc: /*
8806 Alist to decide a coding system to use for a process I/O operation.
8807 The format is ((PATTERN . VAL) ...),
8808 where PATTERN is a regular expression matching a program name,
8809 VAL is a coding system, a cons of coding systems, or a function symbol.
8810 If VAL is a coding system, it is used for both decoding what received
8811 from the program and encoding what sent to the program.
8812 If VAL is a cons of coding systems, the car part is used for decoding,
8813 and the cdr part is used for encoding.
8814 If VAL is a function symbol, the function must return a coding system
8815 or a cons of coding systems which are used as above.
8816
8817 See also the function `find-operation-coding-system'.  */);
8818   Vprocess_coding_system_alist = Qnil;
8819
8820   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8821                doc: /*
8822 Alist to decide a coding system to use for a network I/O operation.
8823 The format is ((PATTERN . VAL) ...),
8824 where PATTERN is a regular expression matching a network service name
8825 or is a port number to connect to,
8826 VAL is a coding system, a cons of coding systems, or a function symbol.
8827 If VAL is a coding system, it is used for both decoding what received
8828 from the network stream and encoding what sent to the network stream.
8829 If VAL is a cons of coding systems, the car part is used for decoding,
8830 and the cdr part is used for encoding.
8831 If VAL is a function symbol, the function must return a coding system
8832 or a cons of coding systems which are used as above.
8833
8834 See also the function `find-operation-coding-system'.  */);
8835   Vnetwork_coding_system_alist = Qnil;
8836
8837   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8838                doc: /* Coding system to use with system messages.
8839 Also used for decoding keyboard input on X Window system.  */);
8840   Vlocale_coding_system = Qnil;
8841
8842   /* The eol mnemonics are reset in startup.el system-dependently.  */
8843   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8844                doc: /*
8845 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8846   eol_mnemonic_unix = build_string (":");
8847
8848   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8849                doc: /*
8850 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8851   eol_mnemonic_dos = build_string ("\\");
8852
8853   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8854                doc: /*
8855 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8856   eol_mnemonic_mac = build_string ("/");
8857
8858   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8859                doc: /*
8860 *String displayed in mode line when end-of-line format is not yet determined.  */);
8861   eol_mnemonic_undecided = build_string (":");
8862
8863   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8864                doc: /*
8865 *Non-nil enables character translation while encoding and decoding.  */);
8866   Venable_character_translation = Qt;
8867
8868   DEFVAR_LISP ("standard-translation-table-for-decode",
8869                &Vstandard_translation_table_for_decode,
8870                doc: /* Table for translating characters while decoding.  */);
8871   Vstandard_translation_table_for_decode = Qnil;
8872
8873   DEFVAR_LISP ("standard-translation-table-for-encode",
8874                &Vstandard_translation_table_for_encode,
8875                doc: /* Table for translating characters while encoding.  */);
8876   Vstandard_translation_table_for_encode = Qnil;
8877
8878   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8879                doc: /* Alist of charsets vs revision numbers.
8880 While encoding, if a charset (car part of an element) is found,
8881 designate it with the escape sequence identifying revision (cdr part
8882 of the element).  */);
8883   Vcharset_revision_table = Qnil;
8884
8885   DEFVAR_LISP ("default-process-coding-system",
8886                &Vdefault_process_coding_system,
8887                doc: /* Cons of coding systems used for process I/O by default.
8888 The car part is used for decoding a process output,
8889 the cdr part is used for encoding a text to be sent to a process.  */);
8890   Vdefault_process_coding_system = Qnil;
8891
8892   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8893                doc: /*
8894 Table of extra Latin codes in the range 128..159 (inclusive).
8895 This is a vector of length 256.
8896 If Nth element is non-nil, the existence of code N in a file
8897 \(or output of subprocess) doesn't prevent it to be detected as
8898 a coding system of ISO 2022 variant which has a flag
8899 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8900 or reading output of a subprocess.
8901 Only 128th through 159th elements has a meaning.  */);
8902   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8903
8904   DEFVAR_LISP ("select-safe-coding-system-function",
8905                &Vselect_safe_coding_system_function,
8906                doc: /*
8907 Function to call to select safe coding system for encoding a text.
8908
8909 If set, this function is called to force a user to select a proper
8910 coding system which can encode the text in the case that a default
8911 coding system used in each operation can't encode the text.
8912
8913 The default value is `select-safe-coding-system' (which see).  */);
8914   Vselect_safe_coding_system_function = Qnil;
8915
8916   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8917                &inhibit_iso_escape_detection,
8918                doc: /*
8919 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8920
8921 By default, on reading a file, Emacs tries to detect how the text is
8922 encoded.  This code detection is sensitive to escape sequences.  If
8923 the sequence is valid as ISO2022, the code is determined as one of
8924 the ISO2022 encodings, and the file is decoded by the corresponding
8925 coding system (e.g. `iso-2022-7bit').
8926
8927 However, there may be a case that you want to read escape sequences in
8928 a file as is.  In such a case, you can set this variable to non-nil.
8929 Then, as the code detection ignores any escape sequences, no file is
8930 detected as encoded in some ISO2022 encoding.  The result is that all
8931 escape sequences become visible in a buffer.
8932
8933 The default value is nil, and it is strongly recommended not to change
8934 it.  That is because many Emacs Lisp source files that contain
8935 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8936 in Emacs's distribution, and they won't be decoded correctly on
8937 reading if you suppress escape sequence detection.
8938
8939 The other way to read escape sequences in a file without decoding is
8940 to explicitly specify some coding system that doesn't use ISO2022's
8941 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8942   inhibit_iso_escape_detection = 0;
8943
8944   {
8945     Lisp_Object args[coding_arg_max];
8946     Lisp_Object plist[14];
8947     int i;
8948
8949     for (i = 0; i < coding_arg_max; i++)
8950       args[i] = Qnil;
8951
8952     plist[0] = intern (":name");
8953     plist[1] = args[coding_arg_name] = Qno_conversion;
8954     plist[2] = intern (":mnemonic");
8955     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8956     plist[4] = intern (":coding-type");
8957     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8958     plist[6] = intern (":ascii-compatible-p");
8959     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8960     plist[8] = intern (":default-char");
8961     plist[9] = args[coding_arg_default_char] = make_number (0);
8962     plist[10] = intern (":docstring");
8963     plist[11] = build_string ("Do no conversion.\n\
8964 \n\
8965 When you visit a file with this coding, the file is read into a\n\
8966 unibyte buffer as is, thus each byte of a file is treated as a\n\
8967 character.");
8968     plist[12] = intern (":eol-type");
8969     plist[13] = args[coding_arg_eol_type] = Qunix;
8970     args[coding_arg_plist] = Flist (14, plist);
8971     Fdefine_coding_system_internal (coding_arg_max, args);
8972   }
8973
8974   setup_coding_system (Qno_conversion, &keyboard_coding);
8975   setup_coding_system (Qno_conversion, &terminal_coding);
8976   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8977
8978   {
8979     int i;
8980
8981     for (i = 0; i < coding_category_max; i++)
8982       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
8983   }
8984 }
8985
8986 char *
8987 emacs_strerror (error_number)
8988      int error_number;
8989 {
8990   char *str;
8991
8992   synchronize_system_messages_locale ();
8993   str = strerror (error_number);
8994
8995   if (! NILP (Vlocale_coding_system))
8996     {
8997       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8998                                                       Vlocale_coding_system,
8999                                                       0);
9000       str = (char *) XSTRING (dec)->data;
9001     }
9002
9003   return str;
9004 }
9005
9006 #endif /* emacs */