code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  59   the C level, a coding system is represented by a vector of
  60   attributes stored in the hash table Vcoding_system_hash_table.  The
  61   conversion from a coding system symbol to its attributes vector is
  62   done by looking up the symbol in Vcoding_system_hash_table.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   const unsigned char *src = coding->source;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted,
 170          jump to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172       /* Reject on the first non-conforming byte; record positive evidence only for a byte that is characteristic of XXX.  */
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source was exhausted without meeting an invalid byte.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   const unsigned char *src = coding->source + coding->consumed;
 209   const unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there are not enough source bytes, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   const unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf + coding->charbuf_used;
 216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf >= charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that the source area and the destination area
 257   overlap, which means that we can produce encoded text only until it
 258   reaches the head of the not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273   /* Stop early enough that one more iteration cannot write past DST_END.  */
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300 /* NOTE(review): presumably the hash table that maps a coding system symbol to its attribute vector (see the CODING SYSTEM notes in section 0) -- confirm.  */
 301 Lisp_Object Vcoding_system_hash_table;
 302 /* NOTE(review): the Q... variables are presumably Lisp symbols and the QC... ones keyword symbols; their initialization is not visible here -- confirm.  */
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; /* NOTE(review): "defalut" looks like a misspelling of "default"; a rename must update every use.  */
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322 /* NOTE(review): presumably symbols naming the outcomes reported through CODING_RESULT_XXX codes -- confirm.  */
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 int coding_system_require_warning;
 327
 328 Lisp_Object Vselect_safe_coding_system_function;
 329
 330 /* Mnemonic string for each format of end-of-line.  */
 331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 332 /* Mnemonic string to indicate format of end-of-line is not yet
 333    decided.  */
 334 Lisp_Object eol_mnemonic_undecided;
 335
 336 #ifdef emacs
 337 /* Everything down to the matching #endif is compiled only when `emacs' is defined.  */
 338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding-system for reading files and receiving data from process.  */
 350 Lisp_Object Vcoding_system_for_read;
 351 /* Coding-system for writing files and sending data to process.  */
 352 Lisp_Object Vcoding_system_for_write;
 353 /* Coding-system actually used in the latest I/O.  */
 354 Lisp_Object Vlast_coding_system_used;
 355 /* Set to non-nil when an error is detected during code conversion.  */
 356 Lisp_Object Vlast_code_conversion_error;
 357 /* A vector of length 256 which contains information about special
 358    Latin codes (especially for dealing with Microsoft codes).  */
 359 Lisp_Object Vlatin_extra_code_table;
 360
 361 /* Flag to inhibit code conversion of end-of-line format.  */
 362 int inhibit_eol_conversion;
 363
 364 /* Flag to inhibit ISO2022 escape sequence detection.  */
 365 int inhibit_iso_escape_detection;
 366
 367 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 368 int inherit_process_coding_system;
 369
 370 /* Coding system to be used to encode text for terminal display.  */
 371 struct coding_system terminal_coding;
 372
 373 /* Coding system to be used to encode text for terminal display when
 374    terminal coding system is nil.  */
 375 struct coding_system safe_terminal_coding;
 376
 377 /* Coding system of what is sent from terminal keyboard.  */
 378 struct coding_system keyboard_coding;
 379 /* NOTE(review): presumably alists used to choose a coding system for file, process and network I/O respectively -- confirm.  */
 380 Lisp_Object Vfile_coding_system_alist;
 381 Lisp_Object Vprocess_coding_system_alist;
 382 Lisp_Object Vnetwork_coding_system_alist;
 383 /* NOTE(review): presumably the coding system associated with the current locale -- confirm.  */
 384 Lisp_Object Vlocale_coding_system;
 385
 386 #endif /* emacs */
 387
 388 /* Flag (held in a Lisp_Object, not a C int) telling whether translation
 389    tables are looked up during character code conversion.  */
 390 Lisp_Object Venable_character_translation;
 391 /* Standard translation table to look up on decoding (reading).  */
 392 Lisp_Object Vstandard_translation_table_for_decode;
 393 /* Standard translation table to look up on encoding (writing).  */
 394 Lisp_Object Vstandard_translation_table_for_encode;
 395 /* Symbols related to translation tables.  NOTE(review): their exact roles are not visible in this part of the file.  */
 396 Lisp_Object Qtranslation_table;
 397 Lisp_Object Qtranslation_table_id;
 398 Lisp_Object Qtranslation_table_for_decode;
 399 Lisp_Object Qtranslation_table_for_encode;
 400
 401 /* Alist mapping charsets to revision numbers.  Local to this file.  */
 402 static Lisp_Object Vcharset_revision_table;
 403
 404 /* Default coding systems used for process I/O.  */
 405 Lisp_Object Vdefault_process_coding_system;
 406
 407 /* Char table for translating Quail and self-inserting input.  */
 408 Lisp_Object Vtranslation_table_for_input;
 409
 410 /* Two special coding systems.  NOTE(review): presumably the ones used for Shift-JIS and BIG5 conversion -- confirm.  */
 411 Lisp_Object Vsjis_coding_system;
 412 Lisp_Object Vbig5_coding_system;
 413
 414 /* ISO2022 section */
 415 /* Integer stored at index REG of the `coding_attr_iso_initial' attribute of CODING.  NOTE(review): presumably the charset initially designated to graphic register REG -- confirm.  */
 416 #define CODING_ISO_INITIAL(coding, reg)                 \
 417   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 418                      coding_attr_iso_initial),          \
 419                reg)))
 420
 421 /* Entry of CODING->safe_charsets for CHARSET_ID, or -1 if CHARSET_ID is greater than CODING->max_charset_id.  CHARSET_ID is evaluated twice, so it must have no side effects.  */
 422 #define CODING_ISO_REQUEST(coding, charset_id)  \
 423   (((charset_id) <= (coding)->max_charset_id    \
 424     ? (coding)->safe_charsets[charset_id]       \
 425     : -1))
 426
 427 /* Accessors for the ISO2022 state kept in CODING->spec.iso_2022.  CODING_ISO_INVOKED_CHARSET yields the designation of the register currently invoked to PLANE.  */
 428 #define CODING_ISO_FLAGS(coding)        \
 429   ((coding)->spec.iso_2022.flags)
 430 #define CODING_ISO_DESIGNATION(coding, reg)     \
 431   ((coding)->spec.iso_2022.current_designation[reg])
 432 #define CODING_ISO_INVOCATION(coding, plane)    \
 433   ((coding)->spec.iso_2022.current_invocation[plane])
 434 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 435   ((coding)->spec.iso_2022.single_shifting)
 436 #define CODING_ISO_BOL(coding)  \
 437   ((coding)->spec.iso_2022.bol)
 438 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 439   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 440
 441 /* Control characters of ISO2022.  */
 442                         /* code */      /* function */
 443 #define ISO_CODE_LF     0x0A            /* line-feed */
 444 #define ISO_CODE_CR     0x0D            /* carriage-return */
 445 #define ISO_CODE_SO     0x0E            /* shift-out */
 446 #define ISO_CODE_SI     0x0F            /* shift-in */
 447 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 448 #define ISO_CODE_ESC    0x1B            /* escape */
 449 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 450 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 451 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 452
 453 /* Every 1-byte code of ISO2022 is classified into one of the
 454    following classes.  */
 455 enum iso_code_class_type
 456   {
 457     ISO_control_0,              /* Control codes in the range
 458                                    0x00..0x1F and 0x7F, except for the
 459                                    following 4 codes.  */
 460     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 461     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 462     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 463     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 464     ISO_control_1,              /* Control codes in the range
 465                                    0x80..0x9F, except for the
 466                                    following 3 codes.  */
 467     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 468     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 469     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 470     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 471     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 472     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 473     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 474   };
 475
 476 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 477     `iso-flags' attribute of an iso2022 coding system.  */
 478
 479 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 480    instead of the correct short-form sequence (e.g. ESC $ A).  */
 481 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 482
 483 /* If set, reset graphic planes and registers at end-of-line to the
 484    initial state.  */
 485 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 486
 487 /* If set, reset graphic planes and registers before any control
 488    characters to the initial state.  */
 489 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 490
 491 /* If set, encode by 7-bit environment.  */
 492 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 493
 494 /* If set, use locking-shift function.  */
 495 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 496
 497 /* If set, use single-shift function.  Overwrite
 498    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 499 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 500
 501 /* If set, use designation escape sequence.  */
 502 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 503
 504 /* If set, produce revision number sequence.  */
 505 #define CODING_ISO_FLAG_REVISION        0x0080
 506
 507 /* If set, produce ISO6429's direction specifying sequence.  */
 508 #define CODING_ISO_FLAG_DIRECTION       0x0100
 509
 510 /* If set, assume designation states are reset at beginning of line on
 511    output.  */
 512 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 513
 514 /* If set, designation sequence should be placed at beginning of line
 515    on output.  */
 516 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 517
 518 /* If set, do not encode unsafe characters on output.  */
 519 #define CODING_ISO_FLAG_SAFE            0x0800
 520
 521 /* If set, extra latin codes (128..159) are accepted as a valid code
 522    on input.  */
 523 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 524 /* NOTE(review): the five flags below carry no description here; take their meaning from the code that tests them.  */
 525 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 526
 527 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 528
 529 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 530
 531 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 532 /* NOTE(review): 0x100000 skips bits 0x20000..0x80000 -- confirm that the gap is intentional.  */
 533 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 534
 535 /* A character to be produced on output if encoding of the original
 536    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 537 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 538
 539
 540 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 541 #define CODING_UTF_16_BOM(coding)       \
 542   ((coding)->spec.utf_16.bom)
 543
 544 #define CODING_UTF_16_ENDIAN(coding)    \
 545   ((coding)->spec.utf_16.endian)
 546
 547 #define CODING_UTF_16_SURROGATE(coding) \
 548   ((coding)->spec.utf_16.surrogate)
 549
 550
 551 /* CCL section: each macro fetches one CCL-related attribute of CODING, looked up by CODING->id; CODING_CCL_VALIDS additionally applies SDATA to it.  */
 552 #define CODING_CCL_DECODER(coding)      \
 553   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 554 #define CODING_CCL_ENCODER(coding)      \
 555   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 556 #define CODING_CCL_VALIDS(coding)                                          \
 557   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 558
 559 /* Index for each coding category in `coding_categories'.  */
 560 /* Each value is also the bit position of the matching CATEGORY_MASK_XXX, and coding_category_max sizes the per-category arrays.  */
 561 enum coding_category
 562   {
 563     coding_category_iso_7,
 564     coding_category_iso_7_tight,
 565     coding_category_iso_8_1,
 566     coding_category_iso_8_2,
 567     coding_category_iso_7_else,
 568     coding_category_iso_8_else,
 569     coding_category_utf_8,
 570     coding_category_utf_16_auto,
 571     coding_category_utf_16_be,
 572     coding_category_utf_16_le,
 573     coding_category_utf_16_be_nosig,
 574     coding_category_utf_16_le_nosig,
 575     coding_category_charset,
 576     coding_category_sjis,
 577     coding_category_big5,
 578     coding_category_ccl,
 579     coding_category_emacs_mule,
 580     /* All above are targets of code detection.  */
 581     coding_category_raw_text,
 582     coding_category_undecided,
 583     coding_category_max
 584   };
 585
 586 /* Definitions of flag bits used in detect_coding_XXXX: one bit per `enum coding_category' value.  No mask is defined for coding_category_undecided.  */
 587 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 588 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 589 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 590 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 591 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 592 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 593 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 594 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 595 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 596 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 597 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 598 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 599 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 600 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 601 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 602 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 603 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 604 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 605
 606 /* This value is returned if detect_coding_mask () finds nothing other
 607    than ASCII characters.  */
 608 #define CATEGORY_MASK_ANY               \
 609   (CATEGORY_MASK_ISO_7                  \
 610    | CATEGORY_MASK_ISO_7_TIGHT          \
 611    | CATEGORY_MASK_ISO_8_1              \
 612    | CATEGORY_MASK_ISO_8_2              \
 613    | CATEGORY_MASK_ISO_7_ELSE           \
 614    | CATEGORY_MASK_ISO_8_ELSE           \
 615    | CATEGORY_MASK_UTF_8                \
 616    | CATEGORY_MASK_UTF_16_BE            \
 617    | CATEGORY_MASK_UTF_16_LE            \
 618    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 619    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 620    | CATEGORY_MASK_CHARSET              \
 621    | CATEGORY_MASK_SJIS                 \
 622    | CATEGORY_MASK_BIG5                 \
 623    | CATEGORY_MASK_CCL                  \
 624    | CATEGORY_MASK_EMACS_MULE)
 625 /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO is in neither CATEGORY_MASK_ANY above nor CATEGORY_MASK_UTF_16 below, although utf_16_auto is listed among the detection targets -- confirm this is intended.  */
 626 /* Convenience unions of the single-category masks.  */
 627 #define CATEGORY_MASK_ISO_7BIT \
 628   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 629
 630 #define CATEGORY_MASK_ISO_8BIT \
 631   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 632
 633 #define CATEGORY_MASK_ISO_ELSE \
 634   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 635
 636 #define CATEGORY_MASK_ISO_ESCAPE        \
 637   (CATEGORY_MASK_ISO_7                  \
 638    | CATEGORY_MASK_ISO_7_TIGHT          \
 639    | CATEGORY_MASK_ISO_7_ELSE           \
 640    | CATEGORY_MASK_ISO_8_ELSE)
 641
 642 #define CATEGORY_MASK_ISO       \
 643   (  CATEGORY_MASK_ISO_7BIT     \
 644      | CATEGORY_MASK_ISO_8BIT   \
 645      | CATEGORY_MASK_ISO_ELSE)
 646
 647 #define CATEGORY_MASK_UTF_16            \
 648   (CATEGORY_MASK_UTF_16_BE              \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 652
 653
 654 /* List of symbols `coding-category-xxx' ordered by priority.  This
 655    variable is exposed to Emacs Lisp.  */
 656 static Lisp_Object Vcoding_category_list;
 657
 658 /* Table of coding categories (Lisp symbols).  This variable is for
 659    internal use only.  */
 660 static Lisp_Object Vcoding_category_table;
 661
 662 /* Table of coding-categories ordered by priority.  It has one slot
       per `enum coding_category' value (coding_category_max in all).  */
 663 static enum coding_category coding_priorities[coding_category_max];
 664
 665 /* Nth element is a coding context for the coding system bound to the
 666    Nth coding category.  */
 667 static struct coding_system coding_categories[coding_category_max];
 668
 669 /*** Commonly used macros and functions ***/
 670 /* Fallback min/max, defined only when not already provided.  The selected argument is evaluated twice, so avoid arguments with side effects.  */
 671 #ifndef min
 672 #define min(a, b) ((a) < (b) ? (a) : (b))
 673 #endif
 674 #ifndef max
 675 #define max(a, b) ((a) > (b) ? (a) : (b))
 676 #endif
 677 /* Set ATTRS to the attributes of CODING (looked up by CODING->id), and CHARSET_LIST to the charset list stored in those attributes.  Both must be assignable lvalues.  */
 678 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 679   do {                                                  \
 680     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 681     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 682   } while (0)
 683
 684
 685 /* Get one byte from SRC (which ends at SRC_END) into C and increment
 686    CONSUMED_CHARS.  If no byte is left, jump to `no_more_source',
 687    recording CODING_RESULT_INSUFFICIENT_SRC first when SRC_BASE < SRC.
 688    If MULTIBYTEP is nonzero and the byte is >= 0x80: a lead byte 0xC0
 689    or 0xC1 is merged with the next byte (read with no SRC_END check)
 690    into one byte value; otherwise C is set to the negated character
 691    code and CODING_RESULT_INVALID_SRC is recorded.  The caller must
 692    provide: src, src_end, src_base, multibytep, consumed_chars, coding.  */
 693 #define ONE_MORE_BYTE(c)                                \
 694   do {                                                  \
 695     if (src == src_end)                                 \
 696       {                                                 \
 697         if (src_base < src)                             \
 698           record_conversion_result                      \
 699             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 700         goto no_more_source;                            \
 701       }                                                 \
 702     c = *src++;                                         \
 703     if (multibytep && (c & 0x80))                       \
 704       {                                                 \
 705         if ((c & 0xFE) == 0xC0)                         \
 706           c = ((c & 1) << 6) | *src++;                  \
 707         else                                            \
 708           {                                             \
 709             c = - string_char (--src, &src, NULL);      \
 710             record_conversion_result                    \
 711               (coding, CODING_RESULT_INVALID_SRC);      \
 712           }                                             \
 713       }                                                 \
 714     consumed_chars++;                                   \
 715   } while (0)
 716
 717 /* Like ONE_MORE_BYTE, but without the end-of-source test and the jump to `no_more_source'; the caller must already know that a byte is available at SRC.  */
 718 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 719   do {                                                  \
 720     c = *src++;                                         \
 721     if (multibytep && (c & 0x80))                       \
 722       {                                                 \
 723         if ((c & 0xFE) == 0xC0)                         \
 724           c = ((c & 1) << 6) | *src++;                  \
 725         else                                            \
 726           {                                             \
 727             c = - string_char (--src, &src, NULL);      \
 728             record_conversion_result                    \
 729               (coding, CODING_RESULT_INVALID_SRC);      \
 730           }                                             \
 731       }                                                 \
 732     consumed_chars++;                                   \
 733   } while (0)
 734
 735
 736 /* Store a byte C in the place pointed to by DST, advance DST to the
 737    next free point, and increment PRODUCED_CHARS.  The caller should
 738    ensure that C is 0..127, and declare and set the variables `dst'
 739    and `produced_chars' appropriately in advance.
 740 */
 741
 742
 743 #define EMIT_ONE_ASCII_BYTE(c)  \
 744   do {                          \
 745     produced_chars++;           \
 746     *dst++ = (c);               \
 747   } while (0)
 748
 749
 750 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 and C2, and add 2 to PRODUCED_CHARS.  */
 751
 752 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 753   do {                                  \
 754     produced_chars += 2;                \
 755     *dst++ = (c1), *dst++ = (c2);       \
 756   } while (0)
 757
 758
 759 /* Store a byte C in the place pointed to by DST, advance DST to the
 760    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 761    nonzero, store it in multibyte form: a value >= 0x80 is first
 762    converted by BYTE8_TO_CHAR.  The caller should declare and set the
 763    variables `dst', `multibytep' and `produced_chars' in advance.  */
 764
 765 #define EMIT_ONE_BYTE(c)                \
 766   do {                                  \
 767     produced_chars++;                   \
 768     if (multibytep)                     \
 769       {                                 \
 770         int ch = (c);                   \
 771         if (ch >= 0x80)                 \
 772           ch = BYTE8_TO_CHAR (ch);      \
 773         CHAR_STRING_ADVANCE (ch, dst);  \
 774       }                                 \
 775     else                                \
 776       *dst++ = (c);                     \
 777   } while (0)
 778
 779
 780 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 781
 782 #define EMIT_TWO_BYTES(c1, c2)          \
 783   do {                                  \
 784     produced_chars += 2;                \
 785     if (multibytep)                     \
 786       {                                 \
 787         int ch;                         \
 788                                         \
 789         ch = (c1);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793         ch = (c2);                      \
 794         if (ch >= 0x80)                 \
 795           ch = BYTE8_TO_CHAR (ch);      \
 796         CHAR_STRING_ADVANCE (ch, dst);  \
 797       }                                 \
 798     else                                \
 799       {                                 \
 800         *dst++ = (c1);                  \
 801         *dst++ = (c2);                  \
 802       }                                 \
 803   } while (0)
 804
 805
 806 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 807   do {                                  \
 808     EMIT_ONE_BYTE (c1);                 \
 809     EMIT_TWO_BYTES (c2, c3);            \
 810   } while (0)
 811
 812
 813 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 814   do {                                          \
 815     EMIT_TWO_BYTES (c1, c2);                    \
 816     EMIT_TWO_BYTES (c3, c4);                    \
 817   } while (0)
 818
 819
/* Prototypes for static functions.  */

/* Recording the outcome of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* Per-coding-system detectors, decoders and encoders.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers (ISO-2022 setup, end-of-line handling,
   translation, annotations, and the conversion drivers).  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
                                        int, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_composition P_ ((struct coding_system *, int *,
                                            EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 898
 899 static void
 900 record_conversion_result (struct coding_system *coding,
 901                           enum coding_result_code result)
 902 {
 903   coding->result = result;
 904   switch (result)
 905     {
 906     case CODING_RESULT_INSUFFICIENT_SRC:
 907       Vlast_code_conversion_error = Qinsufficient_source;
 908       break;
 909     case CODING_RESULT_INCONSISTENT_EOL:
 910       Vlast_code_conversion_error = Qinconsistent_eol;
 911       break;
 912     case CODING_RESULT_INVALID_SRC:
 913       Vlast_code_conversion_error = Qinvalid_source;
 914       break;
 915     case CODING_RESULT_INTERRUPT:
 916       Vlast_code_conversion_error = Qinterrupted;
 917       break;
 918     case CODING_RESULT_INSUFFICIENT_MEM:
 919       Vlast_code_conversion_error = Qinsufficient_memory;
 920       break;
 921     }
 922 }
 923
 924 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 925   do {                                                                       \
 926     charset_map_loaded = 0;                                                  \
 927     c = DECODE_CHAR (charset, code);                                         \
 928     if (charset_map_loaded)                                                  \
 929       {                                                                      \
 930         const unsigned char *orig = coding->source;                          \
 931         EMACS_INT offset;                                                    \
 932                                                                              \
 933         coding_set_source (coding);                                          \
 934         offset = coding->source - orig;                                      \
 935         src += offset;                                                       \
 936         src_base += offset;                                                  \
 937         src_end += offset;                                                   \
 938       }                                                                      \
 939   } while (0)
 940
 941
 942 #define ASSURE_DESTINATION(bytes)                               \
 943   do {                                                          \
 944     if (dst + (bytes) >= dst_end)                               \
 945       {                                                         \
 946         int more_bytes = charbuf_end - charbuf + (bytes);       \
 947                                                                 \
 948         dst = alloc_destination (coding, more_bytes, dst);      \
 949         dst_end = coding->destination + coding->dst_bytes;      \
 950       }                                                         \
 951   } while (0)
 952
 953
 954
 955 static void
 956 coding_set_source (coding)
 957      struct coding_system *coding;
 958 {
 959   if (BUFFERP (coding->src_object))
 960     {
 961       struct buffer *buf = XBUFFER (coding->src_object);
 962
 963       if (coding->src_pos < 0)
 964         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 965       else
 966         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 967     }
 968   else if (STRINGP (coding->src_object))
 969     {
 970       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 971     }
 972   else
 973     /* Otherwise, the source is C string and is never relocated
 974        automatically.  Thus we don't have to update anything.  */
 975     ;
 976 }
 977
 978 static void
 979 coding_set_destination (coding)
 980      struct coding_system *coding;
 981 {
 982   if (BUFFERP (coding->dst_object))
 983     {
 984       if (coding->src_pos < 0)
 985         {
 986           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 987           coding->dst_bytes = (GAP_END_ADDR
 988                                - (coding->src_bytes - coding->consumed)
 989                                - coding->destination);
 990         }
 991       else
 992         {
 993           /* We are sure that coding->dst_pos_byte is before the gap
 994              of the buffer. */
 995           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 996                                  + coding->dst_pos_byte - 1);
 997           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 998                                - coding->destination);
 999         }
1000     }
1001   else
1002     /* Otherwise, the destination is C string and is never relocated
1003        automatically.  Thus we don't have to update anything.  */
1004     ;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (coding, bytes)
1010      struct coding_system *coding;
1011      EMACS_INT bytes;
1012 {
1013   coding->destination = (unsigned char *) xrealloc (coding->destination,
1014                                                     coding->dst_bytes + bytes);
1015   coding->dst_bytes += bytes;
1016 }
1017
1018 static void
1019 coding_alloc_by_making_gap (coding, bytes)
1020      struct coding_system *coding;
1021      EMACS_INT bytes;
1022 {
1023   if (BUFFERP (coding->dst_object)
1024       && EQ (coding->src_object, coding->dst_object))
1025     {
1026       EMACS_INT add = coding->src_bytes - coding->consumed;
1027
1028       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1029       make_gap (bytes);
1030       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1031     }
1032   else
1033     {
1034       Lisp_Object this_buffer;
1035
1036       this_buffer = Fcurrent_buffer ();
1037       set_buffer_internal (XBUFFER (coding->dst_object));
1038       make_gap (bytes);
1039       set_buffer_internal (XBUFFER (this_buffer));
1040     }
1041 }
1042
1043
1044 static unsigned char *
1045 alloc_destination (coding, nbytes, dst)
1046      struct coding_system *coding;
1047      EMACS_INT nbytes;
1048      unsigned char *dst;
1049 {
1050   EMACS_INT offset = dst - coding->destination;
1051
1052   if (BUFFERP (coding->dst_object))
1053     coding_alloc_by_making_gap (coding, nbytes);
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1057   coding_set_destination (coding);
1058   dst = coding->destination + offset;
1059   return dst;
1060 }
1061
/** Macros for annotations.  */

/* Maximum length of annotation data (sum of annotations for
   composition and charset).
   NOTE(review): the terms look like a 4-element composition header,
   at most MAX_COMPOSITION_COMPONENTS * 2 - 1 component elements, and
   a 4-element charset annotation -- confirm against the code that
   produces annotations.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1067
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the three-element annotation header (-LEN, MASK, NCHARS) at
   BUF, advancing BUF past it, and mark CODING as annotated.  BUF
   must be a modifiable lvalue, and a variable `coding' must be in
   scope.  The expansion deliberately has no trailing semicolon, so
   that an invocation followed by `;' is exactly one statement and
   can be used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1094
/* Store a composition annotation at BUF, advancing BUF past it: the
   annotation header with LENGTH 4 and CODING_ANNOTATE_COMPOSITION_MASK
   for NCHARS characters, followed by METHOD.  BUF and METHOD are
   parenthesized so that arbitrary argument expressions expand
   safely, matching ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1100
1101
/* Store a charset annotation at BUF, advancing BUF past it: the
   annotation header with LENGTH 4 and CODING_ANNOTATE_CHARSET_MASK
   for NCHARS characters, followed by the charset ID.  BUF and ID are
   parenthesized so that arbitrary argument expressions expand
   safely, matching ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1107
1108 \f
1109 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1110
1111
1112
1113 \f
1114 /*** 3. UTF-8 ***/
1115
1116 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1117    Check if a text is encoded in UTF-8.  If it is, return 1, else
1118    return 0.  */
1119
/* Nonzero if the bits of C selected by MASK equal BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* Classification of a single octet C of a UTF-8 sequence.  */

/* C is a complete one-octet sequence (below 0x80).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a trailing octet: 10xxxxxx.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
/* C leads a two-octet sequence: 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
/* C leads a three-octet sequence: 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
/* C leads a four-octet sequence: 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
/* C leads a five-octet sequence: 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
1126
1127 static int
1128 detect_coding_utf_8 (coding, detect_info)
1129      struct coding_system *coding;
1130      struct coding_detection_info *detect_info;
1131 {
1132   const unsigned char *src = coding->source, *src_base;
1133   const unsigned char *src_end = coding->source + coding->src_bytes;
1134   int multibytep = coding->src_multibyte;
1135   int consumed_chars = 0;
1136   int found = 0;
1137
1138   detect_info->checked |= CATEGORY_MASK_UTF_8;
1139   /* A coding system of this category is always ASCII compatible.  */
1140   src += coding->head_ascii;
1141
1142   while (1)
1143     {
1144       int c, c1, c2, c3, c4;
1145
1146       src_base = src;
1147       ONE_MORE_BYTE (c);
1148       if (c < 0 || UTF_8_1_OCTET_P (c))
1149         continue;
1150       ONE_MORE_BYTE (c1);
1151       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1152         break;
1153       if (UTF_8_2_OCTET_LEADING_P (c))
1154         {
1155           found = CATEGORY_MASK_UTF_8;
1156           continue;
1157         }
1158       ONE_MORE_BYTE (c2);
1159       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1160         break;
1161       if (UTF_8_3_OCTET_LEADING_P (c))
1162         {
1163           found = CATEGORY_MASK_UTF_8;
1164           continue;
1165         }
1166       ONE_MORE_BYTE (c3);
1167       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1168         break;
1169       if (UTF_8_4_OCTET_LEADING_P (c))
1170         {
1171           found = CATEGORY_MASK_UTF_8;
1172           continue;
1173         }
1174       ONE_MORE_BYTE (c4);
1175       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1176         break;
1177       if (UTF_8_5_OCTET_LEADING_P (c))
1178         {
1179           found = CATEGORY_MASK_UTF_8;
1180           continue;
1181         }
1182       break;
1183     }
1184   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1185   return 0;
1186
1187  no_more_source:
1188   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1189     {
1190       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1191       return 0;
1192     }
1193   detect_info->found |= found;
1194   return 1;
1195 }
1196
1197
1198 static void
1199 decode_coding_utf_8 (coding)
1200      struct coding_system *coding;
1201 {
1202   const unsigned char *src = coding->source + coding->consumed;
1203   const unsigned char *src_end = coding->source + coding->src_bytes;
1204   const unsigned char *src_base;
1205   int *charbuf = coding->charbuf + coding->charbuf_used;
1206   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1207   int consumed_chars = 0, consumed_chars_base;
1208   int multibytep = coding->src_multibyte;
1209   Lisp_Object attr, charset_list;
1210
1211   CODING_GET_INFO (coding, attr, charset_list);
1212
1213   while (1)
1214     {
1215       int c, c1, c2, c3, c4, c5;
1216
1217       src_base = src;
1218       consumed_chars_base = consumed_chars;
1219
1220       if (charbuf >= charbuf_end)
1221         break;
1222
1223       ONE_MORE_BYTE (c1);
1224       if (c1 < 0)
1225         {
1226           c = - c1;
1227         }
1228       else if (UTF_8_1_OCTET_P(c1))
1229         {
1230           c = c1;
1231         }
1232       else
1233         {
1234           ONE_MORE_BYTE (c2);
1235           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1236             goto invalid_code;
1237           if (UTF_8_2_OCTET_LEADING_P (c1))
1238             {
1239               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1240               /* Reject overlong sequences here and below.  Encoders
1241                  producing them are incorrect, they can be misleading,
1242                  and they mess up read/write invariance.  */
1243               if (c < 128)
1244                 goto invalid_code;
1245             }
1246           else
1247             {
1248               ONE_MORE_BYTE (c3);
1249               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1250                 goto invalid_code;
1251               if (UTF_8_3_OCTET_LEADING_P (c1))
1252                 {
1253                   c = (((c1 & 0xF) << 12)
1254                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1255                   if (c < 0x800
1256                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1257                     goto invalid_code;
1258                 }
1259               else
1260                 {
1261                   ONE_MORE_BYTE (c4);
1262                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1263                     goto invalid_code;
1264                   if (UTF_8_4_OCTET_LEADING_P (c1))
1265                     {
1266                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1267                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1268                     if (c < 0x10000)
1269                       goto invalid_code;
1270                     }
1271                   else
1272                     {
1273                       ONE_MORE_BYTE (c5);
1274                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1275                         goto invalid_code;
1276                       if (UTF_8_5_OCTET_LEADING_P (c1))
1277                         {
1278                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1279                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1280                                | (c5 & 0x3F));
1281                           if ((c > MAX_CHAR) || (c < 0x200000))
1282                             goto invalid_code;
1283                         }
1284                       else
1285                         goto invalid_code;
1286                     }
1287                 }
1288             }
1289         }
1290
1291       *charbuf++ = c;
1292       continue;
1293
1294     invalid_code:
1295       src = src_base;
1296       consumed_chars = consumed_chars_base;
1297       ONE_MORE_BYTE (c);
1298       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1299       coding->errors++;
1300     }
1301
1302  no_more_source:
1303   coding->consumed_char += consumed_chars_base;
1304   coding->consumed = src_base - coding->source;
1305   coding->charbuf_used = charbuf - coding->charbuf;
1306 }
1307
1308
1309 static int
1310 encode_coding_utf_8 (coding)
1311      struct coding_system *coding;
1312 {
1313   int multibytep = coding->dst_multibyte;
1314   int *charbuf = coding->charbuf;
1315   int *charbuf_end = charbuf + coding->charbuf_used;
1316   unsigned char *dst = coding->destination + coding->produced;
1317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1318   int produced_chars = 0;
1319   int c;
1320
1321   if (multibytep)
1322     {
1323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1324
1325       while (charbuf < charbuf_end)
1326         {
1327           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1328
1329           ASSURE_DESTINATION (safe_room);
1330           c = *charbuf++;
1331           if (CHAR_BYTE8_P (c))
1332             {
1333               c = CHAR_TO_BYTE8 (c);
1334               EMIT_ONE_BYTE (c);
1335             }
1336           else
1337             {
1338               CHAR_STRING_ADVANCE (c, pend);
1339               for (p = str; p < pend; p++)
1340                 EMIT_ONE_BYTE (*p);
1341             }
1342         }
1343     }
1344   else
1345     {
1346       int safe_room = MAX_MULTIBYTE_LENGTH;
1347
1348       while (charbuf < charbuf_end)
1349         {
1350           ASSURE_DESTINATION (safe_room);
1351           c = *charbuf++;
1352           if (CHAR_BYTE8_P (c))
1353             *dst++ = CHAR_TO_BYTE8 (c);
1354           else
1355             dst += CHAR_STRING (c, dst);
1356           produced_chars++;
1357         }
1358     }
1359   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1360   coding->produced_char += produced_chars;
1361   coding->produced = dst - coding->destination;
1362   return 0;
1363 }
1364
1365
1366 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1367    Check if a text is encoded in one of UTF-16 based coding systems.
1368    If it is, return 1, else return 0.  */
1369
/* The top six bits of the 16-bit code unit VAL; they tell the two
   kinds of surrogates apart.  */
#define UTF_16_SURROGATE_BITS(val) ((val) & 0xFC00)

/* VAL is a high (leading) surrogate, 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (UTF_16_SURROGATE_BITS (val) == 0xD800)

/* VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (UTF_16_SURROGATE_BITS (val) == 0xDC00)

/* VAL cannot start a character: it is 0xFFFE, 0xFFFF, or a low
   surrogate.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1380
1381
1382 static int
1383 detect_coding_utf_16 (coding, detect_info)
1384      struct coding_system *coding;
1385      struct coding_detection_info *detect_info;
1386 {
1387   const unsigned char *src = coding->source, *src_base = src;
1388   const unsigned char *src_end = coding->source + coding->src_bytes;
1389   int multibytep = coding->src_multibyte;
1390   int consumed_chars = 0;
1391   int c1, c2;
1392
1393   detect_info->checked |= CATEGORY_MASK_UTF_16;
1394   if (coding->mode & CODING_MODE_LAST_BLOCK
1395       && (coding->src_chars & 1))
1396     {
1397       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1398       return 0;
1399     }
1400
1401   ONE_MORE_BYTE (c1);
1402   ONE_MORE_BYTE (c2);
1403   if ((c1 == 0xFF) && (c2 == 0xFE))
1404     {
1405       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1406                              | CATEGORY_MASK_UTF_16_AUTO);
1407       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1408                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1409                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1410     }
1411   else if ((c1 == 0xFE) && (c2 == 0xFF))
1412     {
1413       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1414                              | CATEGORY_MASK_UTF_16_AUTO);
1415       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1416                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1417                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1418     }
1419   else if (c1 >= 0 && c2 >= 0)
1420     {
1421       detect_info->rejected
1422         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1423     }
1424  no_more_source:
1425   return 1;
1426 }
1427
1428 static void
1429 decode_coding_utf_16 (coding)
1430      struct coding_system *coding;
1431 {
1432   const unsigned char *src = coding->source + coding->consumed;
1433   const unsigned char *src_end = coding->source + coding->src_bytes;
1434   const unsigned char *src_base;
1435   int *charbuf = coding->charbuf + coding->charbuf_used;
1436   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1437   int consumed_chars = 0, consumed_chars_base;
1438   int multibytep = coding->src_multibyte;
1439   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1440   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1441   int surrogate = CODING_UTF_16_SURROGATE (coding);
1442   Lisp_Object attr, charset_list;
1443
1444   CODING_GET_INFO (coding, attr, charset_list);
1445
1446   if (bom == utf_16_with_bom)
1447     {
1448       int c, c1, c2;
1449
1450       src_base = src;
1451       ONE_MORE_BYTE (c1);
1452       ONE_MORE_BYTE (c2);
1453       c = (c1 << 8) | c2;
1454
1455       if (endian == utf_16_big_endian
1456           ? c != 0xFEFF : c != 0xFFFE)
1457         {
1458           /* The first two bytes are not BOM.  Treat them as bytes
1459              for a normal character.  */
1460           src = src_base;
1461           coding->errors++;
1462         }
1463       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1464     }
1465   else if (bom == utf_16_detect_bom)
1466     {
1467       /* We have already tried to detect BOM and failed in
1468          detect_coding.  */
1469       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1470     }
1471
1472   while (1)
1473     {
1474       int c, c1, c2;
1475
1476       src_base = src;
1477       consumed_chars_base = consumed_chars;
1478
1479       if (charbuf + 2 >= charbuf_end)
1480         break;
1481
1482       ONE_MORE_BYTE (c1);
1483       if (c1 < 0)
1484         {
1485           *charbuf++ = -c1;
1486           continue;
1487         }
1488       ONE_MORE_BYTE (c2);
1489       if (c2 < 0)
1490         {
1491           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1492           *charbuf++ = -c2;
1493           continue;
1494         }
1495       c = (endian == utf_16_big_endian
1496            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1497       if (surrogate)
1498         {
1499           if (! UTF_16_LOW_SURROGATE_P (c))
1500             {
1501               if (endian == utf_16_big_endian)
1502                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1503               else
1504                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1505               *charbuf++ = c1;
1506               *charbuf++ = c2;
1507               coding->errors++;
1508               if (UTF_16_HIGH_SURROGATE_P (c))
1509                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1510               else
1511                 *charbuf++ = c;
1512             }
1513           else
1514             {
1515               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1516               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1517               *charbuf++ = 0x10000 + c;
1518             }
1519         }
1520       else
1521         {
1522           if (UTF_16_HIGH_SURROGATE_P (c))
1523             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1524           else
1525             *charbuf++ = c;
1526         }
1527     }
1528
1529  no_more_source:
1530   coding->consumed_char += consumed_chars_base;
1531   coding->consumed = src_base - coding->source;
1532   coding->charbuf_used = charbuf - coding->charbuf;
1533 }
1534
1535 static int
1536 encode_coding_utf_16 (coding)
1537      struct coding_system *coding;
1538 {
1539   int multibytep = coding->dst_multibyte;
1540   int *charbuf = coding->charbuf;
1541   int *charbuf_end = charbuf + coding->charbuf_used;
1542   unsigned char *dst = coding->destination + coding->produced;
1543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1544   int safe_room = 8;
1545   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1546   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1547   int produced_chars = 0;
1548   Lisp_Object attrs, charset_list;
1549   int c;
1550
1551   CODING_GET_INFO (coding, attrs, charset_list);
1552
1553   if (bom != utf_16_without_bom)
1554     {
1555       ASSURE_DESTINATION (safe_room);
1556       if (big_endian)
1557         EMIT_TWO_BYTES (0xFE, 0xFF);
1558       else
1559         EMIT_TWO_BYTES (0xFF, 0xFE);
1560       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1561     }
1562
1563   while (charbuf < charbuf_end)
1564     {
1565       ASSURE_DESTINATION (safe_room);
1566       c = *charbuf++;
1567       if (c >= MAX_UNICODE_CHAR)
1568         c = coding->default_char;
1569
1570       if (c < 0x10000)
1571         {
1572           if (big_endian)
1573             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1574           else
1575             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1576         }
1577       else
1578         {
1579           int c1, c2;
1580
1581           c -= 0x10000;
1582           c1 = (c >> 10) + 0xD800;
1583           c2 = (c & 0x3FF) + 0xDC00;
1584           if (big_endian)
1585             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1586           else
1587             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1588         }
1589     }
1590   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1591   coding->produced = dst - coding->destination;
1592   coding->produced_char += produced_chars;
1593   return 0;
1594 }
1595
1596 \f
1597 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1598
1599 /* Emacs' internal format for representation of multiple character
1600    sets is a kind of multi-byte encoding, i.e. characters are
1601    represented by variable-length sequences of one-byte codes.
1602
1603    ASCII characters and control characters (e.g. `tab', `newline') are
1604    represented by one-byte sequences which are their ASCII codes, in
1605    the range 0x00 through 0x7F.
1606
1607    8-bit characters of the range 0x80..0x9F are represented by
1608    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1609    code + 0x20).
1610
1611    8-bit characters of the range 0xA0..0xFF are represented by
1612    one-byte sequences which are their 8-bit code.
1613
1614    The other characters are represented by a sequence of `base
1615    leading-code', optional `extended leading-code', and one or two
1616    `position-code's.  The length of the sequence is determined by the
1617    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1618    whereas extended leading-code and position-code take the range 0xA0
1619    through 0xFF.  See `charset.h' for more details about leading-code
1620    and position-code.
1621
1622    --- CODE RANGE of Emacs' internal format ---
1623    character set        range
1624    -------------        -----
1625    ascii                0x00..0x7F
1626    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1627    eight-bit-graphic    0xA0..0xFF
1628    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1629    ---------------------------------------------
1630
1631    As this is the internal character representation, the format is
1632    usually not used externally (i.e. in a file or in a data sent to a
1633    process).  But, it is possible to have a text externally in this
1634    format (i.e. by encoding by the coding system `emacs-mule').
1635
1636    In that case, a sequence of one-byte codes has a slightly different
1637    form.
1638
1639    At first, all characters in eight-bit-control are represented by
1640    one-byte sequences which are their 8-bit code.
1641
1642    Next, character composition data are represented by the byte
1643    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1644    where,
1645         METHOD is 0xF2 plus one of the composition methods (enum
1646         composition_method),
1647
1648         BYTES is 0xA0 plus a byte length of this composition data,
1649
1650         CHARS is 0xA0 plus the number of characters composed by this
1651         data,
1652
1653         COMPONENTs are characters in multibyte form or composition
1654         rules encoded as two bytes of ASCII codes.
1655
1656    In addition, for backward compatibility, the following formats are
1657    also recognized as composition data on decoding.
1658
1659    0x80 MSEQ ...
1660    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1661
1662    Here,
1663         MSEQ is a multibyte form, but in this special format:
1664           ASCII: 0xA0 ASCII_CODE+0x80,
1665           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1666         RULE is a one byte code of the range 0xA0..0xF0 that
1667         represents a composition rule.
1668   */
1669
     /* Table indexed by the first byte of an emacs-mule byte sequence.
        emacs_mule_char and detect_coding_emacs_mule treat each entry as
        the total number of bytes (1 through 4) in the sequence that the
        byte starts.  The entries are assigned outside this part of the
        file.  */
1670 char emacs_mule_bytes[256];
1671
     /* Decode the one character of emacs-mule form that starts at SRC,
        reading no further than the end of CODING's source
        (coding->source + coding->src_bytes).

        On success, return the character, store the number of bytes read
        in *NBYTES and the value of the local counter consumed_chars in
        *NCHARS, and, if ID is not NULL, store the id of the character's
        charset in *ID.

        Return -1 if the bytes do not form a valid sequence, and -2 if
        control reaches the label no_more_source.  *NBYTES, *NCHARS and
        *ID are not written in either case.

        Bytes are fetched with ONE_MORE_BYTE, which is defined elsewhere;
        presumably it uses the locals src, src_end, multibytep and
        consumed_chars and jumps to no_more_source when the source is
        exhausted -- confirm against its definition.  */
1672 int
1673 emacs_mule_char (coding, src, nbytes, nchars, id)
1674      struct coding_system *coding;
1675      const unsigned char *src;
1676      int *nbytes, *nchars, *id;
1677 {
1678   const unsigned char *src_end = coding->source + coding->src_bytes;
1679   const unsigned char *src_base = src;
1680   int multibytep = coding->src_multibyte;
1681   struct charset *charset;
1682   unsigned code;
1683   int c;
1684   int consumed_chars = 0;
1685
1686   ONE_MORE_BYTE (c);
1687   if (c < 0)
1688     {
       /* A negative value from ONE_MORE_BYTE is returned negated, with
          the charset taken from slot 0 of emacs_mule_charset.  */
1689       c = -c;
1690       charset = emacs_mule_charset[0];
1691     }
1692   else
1693     {
       /* The table entry for the first byte selects the layout.  Every
          position code must be 0xA0 or above; only its low 7 bits are
          used for CODE.  */
1694       switch (emacs_mule_bytes[c])
1695         {
1696         case 2:
           /* Leading code, then one position code.  */
1697           if (! (charset = emacs_mule_charset[c]))
1698             goto invalid_code;
1699           ONE_MORE_BYTE (c);
1700           if (c < 0xA0)
1701             goto invalid_code;
1702           code = c & 0x7F;
1703           break;
1704
1705         case 3:
1706           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1707               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1708             {
               /* Private leading code, then a byte that selects the
                  charset, then one position code.  */
1709               ONE_MORE_BYTE (c);
1710               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1711                 goto invalid_code;
1712               ONE_MORE_BYTE (c);
1713               if (c < 0xA0)
1714                 goto invalid_code;
1715               code = c & 0x7F;
1716             }
1717           else
1718             {
               /* Leading code, then two position codes.  */
1719               if (! (charset = emacs_mule_charset[c]))
1720                 goto invalid_code;
1721               ONE_MORE_BYTE (c);
1722               if (c < 0xA0)
1723                 goto invalid_code;
1724               code = (c & 0x7F) << 8;
1725               ONE_MORE_BYTE (c);
1726               if (c < 0xA0)
1727                 goto invalid_code;
1728               code |= c & 0x7F;
1729             }
1730           break;
1731
1732         case 4:
           /* First byte, then a byte that selects the charset, then two
              position codes.  */
1733           ONE_MORE_BYTE (c);
1734           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1735             goto invalid_code;
1736           ONE_MORE_BYTE (c);
1737           if (c < 0xA0)
1738             goto invalid_code;
1739           code = (c & 0x7F) << 8;
1740           ONE_MORE_BYTE (c);
1741           if (c < 0xA0)
1742             goto invalid_code;
1743           code |= c & 0x7F;
1744           break;
1745
1746         case 1:
           /* A single byte: ASCII, or a code of charset_eight_bit.  */
1747           code = c;
1748           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1749                                      ? charset_ascii : charset_eight_bit);
1750           break;
1751
1752         default:
           /* Any other table value is treated as a fatal error.  */
1753           abort ();
1754         }
1755       c = DECODE_CHAR (charset, code);
1756       if (c < 0)
1757         goto invalid_code;
1758     }
1759   *nbytes = src - src_base;
1760   *nchars = consumed_chars;
1761   if (id)
1762     *id = charset->id;
1763   return c;
1764
1765  no_more_source:
1766   return -2;
1767
1768  invalid_code:
1769   return -1;
1770 }
1771
1772
1773 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1774    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1775    else return 0.

        Scanning starts after the first coding->head_ascii bytes of the
        source.  CATEGORY_MASK_EMACS_MULE is always added to
        DETECT_INFO->checked.  When 0 is returned it is also added to
        DETECT_INFO->rejected.  When 1 is returned it is added to
        DETECT_INFO->found only if at least one sequence starting with a
        code of 0x80 or above was accepted.

        The text is rejected when it contains ESC, SI or SO, when a
        composition (introduced by 0x80) is followed by 4 or fewer bytes
        of 0xA0 or above, when a multi-byte sequence is cut short by a
        byte below 0xA0, or when the last block ends in the middle of a
        sequence.  */
1776
1777 static int
1778 detect_coding_emacs_mule (coding, detect_info)
1779      struct coding_system *coding;
1780      struct coding_detection_info *detect_info;
1781 {
1782   const unsigned char *src = coding->source, *src_base;
1783   const unsigned char *src_end = coding->source + coding->src_bytes;
1784   int multibytep = coding->src_multibyte;
1785   int consumed_chars = 0;
1786   int c;
1787   int found = 0;
1788
1789   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1790   /* A coding system of this category is always ASCII compatible.  */
1791   src += coding->head_ascii;
1792
1793   while (1)
1794     {
1795       src_base = src;
1796       ONE_MORE_BYTE (c);
1797       if (c < 0)
1798         continue;
1799       if (c == 0x80)
1800         {
1801           /* Perhaps the start of composite character.  We simply skip
1802              it because analyzing it is too heavy for detecting.  But,
1803              at least, we check that the composite character
1804              consists of more than 4 bytes.  */
               /* This SRC_BASE shadows the outer one on purpose, so that
                  the outer one keeps pointing at the 0x80 byte.  */
1805           const unsigned char *src_base;
1806
1807         repeat:
1808           src_base = src;
1809           do
1810             {
1811               ONE_MORE_BYTE (c);
1812             }
1813           while (c >= 0xA0);
1814
1815           if (src - src_base <= 4)
1816             break;
1817           found = CATEGORY_MASK_EMACS_MULE;
1818           if (c == 0x80)
1819             goto repeat;
1820         }
1821
1822       if (c < 0x80)
1823         {
1824           if (c < 0x20
1825               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1826             break;
1827         }
1828       else
1829         {
               /* C is the leading byte just read.  Look the length up by
                  C rather than by *SRC_BASE: after a composition was
                  skipped above, SRC_BASE still points at the 0x80 that
                  introduced it, not at this leading byte.  */
1830           int more_bytes = emacs_mule_bytes[c] - 1;
1831
1832           while (more_bytes > 0)
1833             {
1834               ONE_MORE_BYTE (c);
1835               if (c < 0xA0)
1836                 {
1837                   src--;        /* Unread the last byte.  */
1838                   break;
1839                 }
1840               more_bytes--;
1841             }
1842           if (more_bytes != 0)
1843             break;
1844           found = CATEGORY_MASK_EMACS_MULE;
1845         }
1846     }
1847   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1848   return 0;
1849
1850  no_more_source:
       /* The source ended.  If that happened in the middle of a sequence
          and no more blocks will follow, the text is not emacs-mule.  */
1851   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1852     {
1853       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1854       return 0;
1855     }
1856   detect_info->found |= found;
1857   return 1;
1858 }
1859
1860
1861 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1862
1863 /* Decode one character that is a component of a composition sequence
1864    at SRC by calling emacs_mule_char, store it at *BUF, and advance
1865    BUF, SRC and consumed_chars past it.  If SRC has reached SRC_END, or
1866    emacs_mule_char returns -2, execute `break', which leaves the
1867    innermost loop or switch enclosing the macro call.  For any other
1868    negative result, jump to the label invalid_code.

        The body is wrapped in `if (1) ... else' instead of
        `do ... while (0)' so that the `break' applies to the caller's
        loop and the call can still be followed by a semicolon.  The
        macro uses the caller's variables coding, src, src_end and
        consumed_chars.  */
1869
1870 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1871   if (1)                                                        \
1872     {                                                           \
1873       int c;                                                    \
1874       int nbytes, nchars;                                       \
1875                                                                 \
1876       if (src == src_end)                                       \
1877         break;                                                  \
1878       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1879       if (c < 0)                                                \
1880         {                                                       \
1881           if (c == -2)                                          \
1882             break;                                              \
1883           goto invalid_code;                                    \
1884         }                                                       \
1885       *buf++ = c;                                               \
1886       src += nbytes;                                            \
1887       consumed_chars += nchars;                                 \
1888     }                                                           \
1889   else
1890
1891
1892 /* Decode a composition rule represented as a component of a composition
1893    sequence of Emacs 20 style at SRC.  The rule is one byte in the range
1894    0xA0..0xF0, i.e. 0xA0 plus GREF * 9 + NREF.  Store the decoded rule
1895    in *BUF and increment BUF.  If no byte is left at SRC, or the byte is
        outside that range, jump to the label invalid_code.  The macro uses
        the caller's variables src and src_end.  */
1896
1897 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1898   do {                                                  \
1899     int c, gref, nref;                                  \
1900                                                         \
1901     if (src >= src_end)                                 \
1902       goto invalid_code;                                \
1903     ONE_MORE_BYTE_NO_CHECK (c);                         \
1904     c -= 0xA0;                                          \
1905     if (c < 0 || c >= 81)                               \
1906       goto invalid_code;                                \
1907                                                         \
1908     gref = c / 9, nref = c % 9;                         \
1909     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1910   } while (0)
1911
1912
1913 /* Decode a composition rule represented as a component of a composition
1914    sequence of Emacs 21 style at SRC.  The rule is two bytes, GREF + 0x20
1915    and NREF + 0x20, each of GREF and NREF being in the range 0..80.
1916    Store the decoded rule in *BUF and increment BUF.  If fewer than two
        bytes are left at SRC, or either value is out of range, jump to the
        label invalid_code.  The macro uses the caller's variables src and
        src_end.  */
1917
1918 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1919   do {                                                  \
1920     int gref, nref;                                     \
1921                                                         \
1922     if (src + 1>= src_end)                              \
1923       goto invalid_code;                                \
1924     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1925     gref -= 0x20;                                       \
1926     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1927     nref -= 0x20;                                       \
1928     if (gref < 0 || gref >= 81                          \
1929         || nref < 0 || nref >= 81)                      \
1930       goto invalid_code;                                \
1931     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1932   } while (0)
1933
1934
     /* Decode the header of an Emacs 21 style composition, and, unless the
        method is COMPOSITION_RELATIVE, its components as well.

        On entry C holds the METHOD byte, which the caller has already
        read; the method is C - 0xF2.  The BYTES and CHARS bytes are read
        next with ONE_MORE_BYTE; a negative value for either, or a BYTES
        value below 0xA0 + 3, jumps to invalid_code.  A composition
        annotation is then added at CHARBUF with ADD_COMPOSITION_DATA.

        For the other methods, components are decoded into CHARBUF until
        consumed_chars reaches consumed_chars_base + BYTES.  Components at
        odd positions are decoded as rules, except that for
        COMPOSITION_WITH_ALTCHARS every component is decoded as a
        character.  If the loop is left early (by the `break' inside
        DECODE_EMACS_MULE_COMPOSITION_CHAR), jump to invalid_code.
        Finally the number of components is subtracted from the first
        element written by ADD_COMPOSITION_DATA.

        The macro uses the caller's variables charbuf, consumed_chars,
        consumed_chars_base, src, src_end and coding.

        NOTE(review): the limit adds a byte length to a count kept in
        consumed_chars; presumably the two agree for the sources this is
        used on -- confirm against ONE_MORE_BYTE.  */
1935 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1936   do {                                                                  \
1937     /* Emacs 21 style format.  The first three bytes at SRC are         \
1938        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1939        the byte length of this composition information, CHARS is the    \
1940        number of characters composed by this composition.  */           \
1941     enum composition_method method = c - 0xF2;                          \
1942     int *charbuf_base = charbuf;                                        \
1943     int consumed_chars_limit;                                           \
1944     int nbytes, nchars;                                                 \
1945                                                                         \
1946     ONE_MORE_BYTE (c);                                                  \
1947     if (c < 0)                                                          \
1948       goto invalid_code;                                                \
1949     nbytes = c - 0xA0;                                                  \
1950     if (nbytes < 3)                                                     \
1951       goto invalid_code;                                                \
1952     ONE_MORE_BYTE (c);                                                  \
1953     if (c < 0)                                                          \
1954       goto invalid_code;                                                \
1955     nchars = c - 0xA0;                                                  \
1956     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1957     consumed_chars_limit = consumed_chars_base + nbytes;                \
1958     if (method != COMPOSITION_RELATIVE)                                 \
1959       {                                                                 \
1960         int i = 0;                                                      \
1961         while (consumed_chars < consumed_chars_limit)                   \
1962           {                                                             \
1963             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1964               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1965             else                                                        \
1966               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1967             i++;                                                        \
1968           }                                                             \
1969         if (consumed_chars < consumed_chars_limit)                      \
1970           goto invalid_code;                                            \
1971         charbuf_base[0] -= i;                                           \
1972       }                                                                 \
1973   } while (0)
1974
1975
     /* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).

        SRC is moved back to SRC_BASE and the leading 0x80 is read again,
        so that decoding restarts at the first component.  Up to
        MAX_COMPOSITION_COMPONENTS characters are decoded into the local
        array components; the loop stops early when
        DECODE_EMACS_MULE_COMPOSITION_CHAR executes its `break'.  Fewer
        than two characters jumps to invalid_code.  Otherwise a
        composition annotation for the decoded characters is added at
        CHARBUF, followed by the characters themselves.

        C is used only as scratch for the byte that is skipped.  The macro
        uses the caller's variables src, src_base, src_end, charbuf,
        consumed_chars and coding.  */
1976 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1977   do {                                                          \
1978     /* Emacs 20 style format for relative composition.  */      \
1979     /* Store multibyte form of characters to be composed.  */   \
1980     enum composition_method method = COMPOSITION_RELATIVE;      \
1981     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1982     int *buf = components;                                      \
1983     int i, j;                                                   \
1984                                                                 \
1985     src = src_base;                                             \
1986     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1987     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1988       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1989     if (i < 2)                                                  \
1990       goto invalid_code;                                        \
1991     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
1992     for (j = 0; j < i; j++)                                     \
1993       *charbuf++ = components[j];                               \
1994   } while (0)
1995
1996
     /* Decode an Emacs 20 style rule-base composition
        (0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ).  The caller has already
        read the 0x80 and 0xFF bytes.

        One character is decoded first, then rule/character pairs, all
        into the local array components.  The loop runs at most
        MAX_COMPOSITION_COMPONENTS - 1 times, so that the first character
        plus the pairs never exceed the size of components.  If no pair
        was started, or the collected sequence has an even number of
        entries (i.e. it ends in a rule), jump to invalid_code.

        If CHARBUF does not have room for the result, jump to
        no_more_source so that the composition is decoded again later.
        Otherwise a composition annotation is added at CHARBUF (as the
        sibling composition macros do), followed by entries copied from
        components.

        C is not used.  The macro uses the caller's variables src,
        src_end, charbuf, charbuf_end, consumed_chars and coding.

        NOTE(review): the first DECODE_EMACS_MULE_COMPOSITION_CHAR is not
        inside a loop of this macro, so its `break' leaves the whole
        macro body.  Nothing ends the pair loop at the natural end of a
        shorter sequence; a following byte that is not a valid rule jumps
        to invalid_code.  The two copy loops copy I entries although up
        to 2 * I + 1 were collected.  Confirm the intended layout against
        ADD_COMPOSITION_DATA before relying on this macro.  */
1997 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1998   do {                                                          \
1999     /* Emacs 20 style format for rule-base composition.  */     \
2000     /* Store multibyte form of characters to be composed.  */   \
2001     enum composition_method method = COMPOSITION_WITH_RULE;     \
2002     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2003     int *buf = components;                                      \
2004     int i, j;                                                   \
2005                                                                 \
2006     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2007     for (i = 0; i < MAX_COMPOSITION_COMPONENTS - 1; i++)        \
2008       {                                                         \
2009         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2010         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2011       }                                                         \
2012     if (i < 1 || (buf - components) % 2 == 0)                   \
2013       goto invalid_code;                                        \
2014     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2015       goto no_more_source;                                      \
2016     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2017     for (j = 0; j < i; j++)                                     \
2018       *charbuf++ = components[j];                               \
2019     for (j = 0; j < i; j += 2)                                  \
2020       *charbuf++ = components[j];                               \
2021   } while (0)
2022
2023
     /* Decode the emacs-mule text in CODING's source, starting at offset
        coding->consumed, into coding->charbuf, starting at index
        coding->charbuf_used.

        Decoding stops when the source is exhausted (control reaches
        no_more_source), when emacs_mule_char reports an incomplete
        sequence, or when fewer than MAX_ANNOTATION_LENGTH elements are
        left in the character buffer.  On return, coding->consumed_char,
        coding->consumed and coding->charbuf_used describe the text
        decoded up to the start of the last incomplete sequence.

        Whenever the charset of the decoded characters changes, a charset
        annotation covering the preceding run is added, unless that run
        was ASCII.  A byte that starts an invalid sequence is stored as
        itself (ASCII) or as BYTE8_TO_CHAR of it, and coding->errors is
        incremented.  */
2024 static void
2025 decode_coding_emacs_mule (coding)
2026      struct coding_system *coding;
2027 {
2028   const unsigned char *src = coding->source + coding->consumed;
2029   const unsigned char *src_end = coding->source + coding->src_bytes;
2030   const unsigned char *src_base;
2031   int *charbuf = coding->charbuf + coding->charbuf_used;
2032   int *charbuf_end
2033     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2034   int consumed_chars = 0, consumed_chars_base;
2035   int multibytep = coding->src_multibyte;
2036   Lisp_Object attrs, charset_list;
2037   int char_offset = coding->produced_char;
2038   int last_offset = char_offset;
2039   int last_id = charset_ascii;
2040
2041   CODING_GET_INFO (coding, attrs, charset_list);
2042
2043   while (1)
2044     {
2045       int c;
2046
       /* Remember where this sequence starts, so that an incomplete or
          invalid sequence can be rewound.  */
2047       src_base = src;
2048       consumed_chars_base = consumed_chars;
2049
2050       if (charbuf >= charbuf_end)
2051         break;
2052
2053       ONE_MORE_BYTE (c);
2054       if (c < 0)
2055         {
2056           *charbuf++ = -c;
2057           char_offset++;
2058         }
2059       else if (c < 0x80)
2060         {
2061           *charbuf++ = c;
2062           char_offset++;
2063         }
2064       else if (c == 0x80)
2065         {
           /* Composition data; the next byte selects the format.  */
2066           ONE_MORE_BYTE (c);
2067           if (c < 0)
2068             goto invalid_code;
2069           if (c - 0xF2 >= COMPOSITION_RELATIVE
2070               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2071             DECODE_EMACS_MULE_21_COMPOSITION (c);
2072           else if (c < 0xC0)
2073             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2074           else if (c == 0xFF)
2075             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2076           else
2077             goto invalid_code;
2078         }
2079       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2080         {
2081           int nbytes, nchars;
2082           int id;
2083
           /* A multi-byte sequence: rewind and let emacs_mule_char
              decode it from its leading code.  */
2084           src = src_base;
2085           consumed_chars = consumed_chars_base;
2086           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2087           if (c < 0)
2088             {
2089               if (c == -2)
2090                 break;
2091               goto invalid_code;
2092             }
2093           if (last_id != id)
2094             {
2095               if (last_id != charset_ascii)
2096                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2097               last_id = id;
2098               last_offset = char_offset;
2099             }
2100           *charbuf++ = c;
2101           src += nbytes;
2102           consumed_chars += nchars;
2103           char_offset++;
2104         }
           else
             {
               /* A one-byte 8-bit code (0xA0 or above, or a byte in
                  0x81..0x9F whose table length is 1).  Store it as an
                  8-bit character instead of consuming it without
                  producing anything.  */
               *charbuf++ = BYTE8_TO_CHAR (c);
               char_offset++;
             }
2105       continue;
2106
2107     invalid_code:
2108       src = src_base;
2109       consumed_chars = consumed_chars_base;
2110       ONE_MORE_BYTE (c);
2111       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2112       char_offset++;
2113       coding->errors++;
2114     }
2115
2116  no_more_source:
2117   if (last_id != charset_ascii)
2118     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2119   coding->consumed_char += consumed_chars_base;
2120   coding->consumed = src_base - coding->source;
2121   coding->charbuf_used = charbuf - coding->charbuf;
2122 }
2123
2124
     /* Store in CODES[0] and CODES[1] the leading code(s) for the
        emacs-mule charset id ID.  For ID below 0xA0, CODES[0] is ID
        itself and CODES[1] is 0, meaning there is no second code.
        Otherwise CODES[0] is 0x9A, 0x9B, 0x9C or 0x9D for ID below 0xE0,
        below 0xF0, below 0xF5, or anything else respectively, and
        CODES[1] is ID.

        ID is evaluated more than once.  The expansion has no trailing
        semicolon, so a call followed by `;' is exactly one statement and
        can be used as the body of an if/else.  */
2125 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2126   do {                                          \
2127     if ((id) < 0xA0)                            \
2128       (codes)[0] = (id), (codes)[1] = 0;        \
2129     else if ((id) < 0xE0)                       \
2130       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2131     else if ((id) < 0xF0)                       \
2132       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2133     else if ((id) < 0xF5)                       \
2134       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2135     else                                        \
2136       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2137   } while (0)
2138
2139
     /* Encode the characters in coding->charbuf (coding->charbuf_used
        elements) in emacs-mule form into CODING's destination, starting
        at offset coding->produced.

        If the charset list of CODING is not Vemacs_mule_charset_list, it
        is replaced by that list in CODING's attributes.

        A negative element of charbuf starts an annotation.  A charset
        annotation selects a preferred charset for the following
        characters, provided that charset is in the charset list; a
        composition annotation is skipped.  Any other annotation type
        aborts.

        ASCII characters and 8-bit characters are written as one byte.
        Every other character is written as the leading code(s) of its
        charset followed by one or two bytes of its code with the high bit
        set.  A character that belongs to no charset in the list is
        replaced by coding->default_char.

        On completion the result is recorded as CODING_RESULT_SUCCESS,
        coding->produced_char and coding->produced are updated, and 0 is
        returned.  ASSURE_DESTINATION and the EMIT_ macros are defined
        elsewhere; presumably they use the locals dst, dst_end,
        multibytep and produced_chars -- confirm against their
        definitions.  */
2140 static int
2141 encode_coding_emacs_mule (coding)
2142      struct coding_system *coding;
2143 {
2144   int multibytep = coding->dst_multibyte;
2145   int *charbuf = coding->charbuf;
2146   int *charbuf_end = charbuf + coding->charbuf_used;
2147   unsigned char *dst = coding->destination + coding->produced;
2148   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2149   int safe_room = 8;
2150   int produced_chars = 0;
2151   Lisp_Object attrs, charset_list;
2152   int c;
2153   int preferred_charset_id = -1;
2154
2155   CODING_GET_INFO (coding, attrs, charset_list);
2156   if (! EQ (charset_list, Vemacs_mule_charset_list))
2157     {
2158       CODING_ATTR_CHARSET_LIST (attrs)
2159         = charset_list = Vemacs_mule_charset_list;
2160     }
2161
2162   while (charbuf < charbuf_end)
2163     {
2164       ASSURE_DESTINATION (safe_room);
2165       c = *charbuf++;
2166
2167       if (c < 0)
2168         {
2169           /* Handle an annotation.  */
2170           switch (*charbuf)
2171             {
2172             case CODING_ANNOTATE_COMPOSITION_MASK:
2173               /* Not yet implemented.  */
2174               break;
2175             case CODING_ANNOTATE_CHARSET_MASK:
2176               preferred_charset_id = charbuf[3];
2177               if (preferred_charset_id >= 0
2178                   && NILP (Fmemq (make_number (preferred_charset_id),
2179                                   charset_list)))
2180                 preferred_charset_id = -1;
2181               break;
2182             default:
2183               abort ();
2184             }
               /* -C is the annotation length; its first element has
                  already been consumed above.  */
2185           charbuf += -c - 1;
2186           continue;
2187         }
2188
2189       if (ASCII_CHAR_P (c))
2190         EMIT_ONE_ASCII_BYTE (c);
2191       else if (CHAR_BYTE8_P (c))
2192         {
2193           c = CHAR_TO_BYTE8 (c);
2194           EMIT_ONE_BYTE (c);
2195         }
2196       else
2197         {
2198           struct charset *charset;
2199           unsigned code;
2200           int dimension;
2201           int emacs_mule_id;
2202           unsigned char leading_codes[2];
2203
               /* Make sure CODE is assigned on every path that reaches
                  the EMIT_ONE_BYTE calls below, including the paths
                  through the preferred charset.  */
2204           if (preferred_charset_id >= 0)
2205             {
2206               charset = CHARSET_FROM_ID (preferred_charset_id);
2207               if (! CHAR_CHARSET_P (c, charset))
2208                 charset = char_charset (c, charset_list, &code);
                   else
                     code = ENCODE_CHAR (charset, c);
2209             }
2210           else
2211             charset = char_charset (c, charset_list, &code);
2212           if (! charset)
2213             {
2214               c = coding->default_char;
2215               if (ASCII_CHAR_P (c))
2216                 {
2217                   EMIT_ONE_ASCII_BYTE (c);
2218                   continue;
2219                 }
2220               charset = char_charset (c, charset_list, &code);
2221             }
2222           dimension = CHARSET_DIMENSION (charset);
2223           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2224           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2225           EMIT_ONE_BYTE (leading_codes[0]);
2226           if (leading_codes[1])
2227             EMIT_ONE_BYTE (leading_codes[1]);
2228           if (dimension == 1)
2229             EMIT_ONE_BYTE (code | 0x80);
2230           else
2231             {
2232               code |= 0x8080;
2233               EMIT_ONE_BYTE (code >> 8);
2234               EMIT_ONE_BYTE (code & 0xFF);
2235             }
2236         }
2237     }
2238   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2239   coding->produced_char += produced_chars;
2240   coding->produced = dst - coding->destination;
2241   return 0;
2242 }
2243
2244 \f
2245 /*** 7. ISO2022 handlers ***/
2246
2247 /* The following note describes the coding system ISO2022 briefly.
2248    Since the intention of this note is to help understand the
2249    functions in this file, some parts are NOT ACCURATE or are OVERLY
2250    SIMPLIFIED.  For thorough understanding, please refer to the
2251    original document of ISO2022.  This is equivalent to the standard
2252    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2253
2254    ISO2022 provides many mechanisms to encode several character sets
2255    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2256    is encoded using bytes less than 128.  This may make the encoded
2257    text a little bit longer, but the text passes more easily through
2258    several types of gateway, some of which strip off the MSB (Most
2259    Significant Bit).
2260
2261    There are two kinds of character sets: control character sets and
2262    graphic character sets.  The former contain control characters such
2263    as `newline' and `escape' to provide control functions (control
2264    functions are also provided by escape sequences).  The latter
2265    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2266    two control character sets and many graphic character sets.
2267
2268    Graphic character sets are classified into one of the following
2269    four classes, according to the number of bytes (DIMENSION) and
2270    number of characters in one dimension (CHARS) of the set:
2271    - DIMENSION1_CHARS94
2272    - DIMENSION1_CHARS96
2273    - DIMENSION2_CHARS94
2274    - DIMENSION2_CHARS96
2275
2276    In addition, each character set is assigned an identification tag,
2277    unique for each set, called the "final character" (denoted as <F>
2278    hereafter).  The <F> of each character set is decided by ECMA(*)
2279    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2280    (0x30..0x3F are for private use only).
2281
2282    Note (*): ECMA = European Computer Manufacturers Association
2283
2284    Here are examples of graphic character sets [NAME(<F>)]:
2285         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2286         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2287         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2288         o DIMENSION2_CHARS96 -- none for the moment
2289
2290    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2291         C0 [0x00..0x1F] -- control character plane 0
2292         GL [0x20..0x7F] -- graphic character plane 0
2293         C1 [0x80..0x9F] -- control character plane 1
2294         GR [0xA0..0xFF] -- graphic character plane 1
2295
2296    A control character set is directly designated and invoked to C0 or
2297    C1 by an escape sequence.  The most common case is that:
2298    - ISO646's  control character set is designated/invoked to C0, and
2299    - ISO6429's control character set is designated/invoked to C1,
2300    and usually these designations/invocations are omitted in encoded
2301    text.  In a 7-bit environment, only C0 can be used, and a control
2302    character for C1 is encoded by an appropriate escape sequence to
2303    fit into the environment.  All control characters for C1 are
2304    defined to have corresponding escape sequences.
2305
2306    A graphic character set is at first designated to one of four
2307    graphic registers (G0 through G3), then these graphic registers are
2308    invoked to GL or GR.  These designations and invocations can be
2309    done independently.  The most common case is that G0 is invoked to
2310    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2311    these invocations and designations are omitted in encoded text.
2312    In a 7-bit environment, only GL can be used.
2313
2314    When a graphic character set of CHARS94 is invoked to GL, codes
2315    0x20 and 0x7F of the GL area work as control characters SPACE and
2316    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2317    be used.
2318
2319    There are two ways of invocation: locking-shift and single-shift.
2320    With locking-shift, the invocation lasts until the next different
2321    invocation, whereas with single-shift, the invocation affects the
2322    following character only and doesn't affect the locking-shift
2323    state.  Invocations are done by the following control characters or
2324    escape sequences:
2325
2326    ----------------------------------------------------------------------
2327    abbrev  function                  cntrl escape seq   description
2328    ----------------------------------------------------------------------
2329    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2330    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2331    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2332    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2333    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2334    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2335    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2336    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2337    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2338    ----------------------------------------------------------------------
2339    (*) These are not used by any known coding system.
2340
2341    Control characters for these functions are defined by macros
2342    ISO_CODE_XXX in `coding.h'.
2343
2344    Designations are done by the following escape sequences:
2345    ----------------------------------------------------------------------
2346    escape sequence      description
2347    ----------------------------------------------------------------------
2348    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2349    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2350    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2351    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2352    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2353    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2354    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2355    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2356    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2357    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2358    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2359    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2360    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2361    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2362    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2363    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2364    ----------------------------------------------------------------------
2365
2366    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2367    of dimension 1, chars 94, and final character <F>, etc...
2368
2369    Note (*): Although these designations are not allowed in ISO2022,
2370    Emacs accepts them on decoding, and produces them on encoding
2371    CHARS96 character sets in a coding system which is characterized as
2372    7-bit environment, non-locking-shift, and non-single-shift.
2373
2374    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2375    '(' must be omitted.  We refer to this as "short-form" hereafter.
2376
2377    Now you may notice that there are a lot of ways of encoding the
2378    same multilingual text in ISO2022.  Actually, there exist many
2379    coding systems such as Compound Text (used in X11's inter client
2380    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2381    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2382    localized platforms), and all of these are variants of ISO2022.
2383
2384    In addition to the above, Emacs handles two more kinds of escape
2385    sequences: ISO6429's direction specification and Emacs' private
2386    sequence for specifying character composition.
2387
2388    ISO6429's direction specification takes the following form:
2389         o CSI ']'      -- end of the current direction
2390         o CSI '0' ']'  -- end of the current direction
2391         o CSI '1' ']'  -- start of left-to-right text
2392         o CSI '2' ']'  -- start of right-to-left text
2393    The control character CSI (0x9B: control sequence introducer) is
2394    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2395
2396    Character composition specification takes the following form:
2397         o ESC '0' -- start relative composition
2398         o ESC '1' -- end composition
2399         o ESC '2' -- start rule-base composition (*)
2400         o ESC '3' -- start relative composition with alternate chars  (**)
2401         o ESC '4' -- start rule-base composition with alternate chars  (**)
2402   Since these are not standard escape sequences of any ISO standard,
2403   the use of them with these meanings is restricted to Emacs only.
2404
2405   (*) This form is used only in Emacs 20.7 and older versions,
2406   but newer versions can safely decode it.
2407   (**) This form is used only in Emacs 21.1 and newer versions,
2408   and older versions can't decode it.
2409
2410   Here's a list of example usages of these composition escape
2411   sequences (categorized by `enum composition_method').
2412
2413   COMPOSITION_RELATIVE:
2414         ESC 0 CHAR [ CHAR ] ESC 1
2415   COMPOSITION_WITH_RULE:
2416         ESC 2 CHAR [ RULE CHAR ] ESC 1
2417   COMPOSITION_WITH_ALTCHARS:
2418         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2419   COMPOSITION_WITH_RULE_ALTCHARS:
2420         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2421
/* Table giving the ISO-2022 code class of each of the 256 possible
   byte values.  The decoder indexes it with a source byte to choose
   how to handle that byte (see the switch in decode_coding_iso_2022).  */
2422 enum iso_code_class_type iso_code_class[256];
2423
/* Return nonzero if the charset whose ID is ID is safe for CODING,
   i.e. ID lies within the range covered by CODING->safe_charsets and
   the byte stored for it there is a register number rather than the
   filler value 255 that marks unsafe charsets.

   The byte is tested through its high bit so that the answer does not
   depend on whether plain `char' is signed, and a negative ID (e.g. a
   failed charset table lookup) is rejected instead of being used as
   an index.  Note that ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && ((coding)->safe_charsets[id] & 0x80) == 0)
2427
2428
/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   registered for CATEGORY is non-negative.
   NOTE(review): presumably this means some charset is initially
   designated to G1, so that a shift-out is meaningful for that
   category -- confirm against the definition of CODING_ISO_INITIAL.  */
2429 #define SHIFT_OUT_OK(category)  \
2430   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2431
2432 static void
2433 setup_iso_safe_charsets (attrs)
2434      Lisp_Object attrs;
2435 {
2436   Lisp_Object charset_list, safe_charsets;
2437   Lisp_Object request;
2438   Lisp_Object reg_usage;
2439   Lisp_Object tail;
2440   int reg94, reg96;
2441   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2442   int max_charset_id;
2443
2444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2445   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2446       && ! EQ (charset_list, Viso_2022_charset_list))
2447     {
2448       CODING_ATTR_CHARSET_LIST (attrs)
2449         = charset_list = Viso_2022_charset_list;
2450       ASET (attrs, coding_attr_safe_charsets, Qnil);
2451     }
2452
2453   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2454     return;
2455
2456   max_charset_id = 0;
2457   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2458     {
2459       int id = XINT (XCAR (tail));
2460       if (max_charset_id < id)
2461         max_charset_id = id;
2462     }
2463
2464   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2465                                 make_number (255));
2466   request = AREF (attrs, coding_attr_iso_request);
2467   reg_usage = AREF (attrs, coding_attr_iso_usage);
2468   reg94 = XINT (XCAR (reg_usage));
2469   reg96 = XINT (XCDR (reg_usage));
2470
2471   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2472     {
2473       Lisp_Object id;
2474       Lisp_Object reg;
2475       struct charset *charset;
2476
2477       id = XCAR (tail);
2478       charset = CHARSET_FROM_ID (XINT (id));
2479       reg = Fcdr (Fassq (id, request));
2480       if (! NILP (reg))
2481         SSET (safe_charsets, XINT (id), XINT (reg));
2482       else if (charset->iso_chars_96)
2483         {
2484           if (reg96 < 4)
2485             SSET (safe_charsets, XINT (id), reg96);
2486         }
2487       else
2488         {
2489           if (reg94 < 4)
2490             SSET (safe_charsets, XINT (id), reg94);
2491         }
2492     }
2493   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2494 }
2495
2496
2497 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2498    Check if a text is encoded in one of ISO-2022 based codig systems.
2499    If it is, return 1, else return 0.  */
2500
2501 static int
2502 detect_coding_iso_2022 (coding, detect_info)
2503      struct coding_system *coding;
2504      struct coding_detection_info *detect_info;
2505 {
2506   const unsigned char *src = coding->source, *src_base = src;
2507   const unsigned char *src_end = coding->source + coding->src_bytes;
2508   int multibytep = coding->src_multibyte;
2509   int single_shifting = 0;
2510   int id;
2511   int c, c1;
2512   int consumed_chars = 0;
2513   int i;
2514   int rejected = 0;
2515   int found = 0;
2516
2517   detect_info->checked |= CATEGORY_MASK_ISO;
2518
2519   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2520     {
2521       struct coding_system *this = &(coding_categories[i]);
2522       Lisp_Object attrs, val;
2523
2524       attrs = CODING_ID_ATTRS (this->id);
2525       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2526           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2527         setup_iso_safe_charsets (attrs);
2528       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2529       this->max_charset_id = SCHARS (val) - 1;
2530       this->safe_charsets = (char *) SDATA (val);
2531     }
2532
2533   /* A coding system of this category is always ASCII compatible.  */
2534   src += coding->head_ascii;
2535
2536   while (rejected != CATEGORY_MASK_ISO)
2537     {
2538       src_base = src;
2539       ONE_MORE_BYTE (c);
2540       switch (c)
2541         {
2542         case ISO_CODE_ESC:
2543           if (inhibit_iso_escape_detection)
2544             break;
2545           single_shifting = 0;
2546           ONE_MORE_BYTE (c);
2547           if (c >= '(' && c <= '/')
2548             {
2549               /* Designation sequence for a charset of dimension 1.  */
2550               ONE_MORE_BYTE (c1);
2551               if (c1 < ' ' || c1 >= 0x80
2552                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2553                 /* Invalid designation sequence.  Just ignore.  */
2554                 break;
2555             }
2556           else if (c == '$')
2557             {
2558               /* Designation sequence for a charset of dimension 2.  */
2559               ONE_MORE_BYTE (c);
2560               if (c >= '@' && c <= 'B')
2561                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2562                 id = iso_charset_table[1][0][c];
2563               else if (c >= '(' && c <= '/')
2564                 {
2565                   ONE_MORE_BYTE (c1);
2566                   if (c1 < ' ' || c1 >= 0x80
2567                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2568                     /* Invalid designation sequence.  Just ignore.  */
2569                     break;
2570                 }
2571               else
2572                 /* Invalid designation sequence.  Just ignore it.  */
2573                 break;
2574             }
2575           else if (c == 'N' || c == 'O')
2576             {
2577               /* ESC <Fe> for SS2 or SS3.  */
2578               single_shifting = 1;
2579               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2580               break;
2581             }
2582           else if (c >= '0' && c <= '4')
2583             {
2584               /* ESC <Fp> for start/end composition.  */
2585               found |= CATEGORY_MASK_ISO;
2586               break;
2587             }
2588           else
2589             {
2590               /* Invalid escape sequence.  Just ignore it.  */
2591               break;
2592             }
2593
2594           /* We found a valid designation sequence for CHARSET.  */
2595           rejected |= CATEGORY_MASK_ISO_8BIT;
2596           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2597                               id))
2598             found |= CATEGORY_MASK_ISO_7;
2599           else
2600             rejected |= CATEGORY_MASK_ISO_7;
2601           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2602                               id))
2603             found |= CATEGORY_MASK_ISO_7_TIGHT;
2604           else
2605             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2606           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2607                               id))
2608             found |= CATEGORY_MASK_ISO_7_ELSE;
2609           else
2610             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2611           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2612                               id))
2613             found |= CATEGORY_MASK_ISO_8_ELSE;
2614           else
2615             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2616           break;
2617
2618         case ISO_CODE_SO:
2619         case ISO_CODE_SI:
2620           /* Locking shift out/in.  */
2621           if (inhibit_iso_escape_detection)
2622             break;
2623           single_shifting = 0;
2624           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2625           found |= CATEGORY_MASK_ISO_ELSE;
2626           break;
2627
2628         case ISO_CODE_CSI:
2629           /* Control sequence introducer.  */
2630           single_shifting = 0;
2631           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2632           found |= CATEGORY_MASK_ISO_8_ELSE;
2633           goto check_extra_latin;
2634
2635         case ISO_CODE_SS2:
2636         case ISO_CODE_SS3:
2637           /* Single shift.   */
2638           if (inhibit_iso_escape_detection)
2639             break;
2640           single_shifting = 0;
2641           rejected |= CATEGORY_MASK_ISO_7BIT;
2642           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2643               & CODING_ISO_FLAG_SINGLE_SHIFT)
2644             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2645           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2646               & CODING_ISO_FLAG_SINGLE_SHIFT)
2647             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2648           if (single_shifting)
2649             break;
2650           goto check_extra_latin;
2651
2652         default:
2653           if (c < 0)
2654             continue;
2655           if (c < 0x80)
2656             {
2657               single_shifting = 0;
2658               break;
2659             }
2660           if (c >= 0xA0)
2661             {
2662               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2663               found |= CATEGORY_MASK_ISO_8_1;
2664               /* Check the length of succeeding codes of the range
2665                  0xA0..0FF.  If the byte length is even, we include
2666                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2667                  only when we are not single shifting.  */
2668               if (! single_shifting
2669                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2670                 {
2671                   int i = 1;
2672                   while (src < src_end)
2673                     {
2674                       ONE_MORE_BYTE (c);
2675                       if (c < 0xA0)
2676                         break;
2677                       i++;
2678                     }
2679
2680                   if (i & 1 && src < src_end)
2681                     rejected |= CATEGORY_MASK_ISO_8_2;
2682                   else
2683                     found |= CATEGORY_MASK_ISO_8_2;
2684                 }
2685               break;
2686             }
2687         check_extra_latin:
2688           single_shifting = 0;
2689           if (! VECTORP (Vlatin_extra_code_table)
2690               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2691             {
2692               rejected = CATEGORY_MASK_ISO;
2693               break;
2694             }
2695           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2696               & CODING_ISO_FLAG_LATIN_EXTRA)
2697             found |= CATEGORY_MASK_ISO_8_1;
2698           else
2699             rejected |= CATEGORY_MASK_ISO_8_1;
2700           rejected |= CATEGORY_MASK_ISO_8_2;
2701         }
2702     }
2703   detect_info->rejected |= CATEGORY_MASK_ISO;
2704   return 0;
2705
2706  no_more_source:
2707   detect_info->rejected |= rejected;
2708   detect_info->found |= (found & ~rejected);
2709   return 1;
2710 }
2711
2712
/* Record in CODING that the charset selected by DIM, CHARS_96 and the
   final byte FINAL is designated to graphic register REG.

   If FINAL is below '0' or not below 128, or selects no charset, or
   selects a charset that is not safe for CODING, register REG is
   marked with -2 instead and CHARS_96 is set to -1, meaning that the
   escape sequence should be kept.  CHARS_96 is also set to -1 when
   ASCII gets designated to a register that was marked with -2.

   JISX0201-Roman is replaced by ASCII, and JISX0208.1978 by JISX0208,
   when CODING has CODING_ISO_FLAG_USE_ROMAN or
   CODING_ISO_FLAG_USE_OLDJIS respectively.

   CHARS_96 must be an lvalue; `coding' is taken from the place of
   expansion.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if (final < '0' || final >= 128                                     \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        /* Invalid or unsafe designation.  */                           \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        prev = CODING_ISO_DESIGNATION (coding, reg);                    \
        if (id == charset_jisx0201_roman)                               \
          id = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
                ? charset_ascii : id);                                  \
        else if (id == charset_jisx0208_1978)                           \
          id = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
                ? charset_jisx0208 : id);                               \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* Designating ASCII to a register that held an invalid         \
           designation ends the stretch of kept sequences, so this      \
           sequence must be kept as well.  */                           \
        if (id == charset_ascii && prev == -2)                          \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
2745
2746
/* If a composition is in progress, end it without recording any
   composition: copy the elements collected in `components' to CHARBUF
   as ordinary characters, advance `char_offset', and reset
   `composition_state' to COMPOSING_NO.  For COMPOSITION_RELATIVE and
   COMPOSITION_WITH_ALTCHARS every element is copied; for the other
   methods only every second element is.  Jump to `no_more_source' if
   CHARBUF cannot take `component_idx' more elements.  Do nothing when
   no composition is in progress.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (composition_state != COMPOSING_NO)                      \
      {                                                         \
        int i, stride;                                          \
                                                                \
        /* Make sure there is room for producing the            \
           characters stored in the table `components'.  */     \
        if (charbuf + component_idx > charbuf_end)              \
          goto no_more_source;                                  \
        composition_state = COMPOSING_NO;                       \
        stride = ((method == COMPOSITION_RELATIVE               \
                   || method == COMPOSITION_WITH_ALTCHARS)      \
                  ? 1 : 2);                                     \
        for (i = 0; i < component_idx; i += stride)             \
          *charbuf++ = components[i];                           \
        char_offset += (stride == 1                             \
                        ? component_idx                         \
                        : (component_idx / 2) + 1);             \
      }                                                         \
  } while (0)
2771
2772
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  An ESC 0 seen while the state is
   COMPOSING_COMPONENT_RULE only marks the end of the component part:
   the number of components is remembered in `component_len' and the
   state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished first, and the
   rest of the source is searched for the terminating ESC 1.  If it is
   missing, jump to `invalid_code' when this is the last block and to
   `no_more_source' otherwise.  If it is present, set `method' and
   `composition_state' from C1 and reset the component counters.
   Also jump to `no_more_source' if CHARBUF has no room for
   MAX_COMPOSITION_COMPONENTS more elements.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        /* Look ahead for the terminating ESC 1.  */                    \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* P stops at SRC_END - 1 when the search fails, but it is      \
           already at SRC_END when no byte follows the start            \
           sequence; both cases mean that there is no terminator.  */   \
        if (p >= src_end - 1)                                           \
          {                                                             \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2815
2816
/* Handle composition end sequence ESC 1.

   Add composition data for the collected elements to CHARBUF by
   ADD_COMPOSITION_DATA.  For every method but COMPOSITION_RELATIVE,
   the components follow (the first `component_len' elements of
   `components', or all of them if `component_len' is 0), and the
   negated number of elements written so far is stored in the element
   CHARBUF pointed to on entry.  Then the characters are copied,
   `char_offset' advancing by one for each: every second element for
   COMPOSITION_WITH_RULE, otherwise the elements from `component_len'
   on.  Finally mark CODING as annotated and reset `composition_state'
   to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int nchars;                                                         \
    int i;                                                              \
    int *saved_charbuf = charbuf;                                       \
                                                                        \
    if (component_len > 0)                                              \
      nchars = component_idx - component_len;                           \
    else if (method == COMPOSITION_RELATIVE)                            \
      nchars = component_idx;                                           \
    else                                                                \
      nchars = (component_idx + 1) / 2;                                 \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int ncomponents = (component_len == 0                           \
                           ? component_idx : component_len);            \
                                                                        \
        for (i = 0; i < ncomponents; i++)                               \
          *charbuf++ = components[i];                                   \
        *saved_charbuf = saved_charbuf - charbuf;                       \
      }                                                                 \
    {                                                                   \
      int first = (method == COMPOSITION_WITH_RULE ? 0 : component_len); \
      int stride = (method == COMPOSITION_WITH_RULE ? 2 : 1);           \
                                                                        \
      for (i = first; i < component_idx; i += stride, char_offset++)    \
        *charbuf++ = components[i];                                     \
    }                                                                   \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
2847
2848
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC, read into `c2' of the place of expansion) and leave the
   encoded composition rule in C1.  C1 is set to 0 if it is out of the
   range of both rule formats.  C1 must be an lvalue.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
      }                                                                 \
  } while (0)
2872
2873
2874 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2875
2876 static void
2877 decode_coding_iso_2022 (coding)
2878      struct coding_system *coding;
2879 {
2880   const unsigned char *src = coding->source + coding->consumed;
2881   const unsigned char *src_end = coding->source + coding->src_bytes;
2882   const unsigned char *src_base;
2883   int *charbuf = coding->charbuf + coding->charbuf_used;
2884   int *charbuf_end
2885     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2886   int consumed_chars = 0, consumed_chars_base;
2887   int multibytep = coding->src_multibyte;
2888   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2889   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2890   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2891   int charset_id_2, charset_id_3;
2892   struct charset *charset;
2893   int c;
2894   /* For handling composition sequence.  */
2895 #define COMPOSING_NO                    0
2896 #define COMPOSING_CHAR                  1
2897 #define COMPOSING_RULE                  2
2898 #define COMPOSING_COMPONENT_CHAR        3
2899 #define COMPOSING_COMPONENT_RULE        4
2900
2901   int composition_state = COMPOSING_NO;
2902   enum composition_method method;
2903   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2904   int component_idx;
2905   int component_len;
2906   Lisp_Object attrs, charset_list;
2907   int char_offset = coding->produced_char;
2908   int last_offset = char_offset;
2909   int last_id = charset_ascii;
2910
2911   CODING_GET_INFO (coding, attrs, charset_list);
2912   setup_iso_safe_charsets (attrs);
2913
2914   while (1)
2915     {
2916       int c1, c2;
2917
2918       src_base = src;
2919       consumed_chars_base = consumed_chars;
2920
2921       if (charbuf >= charbuf_end)
2922         break;
2923
2924       ONE_MORE_BYTE (c1);
2925       if (c1 < 0)
2926         goto invalid_code;
2927
2928       /* We produce at most one character.  */
2929       switch (iso_code_class [c1])
2930         {
2931         case ISO_0x20_or_0x7F:
2932           if (composition_state != COMPOSING_NO)
2933             {
2934               if (composition_state == COMPOSING_RULE
2935                   || composition_state == COMPOSING_COMPONENT_RULE)
2936                 {
2937                   DECODE_COMPOSITION_RULE (c1);
2938                   components[component_idx++] = c1;
2939                   composition_state--;
2940                   continue;
2941                 }
2942             }
2943           if (charset_id_0 < 0
2944               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2945             /* This is SPACE or DEL.  */
2946             charset = CHARSET_FROM_ID (charset_ascii);
2947           else
2948             charset = CHARSET_FROM_ID (charset_id_0);
2949           break;
2950
2951         case ISO_graphic_plane_0:
2952           if (composition_state != COMPOSING_NO)
2953             {
2954               if (composition_state == COMPOSING_RULE
2955                   || composition_state == COMPOSING_COMPONENT_RULE)
2956                 {
2957                   DECODE_COMPOSITION_RULE (c1);
2958                   components[component_idx++] = c1;
2959                   composition_state--;
2960                   continue;
2961                 }
2962             }
2963           if (charset_id_0 < 0)
2964             charset = CHARSET_FROM_ID (charset_ascii);
2965           else
2966             charset = CHARSET_FROM_ID (charset_id_0);
2967           break;
2968
2969         case ISO_0xA0_or_0xFF:
2970           if (charset_id_1 < 0
2971               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2972               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2973             goto invalid_code;
2974           /* This is a graphic character, we fall down ... */
2975
2976         case ISO_graphic_plane_1:
2977           if (charset_id_1 < 0)
2978             goto invalid_code;
2979           charset = CHARSET_FROM_ID (charset_id_1);
2980           break;
2981
2982         case ISO_control_0:
2983           MAYBE_FINISH_COMPOSITION ();
2984           charset = CHARSET_FROM_ID (charset_ascii);
2985           break;
2986
2987         case ISO_control_1:
2988           MAYBE_FINISH_COMPOSITION ();
2989           goto invalid_code;
2990
2991         case ISO_shift_out:
2992           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2993               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2994             goto invalid_code;
2995           CODING_ISO_INVOCATION (coding, 0) = 1;
2996           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2997           continue;
2998
2999         case ISO_shift_in:
3000           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3001             goto invalid_code;
3002           CODING_ISO_INVOCATION (coding, 0) = 0;
3003           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3004           continue;
3005
3006         case ISO_single_shift_2_7:
3007         case ISO_single_shift_2:
3008           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3009             goto invalid_code;
3010           /* SS2 is handled as an escape sequence of ESC 'N' */
3011           c1 = 'N';
3012           goto label_escape_sequence;
3013
3014         case ISO_single_shift_3:
3015           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3016             goto invalid_code;
3017           /* SS3 is handled as an escape sequence of ESC 'O' */
3018           c1 = 'O';
3019           goto label_escape_sequence;
3020
3021         case ISO_control_sequence_introducer:
3022           /* CSI is handled as an escape sequence of ESC '[' ...  */
3023           c1 = '[';
3024           goto label_escape_sequence;
3025
3026         case ISO_escape:
3027           ONE_MORE_BYTE (c1);
3028         label_escape_sequence:
3029           /* Escape sequences handled here are invocation,
3030              designation, direction specification, and character
3031              composition specification.  */
3032           switch (c1)
3033             {
3034             case '&':           /* revision of following character set */
3035               ONE_MORE_BYTE (c1);
3036               if (!(c1 >= '@' && c1 <= '~'))
3037                 goto invalid_code;
3038               ONE_MORE_BYTE (c1);
3039               if (c1 != ISO_CODE_ESC)
3040                 goto invalid_code;
3041               ONE_MORE_BYTE (c1);
3042               goto label_escape_sequence;
3043
3044             case '$':           /* designation of 2-byte character set */
3045               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3046                 goto invalid_code;
3047               {
3048                 int reg, chars96;
3049
3050                 ONE_MORE_BYTE (c1);
3051                 if (c1 >= '@' && c1 <= 'B')
3052                   {     /* designation of JISX0208.1978, GB2312.1980,
3053                            or JISX0208.1980 */
3054                     reg = 0, chars96 = 0;
3055                   }
3056                 else if (c1 >= 0x28 && c1 <= 0x2B)
3057                   { /* designation of DIMENSION2_CHARS94 character set */
3058                     reg = c1 - 0x28, chars96 = 0;
3059                     ONE_MORE_BYTE (c1);
3060                   }
3061                 else if (c1 >= 0x2C && c1 <= 0x2F)
3062                   { /* designation of DIMENSION2_CHARS96 character set */
3063                     reg = c1 - 0x2C, chars96 = 1;
3064                     ONE_MORE_BYTE (c1);
3065                   }
3066                 else
3067                   goto invalid_code;
3068                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3069                 /* We must update these variables now.  */
3070                 if (reg == 0)
3071                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3072                 else if (reg == 1)
3073                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3074                 if (chars96 < 0)
3075                   goto invalid_code;
3076               }
3077               continue;
3078
3079             case 'n':           /* invocation of locking-shift-2 */
3080               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3081                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3082                 goto invalid_code;
3083               CODING_ISO_INVOCATION (coding, 0) = 2;
3084               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3085               continue;
3086
3087             case 'o':           /* invocation of locking-shift-3 */
3088               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3089                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3090                 goto invalid_code;
3091               CODING_ISO_INVOCATION (coding, 0) = 3;
3092               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3093               continue;
3094
3095             case 'N':           /* invocation of single-shift-2 */
3096               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3097                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3098                 goto invalid_code;
3099               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3100               if (charset_id_2 < 0)
3101                 charset = CHARSET_FROM_ID (charset_ascii);
3102               else
3103                 charset = CHARSET_FROM_ID (charset_id_2);
3104               ONE_MORE_BYTE (c1);
3105               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3106                 goto invalid_code;
3107               break;
3108
3109             case 'O':           /* invocation of single-shift-3 */
3110               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3111                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3112                 goto invalid_code;
3113               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3114               if (charset_id_3 < 0)
3115                 charset = CHARSET_FROM_ID (charset_ascii);
3116               else
3117                 charset = CHARSET_FROM_ID (charset_id_3);
3118               ONE_MORE_BYTE (c1);
3119               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3120                 goto invalid_code;
3121               break;
3122
3123             case '0': case '2': case '3': case '4': /* start composition */
3124               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3125                 goto invalid_code;
3126               DECODE_COMPOSITION_START (c1);
3127               continue;
3128
3129             case '1':           /* end composition */
3130               if (composition_state == COMPOSING_NO)
3131                 goto invalid_code;
3132               DECODE_COMPOSITION_END ();
3133               continue;
3134
3135             case '[':           /* specification of direction */
3136               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3137                 goto invalid_code;
3138               /* For the moment, nested direction is not supported.
3139                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3140                  left-to-right, and nonzero means right-to-left.  */
3141               ONE_MORE_BYTE (c1);
3142               switch (c1)
3143                 {
3144                 case ']':       /* end of the current direction */
3145                   coding->mode &= ~CODING_MODE_DIRECTION;
3146
3147                 case '0':       /* end of the current direction */
3148                 case '1':       /* start of left-to-right direction */
3149                   ONE_MORE_BYTE (c1);
3150                   if (c1 == ']')
3151                     coding->mode &= ~CODING_MODE_DIRECTION;
3152                   else
3153                     goto invalid_code;
3154                   break;
3155
3156                 case '2':       /* start of right-to-left direction */
3157                   ONE_MORE_BYTE (c1);
3158                   if (c1 == ']')
3159                     coding->mode |= CODING_MODE_DIRECTION;
3160                   else
3161                     goto invalid_code;
3162                   break;
3163
3164                 default:
3165                   goto invalid_code;
3166                 }
3167               continue;
3168
3169             case '%':
3170               ONE_MORE_BYTE (c1);
3171               if (c1 == '/')
3172                 {
3173                   /* CTEXT extended segment:
3174                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3175                      We keep these bytes as is for the moment.
3176                      They may be decoded by post-read-conversion.  */
3177                   int dim, M, L;
3178                   int size;
3179
3180                   ONE_MORE_BYTE (dim);
3181                   ONE_MORE_BYTE (M);
3182                   ONE_MORE_BYTE (L);
3183                   size = ((M - 128) * 128) + (L - 128);
3184                   if (charbuf + 8 + size > charbuf_end)
3185                     goto break_loop;
3186                   *charbuf++ = ISO_CODE_ESC;
3187                   *charbuf++ = '%';
3188                   *charbuf++ = '/';
3189                   *charbuf++ = dim;
3190                   *charbuf++ = BYTE8_TO_CHAR (M);
3191                   *charbuf++ = BYTE8_TO_CHAR (L);
3192                   while (size-- > 0)
3193                     {
3194                       ONE_MORE_BYTE (c1);
3195                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3196                     }
3197                 }
3198               else if (c1 == 'G')
3199                 {
3200                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3201                      ESC % G --UTF-8-BYTES-- ESC % @
3202                      We keep these bytes as is for the moment.
3203                      They may be decoded by post-read-conversion.  */
3204                   int *p = charbuf;
3205
3206                   if (p + 6 > charbuf_end)
3207                     goto break_loop;
3208                   *p++ = ISO_CODE_ESC;
3209                   *p++ = '%';
3210                   *p++ = 'G';
3211                   while (p < charbuf_end)
3212                     {
3213                       ONE_MORE_BYTE (c1);
3214                       if (c1 == ISO_CODE_ESC
3215                           && src + 1 < src_end
3216                           && src[0] == '%'
3217                           && src[1] == '@')
3218                         {
3219                           src += 2;
3220                           break;
3221                         }
3222                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3223                     }
3224                   if (p + 3 > charbuf_end)
3225                     goto break_loop;
3226                   *p++ = ISO_CODE_ESC;
3227                   *p++ = '%';
3228                   *p++ = '@';
3229                   charbuf = p;
3230                 }
3231               else
3232                 goto invalid_code;
3233               continue;
3234               break;
3235
3236             default:
3237               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3238                 goto invalid_code;
3239               {
3240                 int reg, chars96;
3241
3242                 if (c1 >= 0x28 && c1 <= 0x2B)
3243                   { /* designation of DIMENSION1_CHARS94 character set */
3244                     reg = c1 - 0x28, chars96 = 0;
3245                     ONE_MORE_BYTE (c1);
3246                   }
3247                 else if (c1 >= 0x2C && c1 <= 0x2F)
3248                   { /* designation of DIMENSION1_CHARS96 character set */
3249                     reg = c1 - 0x2C, chars96 = 1;
3250                     ONE_MORE_BYTE (c1);
3251                   }
3252                 else
3253                   goto invalid_code;
3254                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3255                 /* We must update these variables now.  */
3256                 if (reg == 0)
3257                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3258                 else if (reg == 1)
3259                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3260                 if (chars96 < 0)
3261                   goto invalid_code;
3262               }
3263               continue;
3264             }
3265         }
3266
3267       if (charset->id != charset_ascii
3268           && last_id != charset->id)
3269         {
3270           if (last_id != charset_ascii)
3271             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3272           last_id = charset->id;
3273           last_offset = char_offset;
3274         }
3275
3276       /* Now we know CHARSET and 1st position code C1 of a character.
3277          Produce a decoded character while getting 2nd position code
3278          C2 if necessary.  */
3279       c1 &= 0x7F;
3280       if (CHARSET_DIMENSION (charset) > 1)
3281         {
3282           ONE_MORE_BYTE (c2);
3283           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3284             /* C2 is not in a valid range.  */
3285             goto invalid_code;
3286           c1 = (c1 << 8) | (c2 & 0x7F);
3287           if (CHARSET_DIMENSION (charset) > 2)
3288             {
3289               ONE_MORE_BYTE (c2);
3290               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3291                 /* C2 is not in a valid range.  */
3292                 goto invalid_code;
3293               c1 = (c1 << 8) | (c2 & 0x7F);
3294             }
3295         }
3296
3297       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3298       if (c < 0)
3299         {
3300           MAYBE_FINISH_COMPOSITION ();
3301           for (; src_base < src; src_base++, char_offset++)
3302             {
3303               if (ASCII_BYTE_P (*src_base))
3304                 *charbuf++ = *src_base;
3305               else
3306                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3307             }
3308         }
3309       else if (composition_state == COMPOSING_NO)
3310         {
3311           *charbuf++ = c;
3312           char_offset++;
3313         }
3314       else
3315         {
3316           components[component_idx++] = c;
3317           if (method == COMPOSITION_WITH_RULE
3318               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3319                   && composition_state == COMPOSING_COMPONENT_CHAR))
3320             composition_state++;
3321         }
3322       continue;
3323
3324     invalid_code:
3325       MAYBE_FINISH_COMPOSITION ();
3326       src = src_base;
3327       consumed_chars = consumed_chars_base;
3328       ONE_MORE_BYTE (c);
3329       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3330       char_offset++;
3331       coding->errors++;
3332       continue;
3333
3334     break_loop:
3335       break;
3336     }
3337
3338  no_more_source:
3339   if (last_id != charset_ascii)
3340     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3341   coding->consumed_char += consumed_chars_base;
3342   coding->consumed = src_base - coding->source;
3343   coding->charbuf_used = charbuf - coding->charbuf;
3344 }
3345
3346
3347 /* ISO2022 encoding stuff.  */
3348
3349 /*
3350    It is not enough to say just "ISO2022" on encoding, we have to
3351    specify more details.  In Emacs, each coding system of ISO2022
3352    variant has the following specifications:
3353         1. Initial designation to G0 thru G3.
3354         2. Allows short-form designation?
3355         3. ASCII should be designated to G0 before control characters?
3356         4. ASCII should be designated to G0 at end of line?
3357         5. 7-bit environment or 8-bit environment?
3358         6. Use locking-shift?
3359         7. Use Single-shift?
3360    And the following two are only for Japanese:
3361         8. Use ASCII in place of JIS0201-1976-Roman?
3362         9. Use JISX0208-1983 in place of JISX0208-1978?
3363    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3364    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3365    details.
3366 */
3367
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG, incrementing DST, and record the new designation in
   CODING.  If CODING asks for revision numbers and CHARSET has one,
   `ESC & <revision>' is emitted first.  For a multi-byte 94-charset
   whose <final-char> is '@', 'A', or 'B' and which goes to register
   0, the intermediate character is left out (short form) unless
   CODING_ISO_FLAG_LONG_FORM is set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    /* Revision number to announce, or -1 for none.  */                 \
    int revision = ((CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_REVISION)                        \
                    ? CHARSET_ISO_REVISION (charset) : -1);             \
    /* Intermediate character selecting REG for this charset.  */       \
    int c = (CHARSET_ISO_CHARS_96 (charset)                             \
             ? intermediate_char_96[reg]                                \
             : intermediate_char_94[reg]);                              \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only the short form omits the intermediate character.  */    \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (c);                                    \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (c);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3415
3416
3417 /* The following two macros produce codes (control character or escape
3418    sequence) for ISO2022 single-shift functions (single-shift-2 and
3419    single-shift-3).  */
3420
/* Announce a single shift to graphic register 2: the SS2 control byte
   in an 8-bit environment, `ESC N' when CODING is restricted to seven
   bits.  CODING is then marked as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3429
3430
/* Announce a single shift to graphic register 3: the SS3 control byte
   in an 8-bit environment, `ESC O' when CODING is restricted to seven
   bits.  CODING is then marked as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3439
3440
3441 /* The following four macros produce codes (control character or
3442    escape sequence) for ISO2022 locking-shift functions (shift-in,
3443    shift-out, locking-shift-2, and locking-shift-3).  */
3444
/* Shift-in: emit SI and record that graphic register 0 is now invoked
   to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
3450
3451
/* Shift-out: emit SO and record that graphic register 1 is now invoked
   to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
3457
3458
/* Locking-shift-2: emit `ESC n' and record that graphic register 2 is
   now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
3464
3465
/* Locking-shift-3: emit `ESC o' and record that graphic register 3 is
   now invoked to graphic plane 0.  The final byte must be 'o'; 'n' is
   locking-shift-2, and the decoder would then invoke register 2.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3471
3472
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING_ISO_FLAG_USE_ROMAN is set
   and CHARSET is ASCII, it is replaced by the JISX0201-Roman charset.
   C1 may be any integer expression; it is parenthesized wherever it
   is expanded.  The macro reads and updates the enclosing `coding',
   `dst' and `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just emitted; it covers this one          \
           character only, so clear the state afterwards.  */           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3515
3516
/* Produce codes for a DIMENSION2 character of CHARSET whose position
   codes are C1 and C2, emitting any designation and invocation codes
   that are needed first.  CHARSET must be an lvalue: when
   CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is charset_jisx0208,
   it is replaced by charset_jisx0208_1978.  The macro reads and
   updates the enclosing `coding', `dst' and `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
          }                                                             \
        else                                                            \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
          }                                                             \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Designate and/or invoke it, then go around again to          \
           emit the character.  */                                      \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
    break;                                                              \
  } while (1)
3559
3560
/* Encode character C of CHARSET: obtain its code with ENCODE_CHAR and
   hand it to the DIMENSION1 macro when CHARSET has dimension 1, or
   otherwise to the DIMENSION2 macro, split into the bits above the
   low byte and the low byte itself.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,             \
                                         code & 0xFF);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
3570
3571
3572 /* Produce designation and invocation codes at a place pointed by DST
3573    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3574    Return new DST.  */
3575
3576 unsigned char *
3577 encode_invocation_designation (charset, coding, dst, p_nchars)
3578      struct charset *charset;
3579      struct coding_system *coding;
3580      unsigned char *dst;
3581      int *p_nchars;
3582 {
3583   int multibytep = coding->dst_multibyte;
3584   int produced_chars = *p_nchars;
3585   int reg;                      /* graphic register number */
3586   int id = CHARSET_ID (charset);
3587
3588   /* At first, check designations.  */
3589   for (reg = 0; reg < 4; reg++)
3590     if (id == CODING_ISO_DESIGNATION (coding, reg))
3591       break;
3592
3593   if (reg >= 4)
3594     {
3595       /* CHARSET is not yet designated to any graphic registers.  */
3596       /* At first check the requested designation.  */
3597       reg = CODING_ISO_REQUEST (coding, id);
3598       if (reg < 0)
3599         /* Since CHARSET requests no special designation, designate it
3600            to graphic register 0.  */
3601         reg = 0;
3602
3603       ENCODE_DESIGNATION (charset, reg, coding);
3604     }
3605
3606   if (CODING_ISO_INVOCATION (coding, 0) != reg
3607       && CODING_ISO_INVOCATION (coding, 1) != reg)
3608     {
3609       /* Since the graphic register REG is not invoked to any graphic
3610          planes, invoke it to graphic plane 0.  */
3611       switch (reg)
3612         {
3613         case 0:                 /* graphic register 0 */
3614           ENCODE_SHIFT_IN;
3615           break;
3616
3617         case 1:                 /* graphic register 1 */
3618           ENCODE_SHIFT_OUT;
3619           break;
3620
3621         case 2:                 /* graphic register 2 */
3622           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3623             ENCODE_SINGLE_SHIFT_2;
3624           else
3625             ENCODE_LOCKING_SHIFT_2;
3626           break;
3627
3628         case 3:                 /* graphic register 3 */
3629           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3630             ENCODE_SINGLE_SHIFT_3;
3631           else
3632             ENCODE_LOCKING_SHIFT_3;
3633           break;
3634         }
3635     }
3636
3637   *p_nchars = produced_chars;
3638   return dst;
3639 }
3640
3641 /* The following three macros produce codes for indicating direction
3642    of text.  */
/* Emit a control sequence introducer: `ESC [' when CODING is
   restricted to seven bits, the single CSI control byte otherwise.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    /* Test the flag bit; other flag bits may be set as well.  */       \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3650
3651
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by `2 ]'.  The introducer macro takes
   no argument, so it is expanded without one.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3657
3658
/* Emit the sequence that ends the current direction and returns to
   left-to-right text: a control sequence introducer followed by
   `0 ]'.  The introducer macro takes no argument, so it is expanded
   without one.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3664
3665
/* Produce the invocation and designation codes that bring the graphic
   planes and registers back to their initial state: shift in if
   graphic plane 0 does not hold register 0, then re-designate every
   register whose current charset differs from its initial one.
   Registers with no initial charset are left alone.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* Nothing to restore for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
3684
3685
3686 /* Produce designation sequences of charsets in the line started from
3687    SRC to a place pointed by DST, and return updated DST.
3688
3689    If the current block ends before any end-of-line, we may fail to
3690    find all the necessary designations.  */
3691
3692 static unsigned char *
3693 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3694      struct coding_system *coding;
3695      int *charbuf, *charbuf_end;
3696      unsigned char *dst;
3697 {
3698   struct charset *charset;
3699   /* Table of charsets to be designated to each graphic register.  */
3700   int r[4];
3701   int c, found = 0, reg;
3702   int produced_chars = 0;
3703   int multibytep = coding->dst_multibyte;
3704   Lisp_Object attrs;
3705   Lisp_Object charset_list;
3706
3707   attrs = CODING_ID_ATTRS (coding->id);
3708   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3709   if (EQ (charset_list, Qiso_2022))
3710     charset_list = Viso_2022_charset_list;
3711
3712   for (reg = 0; reg < 4; reg++)
3713     r[reg] = -1;
3714
3715   while (found < 4)
3716     {
3717       int id;
3718
3719       c = *charbuf++;
3720       if (c == '\n')
3721         break;
3722       charset = char_charset (c, charset_list, NULL);
3723       id = CHARSET_ID (charset);
3724       reg = CODING_ISO_REQUEST (coding, id);
3725       if (reg >= 0 && r[reg] < 0)
3726         {
3727           found++;
3728           r[reg] = id;
3729         }
3730     }
3731
3732   if (found)
3733     {
3734       for (reg = 0; reg < 4; reg++)
3735         if (r[reg] >= 0
3736             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3737           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3738     }
3739
3740   return dst;
3741 }
3742
3743 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3744
3745 static int
3746 encode_coding_iso_2022 (coding)
3747      struct coding_system *coding;
3748 {
3749   int multibytep = coding->dst_multibyte;
3750   int *charbuf = coding->charbuf;
3751   int *charbuf_end = charbuf + coding->charbuf_used;
3752   unsigned char *dst = coding->destination + coding->produced;
3753   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3754   int safe_room = 16;
3755   int bol_designation
3756     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3757        && CODING_ISO_BOL (coding));
3758   int produced_chars = 0;
3759   Lisp_Object attrs, eol_type, charset_list;
3760   int ascii_compatible;
3761   int c;
3762   int preferred_charset_id = -1;
3763
3764   CODING_GET_INFO (coding, attrs, charset_list);
3765   eol_type = CODING_ID_EOL_TYPE (coding->id);
3766   if (VECTORP (eol_type))
3767     eol_type = Qunix;
3768
3769   setup_iso_safe_charsets (attrs);
3770   /* Charset list may have been changed.  */
3771   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3772   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3773
3774   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3775
3776   while (charbuf < charbuf_end)
3777     {
3778       ASSURE_DESTINATION (safe_room);
3779
3780       if (bol_designation)
3781         {
3782           unsigned char *dst_prev = dst;
3783
3784           /* We have to produce designation sequences if any now.  */
3785           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3786           bol_designation = 0;
3787           /* We are sure that designation sequences are all ASCII bytes.  */
3788           produced_chars += dst - dst_prev;
3789         }
3790
3791       c = *charbuf++;
3792
3793       if (c < 0)
3794         {
3795           /* Handle an annotation.  */
3796           switch (*charbuf)
3797             {
3798             case CODING_ANNOTATE_COMPOSITION_MASK:
3799               /* Not yet implemented.  */
3800               break;
3801             case CODING_ANNOTATE_CHARSET_MASK:
3802               preferred_charset_id = charbuf[2];
3803               if (preferred_charset_id >= 0
3804                   && NILP (Fmemq (make_number (preferred_charset_id),
3805                                   charset_list)))
3806                 preferred_charset_id = -1;
3807               break;
3808             default:
3809               abort ();
3810             }
3811           charbuf += -c - 1;
3812           continue;
3813         }
3814
3815       /* Now encode the character C.  */
3816       if (c < 0x20 || c == 0x7F)
3817         {
3818           if (c == '\n'
3819               || (c == '\r' && EQ (eol_type, Qmac)))
3820             {
3821               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3822                 ENCODE_RESET_PLANE_AND_REGISTER ();
3823               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3824                 {
3825                   int i;
3826
3827                   for (i = 0; i < 4; i++)
3828                     CODING_ISO_DESIGNATION (coding, i)
3829                       = CODING_ISO_INITIAL (coding, i);
3830                 }
3831               bol_designation
3832                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3833             }
3834           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3835             ENCODE_RESET_PLANE_AND_REGISTER ();
3836           EMIT_ONE_ASCII_BYTE (c);
3837         }
3838       else if (ASCII_CHAR_P (c))
3839         {
3840           if (ascii_compatible)
3841             EMIT_ONE_ASCII_BYTE (c);
3842           else
3843             {
3844               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3845               ENCODE_ISO_CHARACTER (charset, c);
3846             }
3847         }
3848       else if (CHAR_BYTE8_P (c))
3849         {
3850           c = CHAR_TO_BYTE8 (c);
3851           EMIT_ONE_BYTE (c);
3852         }
3853       else
3854         {
3855           struct charset *charset;
3856
3857           if (preferred_charset_id >= 0)
3858             {
3859               charset = CHARSET_FROM_ID (preferred_charset_id);
3860               if (! CHAR_CHARSET_P (c, charset))
3861                 charset = char_charset (c, charset_list, NULL);
3862             }
3863           else
3864             charset = char_charset (c, charset_list, NULL);
3865           if (!charset)
3866             {
3867               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3868                 {
3869                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3870                   charset = CHARSET_FROM_ID (charset_ascii);
3871                 }
3872               else
3873                 {
3874                   c = coding->default_char;
3875                   charset = char_charset (c, charset_list, NULL);
3876                 }
3877             }
3878           ENCODE_ISO_CHARACTER (charset, c);
3879         }
3880     }
3881
3882   if (coding->mode & CODING_MODE_LAST_BLOCK
3883       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3884     {
3885       ASSURE_DESTINATION (safe_room);
3886       ENCODE_RESET_PLANE_AND_REGISTER ();
3887     }
3888   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3889   CODING_ISO_BOL (coding) = bol_designation;
3890   coding->produced_char += produced_chars;
3891   coding->produced = dst - coding->destination;
3892   return 0;
3893 }
3894
3895 \f
/*** 8. SJIS and BIG5 handlers ***/
3897
3898 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3899    quite widely.  So, for the moment, Emacs supports them in the bare
3900    C code.  But, in the future, they may be supported only by CCL.  */
3901
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
3908
3909    --- CODE RANGE of SJIS ---
3910    (character set)      (range)
3911    ASCII                0x00 .. 0x7F
3912    KATAKANA-JISX0201    0xA0 .. 0xDF
3913    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3914             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3915    -------------------------------
3916
3917 */
3918
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
3922
3923    --- CODE RANGE of BIG5 ---
3924    (character set)      (range)
3925    ASCII                0x00 .. 0x7F
3926    Big5 (1st byte)      0xA1 .. 0xFE
3927         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3928    --------------------------
3929
3930   */
3931
3932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3933    Check if a text is encoded in SJIS.  If it is, return
3934    CATEGORY_MASK_SJIS, else return 0.  */
3935
3936 static int
3937 detect_coding_sjis (coding, detect_info)
3938      struct coding_system *coding;
3939      struct coding_detection_info *detect_info;
3940 {
3941   const unsigned char *src = coding->source, *src_base;
3942   const unsigned char *src_end = coding->source + coding->src_bytes;
3943   int multibytep = coding->src_multibyte;
3944   int consumed_chars = 0;
3945   int found = 0;
3946   int c;
3947
3948   detect_info->checked |= CATEGORY_MASK_SJIS;
3949   /* A coding system of this category is always ASCII compatible.  */
3950   src += coding->head_ascii;
3951
3952   while (1)
3953     {
3954       src_base = src;
3955       ONE_MORE_BYTE (c);
3956       if (c < 0x80)
3957         continue;
3958       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3959         {
3960           ONE_MORE_BYTE (c);
3961           if (c < 0x40 || c == 0x7F || c > 0xFC)
3962             break;
3963           found = CATEGORY_MASK_SJIS;
3964         }
3965       else if (c >= 0xA0 && c < 0xE0)
3966         found = CATEGORY_MASK_SJIS;
3967       else
3968         break;
3969     }
3970   detect_info->rejected |= CATEGORY_MASK_SJIS;
3971   return 0;
3972
3973  no_more_source:
3974   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3975     {
3976       detect_info->rejected |= CATEGORY_MASK_SJIS;
3977       return 0;
3978     }
3979   detect_info->found |= found;
3980   return 1;
3981 }
3982
3983 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3984    Check if a text is encoded in BIG5.  If it is, return
3985    CATEGORY_MASK_BIG5, else return 0.  */
3986
3987 static int
3988 detect_coding_big5 (coding, detect_info)
3989      struct coding_system *coding;
3990      struct coding_detection_info *detect_info;
3991 {
3992   const unsigned char *src = coding->source, *src_base;
3993   const unsigned char *src_end = coding->source + coding->src_bytes;
3994   int multibytep = coding->src_multibyte;
3995   int consumed_chars = 0;
3996   int found = 0;
3997   int c;
3998
3999   detect_info->checked |= CATEGORY_MASK_BIG5;
4000   /* A coding system of this category is always ASCII compatible.  */
4001   src += coding->head_ascii;
4002
4003   while (1)
4004     {
4005       src_base = src;
4006       ONE_MORE_BYTE (c);
4007       if (c < 0x80)
4008         continue;
4009       if (c >= 0xA1)
4010         {
4011           ONE_MORE_BYTE (c);
4012           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4013             return 0;
4014           found = CATEGORY_MASK_BIG5;
4015         }
4016       else
4017         break;
4018     }
4019   detect_info->rejected |= CATEGORY_MASK_BIG5;
4020   return 0;
4021
4022  no_more_source:
4023   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4024     {
4025       detect_info->rejected |= CATEGORY_MASK_BIG5;
4026       return 0;
4027     }
4028   detect_info->found |= found;
4029   return 1;
4030 }
4031
4032 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4033    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4034
4035 static void
4036 decode_coding_sjis (coding)
4037      struct coding_system *coding;
4038 {
4039   const unsigned char *src = coding->source + coding->consumed;
4040   const unsigned char *src_end = coding->source + coding->src_bytes;
4041   const unsigned char *src_base;
4042   int *charbuf = coding->charbuf + coding->charbuf_used;
4043   int *charbuf_end
4044     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4045   int consumed_chars = 0, consumed_chars_base;
4046   int multibytep = coding->src_multibyte;
4047   struct charset *charset_roman, *charset_kanji, *charset_kana;
4048   struct charset *charset_kanji2;
4049   Lisp_Object attrs, charset_list, val;
4050   int char_offset = coding->produced_char;
4051   int last_offset = char_offset;
4052   int last_id = charset_ascii;
4053
4054   CODING_GET_INFO (coding, attrs, charset_list);
4055
4056   val = charset_list;
4057   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4058   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4059   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4060   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4061
4062   while (1)
4063     {
4064       int c, c1;
4065       struct charset *charset;
4066
4067       src_base = src;
4068       consumed_chars_base = consumed_chars;
4069
4070       if (charbuf >= charbuf_end)
4071         break;
4072
4073       ONE_MORE_BYTE (c);
4074       if (c < 0)
4075         goto invalid_code;
4076       if (c < 0x80)
4077         charset = charset_roman;
4078       else if (c == 0x80 || c == 0xA0)
4079         goto invalid_code;
4080       else if (c >= 0xA1 && c <= 0xDF)
4081         {
4082           /* SJIS -> JISX0201-Kana */
4083           c &= 0x7F;
4084           charset = charset_kana;
4085         }
4086       else if (c <= 0xEF)
4087         {
4088           /* SJIS -> JISX0208 */
4089           ONE_MORE_BYTE (c1);
4090           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4091             goto invalid_code;
4092           c = (c << 8) | c1;
4093           SJIS_TO_JIS (c);
4094           charset = charset_kanji;
4095         }
4096       else if (c <= 0xFC && charset_kanji2)
4097         {
4098           /* SJIS -> JISX0213-2 */
4099           ONE_MORE_BYTE (c1);
4100           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4101             goto invalid_code;
4102           c = (c << 8) | c1;
4103           SJIS_TO_JIS2 (c);
4104           charset = charset_kanji2;
4105         }
4106       else
4107         goto invalid_code;
4108       if (charset->id != charset_ascii
4109           && last_id != charset->id)
4110         {
4111           if (last_id != charset_ascii)
4112             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4113           last_id = charset->id;
4114           last_offset = char_offset;
4115         }
4116       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4117       *charbuf++ = c;
4118       char_offset++;
4119       continue;
4120
4121     invalid_code:
4122       src = src_base;
4123       consumed_chars = consumed_chars_base;
4124       ONE_MORE_BYTE (c);
4125       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4126       char_offset++;
4127       coding->errors++;
4128     }
4129
4130  no_more_source:
4131   if (last_id != charset_ascii)
4132     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4133   coding->consumed_char += consumed_chars_base;
4134   coding->consumed = src_base - coding->source;
4135   coding->charbuf_used = charbuf - coding->charbuf;
4136 }
4137
4138 static void
4139 decode_coding_big5 (coding)
4140      struct coding_system *coding;
4141 {
4142   const unsigned char *src = coding->source + coding->consumed;
4143   const unsigned char *src_end = coding->source + coding->src_bytes;
4144   const unsigned char *src_base;
4145   int *charbuf = coding->charbuf + coding->charbuf_used;
4146   int *charbuf_end
4147     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4148   int consumed_chars = 0, consumed_chars_base;
4149   int multibytep = coding->src_multibyte;
4150   struct charset *charset_roman, *charset_big5;
4151   Lisp_Object attrs, charset_list, val;
4152   int char_offset = coding->produced_char;
4153   int last_offset = char_offset;
4154   int last_id = charset_ascii;
4155
4156   CODING_GET_INFO (coding, attrs, charset_list);
4157   val = charset_list;
4158   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4159   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4160
4161   while (1)
4162     {
4163       int c, c1;
4164       struct charset *charset;
4165
4166       src_base = src;
4167       consumed_chars_base = consumed_chars;
4168
4169       if (charbuf >= charbuf_end)
4170         break;
4171
4172       ONE_MORE_BYTE (c);
4173
4174       if (c < 0)
4175         goto invalid_code;
4176       if (c < 0x80)
4177         charset = charset_roman;
4178       else
4179         {
4180           /* BIG5 -> Big5 */
4181           if (c < 0xA1 || c > 0xFE)
4182             goto invalid_code;
4183           ONE_MORE_BYTE (c1);
4184           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4185             goto invalid_code;
4186           c = c << 8 | c1;
4187           charset = charset_big5;
4188         }
4189       if (charset->id != charset_ascii
4190           && last_id != charset->id)
4191         {
4192           if (last_id != charset_ascii)
4193             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4194           last_id = charset->id;
4195           last_offset = char_offset;
4196         }
4197       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4198       *charbuf++ = c;
4199       char_offset++;
4200       continue;
4201
4202     invalid_code:
4203       src = src_base;
4204       consumed_chars = consumed_chars_base;
4205       ONE_MORE_BYTE (c);
4206       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4207       char_offset++;
4208       coding->errors++;
4209     }
4210
4211  no_more_source:
4212   if (last_id != charset_ascii)
4213     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4214   coding->consumed_char += consumed_chars_base;
4215   coding->consumed = src_base - coding->source;
4216   coding->charbuf_used = charbuf - coding->charbuf;
4217 }
4218
4219 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4220    This function can encode charsets `ascii', `katakana-jisx0201',
4221    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4222    are sure that all these charsets are registered as official charset
4223    (i.e. do not have extended leading-codes).  Characters of other
4224    charsets are produced without any encoding.  If SJIS_P is 1, encode
4225    SJIS text, else encode BIG5 text.  */
4226
4227 static int
4228 encode_coding_sjis (coding)
4229      struct coding_system *coding;
4230 {
4231   int multibytep = coding->dst_multibyte;
4232   int *charbuf = coding->charbuf;
4233   int *charbuf_end = charbuf + coding->charbuf_used;
4234   unsigned char *dst = coding->destination + coding->produced;
4235   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4236   int safe_room = 4;
4237   int produced_chars = 0;
4238   Lisp_Object attrs, charset_list, val;
4239   int ascii_compatible;
4240   struct charset *charset_roman, *charset_kanji, *charset_kana;
4241   struct charset *charset_kanji2;
4242   int c;
4243
4244   CODING_GET_INFO (coding, attrs, charset_list);
4245   val = charset_list;
4246   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4247   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4248   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4249   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4250
4251   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4252
4253   while (charbuf < charbuf_end)
4254     {
4255       ASSURE_DESTINATION (safe_room);
4256       c = *charbuf++;
4257       /* Now encode the character C.  */
4258       if (ASCII_CHAR_P (c) && ascii_compatible)
4259         EMIT_ONE_ASCII_BYTE (c);
4260       else if (CHAR_BYTE8_P (c))
4261         {
4262           c = CHAR_TO_BYTE8 (c);
4263           EMIT_ONE_BYTE (c);
4264         }
4265       else
4266         {
4267           unsigned code;
4268           struct charset *charset = char_charset (c, charset_list, &code);
4269
4270           if (!charset)
4271             {
4272               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4273                 {
4274                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4275                   charset = CHARSET_FROM_ID (charset_ascii);
4276                 }
4277               else
4278                 {
4279                   c = coding->default_char;
4280                   charset = char_charset (c, charset_list, &code);
4281                 }
4282             }
4283           if (code == CHARSET_INVALID_CODE (charset))
4284             abort ();
4285           if (charset == charset_kanji)
4286             {
4287               int c1, c2;
4288               JIS_TO_SJIS (code);
4289               c1 = code >> 8, c2 = code & 0xFF;
4290               EMIT_TWO_BYTES (c1, c2);
4291             }
4292           else if (charset == charset_kana)
4293             EMIT_ONE_BYTE (code | 0x80);
4294           else if (charset_kanji2 && charset == charset_kanji2)
4295             {
4296               int c1, c2;
4297
4298               c1 = code >> 8;
4299               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4300                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4301                 {
4302                   JIS_TO_SJIS2 (code);
4303                   c1 = code >> 8, c2 = code & 0xFF;
4304                   EMIT_TWO_BYTES (c1, c2);
4305                 }
4306               else
4307                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4308             }
4309           else
4310             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4311         }
4312     }
4313   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4314   coding->produced_char += produced_chars;
4315   coding->produced = dst - coding->destination;
4316   return 0;
4317 }
4318
4319 static int
4320 encode_coding_big5 (coding)
4321      struct coding_system *coding;
4322 {
4323   int multibytep = coding->dst_multibyte;
4324   int *charbuf = coding->charbuf;
4325   int *charbuf_end = charbuf + coding->charbuf_used;
4326   unsigned char *dst = coding->destination + coding->produced;
4327   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4328   int safe_room = 4;
4329   int produced_chars = 0;
4330   Lisp_Object attrs, charset_list, val;
4331   int ascii_compatible;
4332   struct charset *charset_roman, *charset_big5;
4333   int c;
4334
4335   CODING_GET_INFO (coding, attrs, charset_list);
4336   val = charset_list;
4337   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4338   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4339   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4340
4341   while (charbuf < charbuf_end)
4342     {
4343       ASSURE_DESTINATION (safe_room);
4344       c = *charbuf++;
4345       /* Now encode the character C.  */
4346       if (ASCII_CHAR_P (c) && ascii_compatible)
4347         EMIT_ONE_ASCII_BYTE (c);
4348       else if (CHAR_BYTE8_P (c))
4349         {
4350           c = CHAR_TO_BYTE8 (c);
4351           EMIT_ONE_BYTE (c);
4352         }
4353       else
4354         {
4355           unsigned code;
4356           struct charset *charset = char_charset (c, charset_list, &code);
4357
4358           if (! charset)
4359             {
4360               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4361                 {
4362                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4363                   charset = CHARSET_FROM_ID (charset_ascii);
4364                 }
4365               else
4366                 {
4367                   c = coding->default_char;
4368                   charset = char_charset (c, charset_list, &code);
4369                 }
4370             }
4371           if (code == CHARSET_INVALID_CODE (charset))
4372             abort ();
4373           if (charset == charset_big5)
4374             {
4375               int c1, c2;
4376
4377               c1 = code >> 8, c2 = code & 0xFF;
4378               EMIT_TWO_BYTES (c1, c2);
4379             }
4380           else
4381             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4382         }
4383     }
4384   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4385   coding->produced_char += produced_chars;
4386   coding->produced = dst - coding->destination;
4387   return 0;
4388 }
4389
4390 \f
/*** 9. CCL handlers ***/
4392
4393 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4394    Check if a text is encoded in a coding system of which
4395    encoder/decoder are written in CCL program.  If it is, return
4396    CATEGORY_MASK_CCL, else return 0.  */
4397
4398 static int
4399 detect_coding_ccl (coding, detect_info)
4400      struct coding_system *coding;
4401      struct coding_detection_info *detect_info;
4402 {
4403   const unsigned char *src = coding->source, *src_base;
4404   const unsigned char *src_end = coding->source + coding->src_bytes;
4405   int multibytep = coding->src_multibyte;
4406   int consumed_chars = 0;
4407   int found = 0;
4408   unsigned char *valids;
4409   int head_ascii = coding->head_ascii;
4410   Lisp_Object attrs;
4411
4412   detect_info->checked |= CATEGORY_MASK_CCL;
4413
4414   coding = &coding_categories[coding_category_ccl];
4415   valids = CODING_CCL_VALIDS (coding);
4416   attrs = CODING_ID_ATTRS (coding->id);
4417   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4418     src += head_ascii;
4419
4420   while (1)
4421     {
4422       int c;
4423
4424       src_base = src;
4425       ONE_MORE_BYTE (c);
4426       if (c < 0 || ! valids[c])
4427         break;
4428       if ((valids[c] > 1))
4429         found = CATEGORY_MASK_CCL;
4430     }
4431   detect_info->rejected |= CATEGORY_MASK_CCL;
4432   return 0;
4433
4434  no_more_source:
4435   detect_info->found |= found;
4436   return 1;
4437 }
4438
4439 static void
4440 decode_coding_ccl (coding)
4441      struct coding_system *coding;
4442 {
4443   const unsigned char *src = coding->source + coding->consumed;
4444   const unsigned char *src_end = coding->source + coding->src_bytes;
4445   int *charbuf = coding->charbuf + coding->charbuf_used;
4446   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4447   int consumed_chars = 0;
4448   int multibytep = coding->src_multibyte;
4449   struct ccl_program ccl;
4450   int source_charbuf[1024];
4451   int source_byteidx[1024];
4452   Lisp_Object attrs, charset_list;
4453
4454   CODING_GET_INFO (coding, attrs, charset_list);
4455   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4456
4457   while (src < src_end)
4458     {
4459       const unsigned char *p = src;
4460       int *source, *source_end;
4461       int i = 0;
4462
4463       if (multibytep)
4464         while (i < 1024 && p < src_end)
4465           {
4466             source_byteidx[i] = p - src;
4467             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4468           }
4469       else
4470         while (i < 1024 && p < src_end)
4471           source_charbuf[i++] = *p++;
4472
4473       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4474         ccl.last_block = 1;
4475
4476       source = source_charbuf;
4477       source_end = source + i;
4478       while (source < source_end)
4479         {
4480           ccl_driver (&ccl, source, charbuf,
4481                       source_end - source, charbuf_end - charbuf,
4482                       charset_list);
4483           source += ccl.consumed;
4484           charbuf += ccl.produced;
4485           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4486             break;
4487         }
4488       if (source < source_end)
4489         src += source_byteidx[source - source_charbuf];
4490       else
4491         src = p;
4492       consumed_chars += source - source_charbuf;
4493
4494       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4495           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4496         break;
4497     }
4498
4499   switch (ccl.status)
4500     {
4501     case CCL_STAT_SUSPEND_BY_SRC:
4502       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4503       break;
4504     case CCL_STAT_SUSPEND_BY_DST:
4505       break;
4506     case CCL_STAT_QUIT:
4507     case CCL_STAT_INVALID_CMD:
4508       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4509       break;
4510     default:
4511       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4512       break;
4513     }
4514   coding->consumed_char += consumed_chars;
4515   coding->consumed = src - coding->source;
4516   coding->charbuf_used = charbuf - coding->charbuf;
4517 }
4518
/* Encode the characters in CODING->charbuf with the CCL encoder
   program of CODING and append the resulting bytes to
   CODING->destination (starting at CODING->produced).  The status
   left in the CCL program is translated into a conversion result
   recorded on CODING, and CODING->produced / CODING->produced_char
   are updated.  Always return 0.  */
4519 static int
4520 encode_coding_ccl (coding)
4521      struct coding_system *coding;
4522 {
4523   struct ccl_program ccl;
4524   int multibytep = coding->dst_multibyte;
4525   int *charbuf = coding->charbuf;
4526   int *charbuf_end = charbuf + coding->charbuf_used;
4527   unsigned char *dst = coding->destination + coding->produced;
4528   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4529   unsigned char *adjusted_dst_end = dst_end - 1;
       /* Scratch output of one ccl_driver run; only the low 8 bits of
          each element are copied to the destination below.  */
4530   int destination_charbuf[1024];
4531   int i, produced_chars = 0;
4532   Lisp_Object attrs, charset_list;
4533
4534   CODING_GET_INFO (coding, attrs, charset_list);
4535   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4536
4537   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4538   ccl.dst_multibyte = coding->dst_multibyte;
4539
       /* Keep running the program while input remains and at least two
          bytes of destination are left (DST < DST_END - 1).  */
4540   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4541     {
           /* Never ask for more output than destination_charbuf holds.  */
4542       int dst_bytes = dst_end - dst;
4543       if (dst_bytes > 1024)
4544         dst_bytes = 1024;
4545
4546       ccl_driver (&ccl, charbuf, destination_charbuf,
4547                   charbuf_end - charbuf, dst_bytes, charset_list);
4548       charbuf += ccl.consumed;
           /* EMIT_ONE_BYTE is a macro defined elsewhere in this file;
              presumably it stores through DST and maintains
              PRODUCED_CHARS itself -- confirm at its definition.
              NOTE(review): if it can write more than one byte per call
              for a multibyte destination, DST_BYTES above may be too
              optimistic -- verify.  */
4549       if (multibytep)
4550         for (i = 0; i < ccl.produced; i++)
4551           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4552       else
4553         {
4554           for (i = 0; i < ccl.produced; i++)
4555             *dst++ = destination_charbuf[i] & 0xFF;
4556           produced_chars += ccl.produced;
4557         }
4558     }
4559
       /* Map the final CCL status to a conversion result.
          NOTE(review): if the loop above never ran, ccl.status is
          whatever setup_ccl_program left there -- confirm that this
          is intended.  */
4560   switch (ccl.status)
4561     {
4562     case CCL_STAT_SUSPEND_BY_SRC:
4563       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4564       break;
4565     case CCL_STAT_SUSPEND_BY_DST:
4566       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4567       break;
4568     case CCL_STAT_QUIT:
4569     case CCL_STAT_INVALID_CMD:
4570       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4571       break;
4572     default:
4573       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4574       break;
4575     }
4576
4577   coding->produced_char += produced_chars;
4578   coding->produced = dst - coding->destination;
4579   return 0;
4580 }
4581
4582
4583 \f
4584 /*** 10, 11. no-conversion handlers ***/
4585
4586 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4587
4588 static void
4589 decode_coding_raw_text (coding)
4590      struct coding_system *coding;
4591 {
4592   coding->chars_at_source = 1;
4593   coding->consumed_char = 0;
4594   coding->consumed = 0;
4595   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4596 }
4597
4598 static int
4599 encode_coding_raw_text (coding)
4600      struct coding_system *coding;
4601 {
4602   int multibytep = coding->dst_multibyte;
4603   int *charbuf = coding->charbuf;
4604   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4605   unsigned char *dst = coding->destination + coding->produced;
4606   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4607   int produced_chars = 0;
4608   int c;
4609
4610   if (multibytep)
4611     {
4612       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4613
4614       if (coding->src_multibyte)
4615         while (charbuf < charbuf_end)
4616           {
4617             ASSURE_DESTINATION (safe_room);
4618             c = *charbuf++;
4619             if (ASCII_CHAR_P (c))
4620               EMIT_ONE_ASCII_BYTE (c);
4621             else if (CHAR_BYTE8_P (c))
4622               {
4623                 c = CHAR_TO_BYTE8 (c);
4624                 EMIT_ONE_BYTE (c);
4625               }
4626             else
4627               {
4628                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4629
4630                 CHAR_STRING_ADVANCE (c, p1);
4631                 while (p0 < p1)
4632                   {
4633                     EMIT_ONE_BYTE (*p0);
4634                     p0++;
4635                   }
4636               }
4637           }
4638       else
4639         while (charbuf < charbuf_end)
4640           {
4641             ASSURE_DESTINATION (safe_room);
4642             c = *charbuf++;
4643             EMIT_ONE_BYTE (c);
4644           }
4645     }
4646   else
4647     {
4648       if (coding->src_multibyte)
4649         {
4650           int safe_room = MAX_MULTIBYTE_LENGTH;
4651
4652           while (charbuf < charbuf_end)
4653             {
4654               ASSURE_DESTINATION (safe_room);
4655               c = *charbuf++;
4656               if (ASCII_CHAR_P (c))
4657                 *dst++ = c;
4658               else if (CHAR_BYTE8_P (c))
4659                 *dst++ = CHAR_TO_BYTE8 (c);
4660               else
4661                 CHAR_STRING_ADVANCE (c, dst);
4662               produced_chars++;
4663             }
4664         }
4665       else
4666         {
4667           ASSURE_DESTINATION (charbuf_end - charbuf);
4668           while (charbuf < charbuf_end && dst < dst_end)
4669             *dst++ = *charbuf++;
4670           produced_chars = dst - (coding->destination + coding->dst_bytes);
4671         }
4672     }
4673   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4674   coding->produced_char += produced_chars;
4675   coding->produced = dst - coding->destination;
4676   return 0;
4677 }
4678
4679 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4680    Check if a text is encoded in a charset-based coding system.  If it
4681    is, return 1, else return 0.  */
4682
4683 static int
4684 detect_coding_charset (coding, detect_info)
4685      struct coding_system *coding;
4686      struct coding_detection_info *detect_info;
4687 {
4688   const unsigned char *src = coding->source, *src_base;
4689   const unsigned char *src_end = coding->source + coding->src_bytes;
4690   int multibytep = coding->src_multibyte;
4691   int consumed_chars = 0;
4692   Lisp_Object attrs, valids;
4693   int found = 0;
4694
4695   detect_info->checked |= CATEGORY_MASK_CHARSET;
4696
4697   coding = &coding_categories[coding_category_charset];
4698   attrs = CODING_ID_ATTRS (coding->id);
4699   valids = AREF (attrs, coding_attr_charset_valids);
4700
4701   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4702     src += coding->head_ascii;
4703
4704   while (1)
4705     {
4706       int c;
4707
4708       src_base = src;
4709       ONE_MORE_BYTE (c);
4710       if (c < 0)
4711         continue;
4712       if (NILP (AREF (valids, c)))
4713         break;
4714       if (c >= 0x80)
4715         found = CATEGORY_MASK_CHARSET;
4716     }
4717   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4718   return 0;
4719
4720  no_more_source:
4721   detect_info->found |= found;
4722   return 1;
4723 }
4724
/* Decode the bytes of CODING->source, starting at CODING->consumed,
   as text of a charset-based coding system and append the decoded
   characters to CODING->charbuf.  The first byte of each code selects,
   through the `valids' table of the coding system, the charset (or
   list of candidate charsets) used to decode it.  A byte sequence
   that cannot be decoded is passed through one byte at a time and
   counted in CODING->errors.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.

   ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA are macros
   defined elsewhere in this file; ONE_MORE_BYTE presumably jumps to
   `no_more_source' when the source is exhausted.  */
4725 static void
4726 decode_coding_charset (coding)
4727      struct coding_system *coding;
4728 {
4729   const unsigned char *src = coding->source + coding->consumed;
4730   const unsigned char *src_end = coding->source + coding->src_bytes;
4731   const unsigned char *src_base;
4732   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Stop MAX_ANNOTATION_LENGTH elements short of the real end of
          charbuf; presumably this keeps room for the data added by
          ADD_CHARSET_DATA -- confirm at its definition.  */
4733   int *charbuf_end
4734     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4735   int consumed_chars = 0, consumed_chars_base;
4736   int multibytep = coding->src_multibyte;
4737   Lisp_Object attrs, charset_list, valids;
       /* LAST_ID is the charset of the current run of non-ASCII
          characters and LAST_OFFSET the character offset at which that
          run started.  */
4738   int char_offset = coding->produced_char;
4739   int last_offset = char_offset;
4740   int last_id = charset_ascii;
4741
4742   CODING_GET_INFO (coding, attrs, charset_list);
       /* VALIDS is indexed by the first byte of a code.  An element is
          nil (no code starts with that byte), a charset ID, or a list
          of charset IDs.  */
4743   valids = AREF (attrs, coding_attr_charset_valids);
4744
4745   while (1)
4746     {
4747       int c;
4748       Lisp_Object val;
4749       struct charset *charset;
4750       int dim;
4751       int len = 1;
4752       unsigned code;
4753
           /* Remember where this code starts so that it can be re-read
              (invalid_code) or left unconsumed (no_more_source).  */
4754       src_base = src;
4755       consumed_chars_base = consumed_chars;
4756
4757       if (charbuf >= charbuf_end)
4758         break;
4759
4760       ONE_MORE_BYTE (c);
4761       if (c < 0)
4762         goto invalid_code;
4763       code = c;
4764
4765       val = AREF (valids, c);
4766       if (NILP (val))
4767         goto invalid_code;
4768       if (INTEGERP (val))
4769         {
               /* A single candidate charset: read the remaining bytes
                  of the code, most significant first, and decode.  */
4770           charset = CHARSET_FROM_ID (XFASTINT (val));
4771           dim = CHARSET_DIMENSION (charset);
4772           while (len < dim)
4773             {
4774               ONE_MORE_BYTE (c);
4775               code = (code << 8) | c;
4776               len++;
4777             }
4778           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4779                               charset, code, c);
4780         }
4781       else
4782         {
4783           /* VAL is a list of charset IDs.  It is assured that the
4784              list is sorted by charset dimensions (smaller one
4785              comes first).  */
               /* Try the candidates in order; because of that sorting
                  CODE only ever needs to be extended with more bytes.
                  Stop at the first charset that yields a character.  */
4786           while (CONSP (val))
4787             {
4788               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4789               dim = CHARSET_DIMENSION (charset);
4790               while (len < dim)
4791                 {
4792                   ONE_MORE_BYTE (c);
4793                   code = (code << 8) | c;
4794                   len++;
4795                 }
4796               CODING_DECODE_CHAR (coding, src, src_base,
4797                                   src_end, charset, code, c);
4798               if (c >= 0)
4799                 break;
4800               val = XCDR (val);
4801             }
4802         }
4803       if (c < 0)
4804         goto invalid_code;
           /* When a non-ASCII charset different from the current one
              starts, record the run that just ended (unless it was
              ASCII) and begin a new one.  */
4805       if (charset->id != charset_ascii
4806           && last_id != charset->id)
4807         {
4808           if (last_id != charset_ascii)
4809             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4810           last_id = charset->id;
4811           last_offset = char_offset;
4812         }
4813
4814       *charbuf++ = c;
4815       char_offset++;
4816       continue;
4817
4818     invalid_code:
           /* Go back to the start of the offending code and pass a
              single byte through: a negative value from ONE_MORE_BYTE
              is stored negated, an ASCII byte as itself, and any other
              byte as BYTE8_TO_CHAR (c).  */
4819       src = src_base;
4820       consumed_chars = consumed_chars_base;
4821       ONE_MORE_BYTE (c);
4822       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4823       char_offset++;
4824       coding->errors++;
4825     }
4826
4827  no_more_source:
       /* Record the last pending non-ASCII run.  Only the input up to
          the start of the code being processed when the loop ended
          (SRC_BASE / CONSUMED_CHARS_BASE) is reported as consumed.  */
4828   if (last_id != charset_ascii)
4829     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4830   coding->consumed_char += consumed_chars_base;
4831   coding->consumed = src_base - coding->source;
4832   coding->charbuf_used = charbuf - coding->charbuf;
4833 }
4834
/* Encode the characters in CODING->charbuf as text of a charset-based
   coding system and append the bytes to CODING->destination, starting
   at CODING->produced.  A character that none of the charsets of the
   coding system can represent is replaced by one substitute byte.
   On return CODING->produced and CODING->produced_char are updated
   and success is recorded.  Always return 0.

   ASSURE_DESTINATION and the EMIT_* macros are defined elsewhere in
   this file and refer to locals of this function by name.  */
4835 static int
4836 encode_coding_charset (coding)
4837      struct coding_system *coding;
4838 {
4839   int multibytep = coding->dst_multibyte;
4840   int *charbuf = coding->charbuf;
4841   int *charbuf_end = charbuf + coding->charbuf_used;
4842   unsigned char *dst = coding->destination + coding->produced;
4843   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4844   int safe_room = MAX_MULTIBYTE_LENGTH;
4845   int produced_chars = 0;
4846   Lisp_Object attrs, charset_list;
4847   int ascii_compatible;
4848   int c;
4849
4850   CODING_GET_INFO (coding, attrs, charset_list);
4851   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4852
4853   while (charbuf < charbuf_end)
4854     {
4855       struct charset *charset;
4856       unsigned code;
4857
           /* Presumably makes sure SAFE_ROOM bytes are available at
              DST, enlarging the destination if needed -- confirm at
              the macro definition.  */
4858       ASSURE_DESTINATION (safe_room);
4859       c = *charbuf++;
           /* ASCII is written as is only when the coding system is
              ASCII compatible; otherwise it goes through the charset
              lookup below like any other character.  */
4860       if (ascii_compatible && ASCII_CHAR_P (c))
4861         EMIT_ONE_ASCII_BYTE (c);
4862       else if (CHAR_BYTE8_P (c))
4863         {
4864           c = CHAR_TO_BYTE8 (c);
4865           EMIT_ONE_BYTE (c);
4866         }
4867       else
4868         {
               /* char_charset is defined elsewhere; a non-NULL result
                  is used as the charset that encodes C, with CODE set
                  to its code point.  */
4869           charset = char_charset (c, charset_list, &code);
4870           if (charset)
4871             {
                   /* Emit CODE most significant byte first, one byte
                      per dimension of the charset.  */
4872               if (CHARSET_DIMENSION (charset) == 1)
4873                 EMIT_ONE_BYTE (code);
4874               else if (CHARSET_DIMENSION (charset) == 2)
4875                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4876               else if (CHARSET_DIMENSION (charset) == 3)
4877                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4878               else
4879                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4880                                  (code >> 8) & 0xFF, code & 0xFF);
4881             }
4882           else
4883             {
                   /* No charset can encode C: emit a substitute, which
                      depends on whether safe encoding is requested.  */
4884               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4885                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4886               else
4887                 c = coding->default_char;
4888               EMIT_ONE_BYTE (c);
4889             }
4890         }
4891     }
4892
4893   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4894   coding->produced_char += produced_chars;
4895   coding->produced = dst - coding->destination;
4896   return 0;
4897 }
4898
4899 \f
4900 /*** 7. C library functions ***/
4901
4902 /* Setup coding context CODING from information about CODING_SYSTEM.
4903    If CODING_SYSTEM is nil, `undecided' is assumed.  If
4904    CODING_SYSTEM is invalid, signal an error.  */
4905
/* Initialize CODING for CODING_SYSTEM: store its ID, reset the mode
   and the flags, copy the safe-charsets data and the default
   character from the attributes, and install the detector, decoder
   and encoder functions that match the type of the coding system,
   together with any type-specific state.  A nil CODING_SYSTEM is
   treated as `undecided'.  */
4906 void
4907 setup_coding_system (coding_system, coding)
4908      Lisp_Object coding_system;
4909      struct coding_system *coding;
4910 {
4911   Lisp_Object attrs;
4912   Lisp_Object eol_type;
4913   Lisp_Object coding_type;
4914   Lisp_Object val;
4915
4916   if (NILP (coding_system))
4917     coding_system = Qundecided;
4918
       /* Presumably validates CODING_SYSTEM and stores its ID in
          CODING->id -- the macro is defined elsewhere.  */
4919   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4920
4921   attrs = CODING_ID_ATTRS (coding->id);
4922   eol_type = CODING_ID_EOL_TYPE (coding->id);
4923
4924   coding->mode = 0;
4925   coding->head_ascii = -1;
       /* An eol_type that is a vector means the end-of-line format is
          still undecided, so detection is required.  */
4926   coding->common_flags
4927     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4928   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4929     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4930   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4931     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4932   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4933     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4934
       /* safe_charsets points directly at the bytes of the Lisp string
          kept in the attributes; max_charset_id is its last index.  */
4935   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4936   coding->max_charset_id = SCHARS (val) - 1;
4937   coding->safe_charsets = (char *) SDATA (val);
4938   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4939
       /* Dispatch on the type of the coding system.  */
4940   coding_type = CODING_ATTR_TYPE (attrs);
4941   if (EQ (coding_type, Qundecided))
4942     {
           /* Not decided yet: use the raw-text handlers and require
              detection.  */
4943       coding->detector = NULL;
4944       coding->decoder = decode_coding_raw_text;
4945       coding->encoder = encode_coding_raw_text;
4946       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4947     }
4948   else if (EQ (coding_type, Qiso_2022))
4949     {
4950       int i;
4951       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4952
4953       /* Invoke graphic register 0 to plane 0.  */
4954       CODING_ISO_INVOCATION (coding, 0) = 0;
4955       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4956       CODING_ISO_INVOCATION (coding, 1)
4957         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4958       /* Setup the initial status of designation.  */
4959       for (i = 0; i < 4; i++)
4960         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4961       /* Not single shifting initially.  */
4962       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4963       /* Beginning of buffer should also be regarded as bol. */
4964       CODING_ISO_BOL (coding) = 1;
4965       coding->detector = detect_coding_iso_2022;
4966       coding->decoder = decode_coding_iso_2022;
4967       coding->encoder = encode_coding_iso_2022;
4968       if (flags & CODING_ISO_FLAG_SAFE)
4969         coding->mode |= CODING_MODE_SAFE_ENCODING;
4970       coding->common_flags
4971         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4972             | CODING_REQUIRE_FLUSHING_MASK);
4973       if (flags & CODING_ISO_FLAG_COMPOSITION)
4974         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4975       if (flags & CODING_ISO_FLAG_DESIGNATION)
4976         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4977       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4978         {
               /* setup_iso_safe_charsets is defined elsewhere; the
                  safe-charsets attribute is re-read after the call, so
                  it presumably replaces that attribute.  */
4979           setup_iso_safe_charsets (attrs);
4980           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4981           coding->max_charset_id = SCHARS (val) - 1;
4982           coding->safe_charsets = (char *) SDATA (val);
4983         }
4984       CODING_ISO_FLAGS (coding) = flags;
4985     }
4986   else if (EQ (coding_type, Qcharset))
4987     {
4988       coding->detector = detect_coding_charset;
4989       coding->decoder = decode_coding_charset;
4990       coding->encoder = encode_coding_charset;
4991       coding->common_flags
4992         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4993     }
4994   else if (EQ (coding_type, Qutf_8))
4995     {
4996       coding->detector = detect_coding_utf_8;
4997       coding->decoder = decode_coding_utf_8;
4998       coding->encoder = encode_coding_utf_8;
4999       coding->common_flags
5000         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5001     }
5002   else if (EQ (coding_type, Qutf_16))
5003     {
           /* BOM attribute: a cons selects BOM detection, t a BOM
              that is always present, anything else no BOM.  */
5004       val = AREF (attrs, coding_attr_utf_16_bom);
5005       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5006                                     : EQ (val, Qt) ? utf_16_with_bom
5007                                     : utf_16_without_bom);
           /* Any endian attribute other than `big' selects little
              endian.  */
5008       val = AREF (attrs, coding_attr_utf_16_endian);
5009       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5010                                        : utf_16_little_endian);
5011       CODING_UTF_16_SURROGATE (coding) = 0;
5012       coding->detector = detect_coding_utf_16;
5013       coding->decoder = decode_coding_utf_16;
5014       coding->encoder = encode_coding_utf_16;
5015       coding->common_flags
5016         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5017       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5018         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5019     }
5020   else if (EQ (coding_type, Qccl))
5021     {
5022       coding->detector = detect_coding_ccl;
5023       coding->decoder = decode_coding_ccl;
5024       coding->encoder = encode_coding_ccl;
5025       coding->common_flags
5026         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5027             | CODING_REQUIRE_FLUSHING_MASK);
5028     }
5029   else if (EQ (coding_type, Qemacs_mule))
5030     {
5031       coding->detector = detect_coding_emacs_mule;
5032       coding->decoder = decode_coding_emacs_mule;
5033       coding->encoder = encode_coding_emacs_mule;
5034       coding->common_flags
5035         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5036       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5037           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5038         {
               /* Full emacs-mule support with a charset list other
                  than Vemacs_mule_charset_list: build a safe-charsets
                  string covering every charset in that list.  The
                  string is filled with 255 and the byte of each listed
                  charset ID is set to 0.
                  NOTE(review): the new string is referenced only
                  through the raw pointer stored in CODING -- confirm
                  that it is protected from garbage collection.  */
5039           Lisp_Object tail, safe_charsets;
5040           int max_charset_id = 0;
5041
5042           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5043                tail = XCDR (tail))
5044             if (max_charset_id < XFASTINT (XCAR (tail)))
5045               max_charset_id = XFASTINT (XCAR (tail));
5046           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5047                                         make_number (255));
5048           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5049                tail = XCDR (tail))
5050             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5051           coding->max_charset_id = max_charset_id;
5052           coding->safe_charsets = (char *) SDATA (safe_charsets);
5053         }
5054     }
5055   else if (EQ (coding_type, Qshift_jis))
5056     {
5057       coding->detector = detect_coding_sjis;
5058       coding->decoder = decode_coding_sjis;
5059       coding->encoder = encode_coding_sjis;
5060       coding->common_flags
5061         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5062     }
5063   else if (EQ (coding_type, Qbig5))
5064     {
5065       coding->detector = detect_coding_big5;
5066       coding->decoder = decode_coding_big5;
5067       coding->encoder = encode_coding_big5;
5068       coding->common_flags
5069         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5070     }
5071   else                          /* EQ (coding_type, Qraw_text) */
5072     {
5073       coding->detector = NULL;
5074       coding->decoder = decode_coding_raw_text;
5075       coding->encoder = encode_coding_raw_text;
           /* Any eol type other than `unix' requires decoding;
              encoding is required only once the eol type is decided
              (i.e. is not a vector).  */
5076       if (! EQ (eol_type, Qunix))
5077         {
5078           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5079           if (! VECTORP (eol_type))
5080             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5081         }
5082
5083     }
5084
5085   return;
5086 }
5087
5088 /* Return a list of charsets supported by CODING.  */
5089
5090 Lisp_Object
5091 coding_charset_list (coding)
5092      struct coding_system *coding;
5093 {
5094   Lisp_Object attrs, charset_list, coding_type;
5095
5096   CODING_GET_INFO (coding, attrs, charset_list);
5097   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5098     {
5099       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5100
5101       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5102         charset_list = Viso_2022_charset_list;
5103     }
5104   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5105     {
5106       charset_list = Vemacs_mule_charset_list;
5107     }
5108   return charset_list;
5109 }
5110
5111
5112 /* Return raw-text or one of its subsidiaries that has the same
5113    eol_type as CODING-SYSTEM.  */
5114
5115 Lisp_Object
5116 raw_text_coding_system (coding_system)
5117      Lisp_Object coding_system;
5118 {
5119   Lisp_Object spec, attrs;
5120   Lisp_Object eol_type, raw_text_eol_type;
5121
5122   if (NILP (coding_system))
5123     return Qraw_text;
5124   spec = CODING_SYSTEM_SPEC (coding_system);
5125   attrs = AREF (spec, 0);
5126
5127   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5128     return coding_system;
5129
5130   eol_type = AREF (spec, 2);
5131   if (VECTORP (eol_type))
5132     return Qraw_text;
5133   spec = CODING_SYSTEM_SPEC (Qraw_text);
5134   raw_text_eol_type = AREF (spec, 2);
5135   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5136           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5137           : AREF (raw_text_eol_type, 2));
5138 }
5139
5140
5141 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5142    does, return one of the subsidiary that has the same eol-spec as
5143    PARENT.  Otherwise, return CODING_SYSTEM.  */
5144
5145 Lisp_Object
5146 coding_inherit_eol_type (coding_system, parent)
5147      Lisp_Object coding_system, parent;
5148 {
5149   Lisp_Object spec, eol_type;
5150
5151   if (NILP (coding_system))
5152     coding_system = Qraw_text;
5153   spec = CODING_SYSTEM_SPEC (coding_system);
5154   eol_type = AREF (spec, 2);
5155   if (VECTORP (eol_type)
5156       && ! NILP (parent))
5157     {
5158       Lisp_Object parent_spec;
5159       Lisp_Object parent_eol_type;
5160
5161       parent_spec
5162         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5163       parent_eol_type = AREF (parent_spec, 2);
5164       if (EQ (parent_eol_type, Qunix))
5165         coding_system = AREF (eol_type, 0);
5166       else if (EQ (parent_eol_type, Qdos))
5167         coding_system = AREF (eol_type, 1);
5168       else if (EQ (parent_eol_type, Qmac))
5169         coding_system = AREF (eol_type, 2);
5170     }
5171   return coding_system;
5172 }
5173
5174 /* Emacs has a mechanism to automatically detect a coding system if it
5175    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5176    it's impossible to distinguish some coding systems accurately
5177    because they use the same range of codes.  So, at first, coding
5178    systems are categorized into the following categories:
5179
5180    o coding-category-emacs-mule
5181
5182         The category for a coding system which has the same code range
5183         as Emacs' internal format.  Assigned the coding-system (Lisp
5184         symbol) `emacs-mule' by default.
5185
5186    o coding-category-sjis
5187
5188         The category for a coding system which has the same code range
5189         as SJIS.  Assigned the coding-system (Lisp
5190         symbol) `japanese-shift-jis' by default.
5191
5192    o coding-category-iso-7
5193
5194         The category for a coding system which has the same code range
5195         as ISO2022 of 7-bit environment.  This doesn't use any locking
5196         shift and single shift functions.  This can encode/decode all
5197         charsets.  Assigned the coding-system (Lisp symbol)
5198         `iso-2022-7bit' by default.
5199
5200    o coding-category-iso-7-tight
5201
5202         Same as coding-category-iso-7 except that this can
5203         encode/decode only the specified charsets.
5204
5205    o coding-category-iso-8-1
5206
5207         The category for a coding system which has the same code range
5208         as ISO2022 of 8-bit environment and graphic plane 1 used only
5209         for DIMENSION1 charset.  This doesn't use any locking shift
5210         and single shift functions.  Assigned the coding-system (Lisp
5211         symbol) `iso-latin-1' by default.
5212
5213    o coding-category-iso-8-2
5214
5215         The category for a coding system which has the same code range
5216         as ISO2022 of 8-bit environment and graphic plane 1 used only
5217         for DIMENSION2 charset.  This doesn't use any locking shift
5218         and single shift functions.  Assigned the coding-system (Lisp
5219         symbol) `japanese-iso-8bit' by default.
5220
5221    o coding-category-iso-7-else
5222
5223         The category for a coding system which has the same code range
5224         as ISO2022 of 7-bit environment but uses locking shift or
5225         single shift functions.  Assigned the coding-system (Lisp
5226         symbol) `iso-2022-7bit-lock' by default.
5227
5228    o coding-category-iso-8-else
5229
5230         The category for a coding system which has the same code range
5231         as ISO2022 of 8-bit environment but uses locking shift or
5232         single shift functions.  Assigned the coding-system (Lisp
5233         symbol) `iso-2022-8bit-ss2' by default.
5234
5235    o coding-category-big5
5236
5237         The category for a coding system which has the same code range
5238         as BIG5.  Assigned the coding-system (Lisp symbol)
5239         `cn-big5' by default.
5240
5241    o coding-category-utf-8
5242
5243         The category for a coding system which has the same code range
5244         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5245         symbol) `utf-8' by default.
5246
5247    o coding-category-utf-16-be
5248
5249         The category for a coding system in which a text has an
5250         Unicode signature (cf. Unicode Standard) in the order of BIG
5251         endian at the head.  Assigned the coding-system (Lisp symbol)
5252         `utf-16-be' by default.
5253
5254    o coding-category-utf-16-le
5255
5256         The category for a coding system in which a text has an
5257         Unicode signature (cf. Unicode Standard) in the order of
5258         LITTLE endian at the head.  Assigned the coding-system (Lisp
5259         symbol) `utf-16-le' by default.
5260
5261    o coding-category-ccl
5262
5263         The category for a coding system of which encoder/decoder is
5264         written in CCL programs.  The default value is nil, i.e., no
5265         coding system is assigned.
5266
5267    o coding-category-binary
5268
5269         The category for a coding system not categorized in any of the
5270         above.  Assigned the coding-system (Lisp symbol)
5271         `no-conversion' by default.
5272
5273    Each of them is a Lisp symbol and the value is an actual
5274    `coding-system's (this is also a Lisp symbol) assigned by a user.
5275    What Emacs does actually is to detect a category of coding system.
5276    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5277    decide only one possible category, it selects a category of the
5278    highest priority.  Priorities of categories are also specified by a
5279    user in a Lisp variable `coding-category-list'.
5280
5281 */
5282
/* Values returned by detect_eol to tell which end-of-line sequence
   was seen.  The non-zero ones are distinct bits;
   adjust_coding_eol_type tests them with `&'.  */
5283 #define EOL_SEEN_NONE   0
5284 #define EOL_SEEN_LF     1
5285 #define EOL_SEEN_CR     2
5286 #define EOL_SEEN_CRLF   4
5287
5288 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5289    SOURCE is encoded.  If CATEGORY is one of
5290    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5291    two-byte, else they are encoded by one-byte.
5292
5293    Return one of EOL_SEEN_XXX.  */
5294
5295 #define MAX_EOL_CHECK_COUNT 3
5296
5297 static int
5298 detect_eol (source, src_bytes, category)
5299      const unsigned char *source;
5300      EMACS_INT src_bytes;
5301      enum coding_category category;
5302 {
5303   const unsigned char *src = source, *src_end = src + src_bytes;
5304   unsigned char c;
5305   int total  = 0;
5306   int eol_seen = EOL_SEEN_NONE;
5307
5308   if ((1 << category) & CATEGORY_MASK_UTF_16)
5309     {
5310       int msb, lsb;
5311
5312       msb = category == (coding_category_utf_16_le
5313                          | coding_category_utf_16_le_nosig);
5314       lsb = 1 - msb;
5315
5316       while (src + 1 < src_end)
5317         {
5318           c = src[lsb];
5319           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5320             {
5321               int this_eol;
5322
5323               if (c == '\n')
5324                 this_eol = EOL_SEEN_LF;
5325               else if (src + 3 >= src_end
5326                        || src[msb + 2] != 0
5327                        || src[lsb + 2] != '\n')
5328                 this_eol = EOL_SEEN_CR;
5329               else
5330                 this_eol = EOL_SEEN_CRLF;
5331
5332               if (eol_seen == EOL_SEEN_NONE)
5333                 /* This is the first end-of-line.  */
5334                 eol_seen = this_eol;
5335               else if (eol_seen != this_eol)
5336                 {
5337                   /* The found type is different from what found before.  */
5338                   eol_seen = EOL_SEEN_LF;
5339                   break;
5340                 }
5341               if (++total == MAX_EOL_CHECK_COUNT)
5342                 break;
5343             }
5344           src += 2;
5345         }
5346     }
5347   else
5348     {
5349       while (src < src_end)
5350         {
5351           c = *src++;
5352           if (c == '\n' || c == '\r')
5353             {
5354               int this_eol;
5355
5356               if (c == '\n')
5357                 this_eol = EOL_SEEN_LF;
5358               else if (src >= src_end || *src != '\n')
5359                 this_eol = EOL_SEEN_CR;
5360               else
5361                 this_eol = EOL_SEEN_CRLF, src++;
5362
5363               if (eol_seen == EOL_SEEN_NONE)
5364                 /* This is the first end-of-line.  */
5365                 eol_seen = this_eol;
5366               else if (eol_seen != this_eol)
5367                 {
5368                   /* The found type is different from what found before.  */
5369                   eol_seen = EOL_SEEN_LF;
5370                   break;
5371                 }
5372               if (++total == MAX_EOL_CHECK_COUNT)
5373                 break;
5374             }
5375         }
5376     }
5377   return eol_seen;
5378 }
5379
5380
5381 static Lisp_Object
5382 adjust_coding_eol_type (coding, eol_seen)
5383      struct coding_system *coding;
5384      int eol_seen;
5385 {
5386   Lisp_Object eol_type;
5387
5388   eol_type = CODING_ID_EOL_TYPE (coding->id);
5389   if (eol_seen & EOL_SEEN_LF)
5390     {
5391       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5392       eol_type = Qunix;
5393     }
5394   else if (eol_seen & EOL_SEEN_CRLF)
5395     {
5396       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5397       eol_type = Qdos;
5398     }
5399   else if (eol_seen & EOL_SEEN_CR)
5400     {
5401       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5402       eol_type = Qmac;
5403     }
5404   return eol_type;
5405 }
5406
5407 /* Detect how a text specified in CODING is encoded.  If a coding
5408    system is detected, update fields of CODING by the detected coding
5409    system.  */
5410
5411 void
5412 detect_coding (coding)
5413      struct coding_system *coding;
5414 {
5415   const unsigned char *src, *src_end;
5416
5417   coding->consumed = coding->consumed_char = 0;
5418   coding->produced = coding->produced_char = 0;
5419   coding_set_source (coding);
5420
5421   src_end = coding->source + coding->src_bytes;
5422
5423   /* If we have not yet decided the text encoding type, detect it
5424      now.  */
5425   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5426     {
5427       int c, i;
5428       struct coding_detection_info detect_info;
5429
5430       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5431       for (i = 0, src = coding->source; src < src_end; i++, src++)
5432         {
5433           c = *src;
5434           if (c & 0x80)
5435             break;
5436           if (c < 0x20
5437               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5438               && ! inhibit_iso_escape_detection
5439               && ! detect_info.checked)
5440             {
5441               coding->head_ascii = src - (coding->source + coding->consumed);
5442               if (detect_coding_iso_2022 (coding, &detect_info))
5443                 {
5444                   /* We have scanned the whole data.  */
5445                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5446                     /* We didn't find an 8-bit code.  */
5447                     src = src_end;
5448                   break;
5449                 }
5450             }
5451         }
5452       coding->head_ascii = src - (coding->source + coding->consumed);
5453
5454       if (coding->head_ascii < coding->src_bytes
5455           || detect_info.found)
5456         {
5457           enum coding_category category;
5458           struct coding_system *this;
5459
5460           if (coding->head_ascii == coding->src_bytes)
5461             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5462             for (i = 0; i < coding_category_raw_text; i++)
5463               {
5464                 category = coding_priorities[i];
5465                 this = coding_categories + category;
5466                 if (detect_info.found & (1 << category))
5467                   break;
5468               }
5469           else
5470             for (i = 0; i < coding_category_raw_text; i++)
5471               {
5472                 category = coding_priorities[i];
5473                 this = coding_categories + category;
5474                 if (this->id < 0)
5475                   {
5476                     /* No coding system of this category is defined.  */
5477                     detect_info.rejected |= (1 << category);
5478                   }
5479                 else if (category >= coding_category_raw_text)
5480                   continue;
5481                 else if (detect_info.checked & (1 << category))
5482                   {
5483                     if (detect_info.found & (1 << category))
5484                       break;
5485                   }
5486                 else if ((*(this->detector)) (coding, &detect_info)
5487                          && detect_info.found & (1 << category))
5488                   {
5489                     if (category == coding_category_utf_16_auto)
5490                       {
5491                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5492                           category = coding_category_utf_16_le;
5493                         else
5494                           category = coding_category_utf_16_be;
5495                       }
5496                     break;
5497                   }
5498               }
5499
5500           if (i < coding_category_raw_text)
5501             setup_coding_system (CODING_ID_NAME (this->id), coding);
5502           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5503             setup_coding_system (Qraw_text, coding);
5504           else if (detect_info.rejected)
5505             for (i = 0; i < coding_category_raw_text; i++)
5506               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5507                 {
5508                   this = coding_categories + coding_priorities[i];
5509                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5510                   break;
5511                 }
5512         }
5513     }
5514   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5515            == coding_category_utf_16_auto)
5516     {
5517       Lisp_Object coding_systems;
5518       struct coding_detection_info detect_info;
5519
5520       coding_systems
5521         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5522       detect_info.found = detect_info.rejected = 0;
5523       if (CONSP (coding_systems)
5524           && detect_coding_utf_16 (coding, &detect_info))
5525         {
5526           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5527             setup_coding_system (XCAR (coding_systems), coding);
5528           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5529             setup_coding_system (XCDR (coding_systems), coding);
5530         }
5531     }
5532 }
5533
5534
5535 static void
5536 decode_eol (coding)
5537      struct coding_system *coding;
5538 {
5539   Lisp_Object eol_type;
5540   unsigned char *p, *pbeg, *pend;
5541
5542   eol_type = CODING_ID_EOL_TYPE (coding->id);
5543   if (EQ (eol_type, Qunix))
5544     return;
5545
5546   if (NILP (coding->dst_object))
5547     pbeg = coding->destination;
5548   else
5549     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5550   pend = pbeg + coding->produced;
5551
5552   if (VECTORP (eol_type))
5553     {
5554       int eol_seen = EOL_SEEN_NONE;
5555
5556       for (p = pbeg; p < pend; p++)
5557         {
5558           if (*p == '\n')
5559             eol_seen |= EOL_SEEN_LF;
5560           else if (*p == '\r')
5561             {
5562               if (p + 1 < pend && *(p + 1) == '\n')
5563                 {
5564                   eol_seen |= EOL_SEEN_CRLF;
5565                   p++;
5566                 }
5567               else
5568                 eol_seen |= EOL_SEEN_CR;
5569             }
5570         }
5571       if (eol_seen != EOL_SEEN_NONE
5572           && eol_seen != EOL_SEEN_LF
5573           && eol_seen != EOL_SEEN_CRLF
5574           && eol_seen != EOL_SEEN_CR)
5575         eol_seen = EOL_SEEN_LF;
5576       if (eol_seen != EOL_SEEN_NONE)
5577         eol_type = adjust_coding_eol_type (coding, eol_seen);
5578     }
5579
5580   if (EQ (eol_type, Qmac))
5581     {
5582       for (p = pbeg; p < pend; p++)
5583         if (*p == '\r')
5584           *p = '\n';
5585     }
5586   else if (EQ (eol_type, Qdos))
5587     {
5588       int n = 0;
5589
5590       if (NILP (coding->dst_object))
5591         {
5592           for (p = pend - 2; p >= pbeg; p--)
5593             if (*p == '\r')
5594               {
5595                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5596                 n++;
5597               }
5598         }
5599       else
5600         {
5601           for (p = pend - 2; p >= pbeg; p--)
5602             if (*p == '\r')
5603               {
5604                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5605                 int pos = BYTE_TO_CHAR (pos_byte);
5606
5607                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5608                 n++;
5609               }
5610         }
5611       coding->produced -= n;
5612       coding->produced_char -= n;
5613     }
5614 }
5615
5616
5617 /* Return a translation table (or list of them) from coding system
5618    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5619    decoding (ENCODEP is zero). */
5620
5621 static Lisp_Object
5622 get_translation_table (attrs, encodep, max_lookup)
5623      Lisp_Object attrs;
5624      int encodep, *max_lookup;
5625 {
5626   Lisp_Object standard, translation_table;
5627   Lisp_Object val;
5628
5629   if (encodep)
5630     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5631       standard = Vstandard_translation_table_for_encode;
5632   else
5633     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5634       standard = Vstandard_translation_table_for_decode;
5635   if (NILP (translation_table))
5636     translation_table = standard;
5637   else
5638     {
5639       if (SYMBOLP (translation_table))
5640         translation_table = Fget (translation_table, Qtranslation_table);
5641       else if (CONSP (translation_table))
5642         {
5643           translation_table = Fcopy_sequence (translation_table);
5644           for (val = translation_table; CONSP (val); val = XCDR (val))
5645             if (SYMBOLP (XCAR (val)))
5646               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5647         }
5648       if (CHAR_TABLE_P (standard))
5649         {
5650           if (CONSP (translation_table))
5651             translation_table = nconc2 (translation_table,
5652                                         Fcons (standard, Qnil));
5653           else
5654             translation_table = Fcons (translation_table,
5655                                        Fcons (standard, Qnil));
5656         }
5657     }
5658
5659   if (max_lookup)
5660     {
5661       *max_lookup = 1;
5662       if (CHAR_TABLE_P (translation_table)
5663           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5664         {
5665           val = XCHAR_TABLE (translation_table)->extras[1];
5666           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5667             *max_lookup = XFASTINT (val);
5668         }
5669       else if (CONSP (translation_table))
5670         {
5671           Lisp_Object tail, val;
5672
5673           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5674             if (CHAR_TABLE_P (XCAR (tail))
5675                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5676               {
5677                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5678                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5679                   *max_lookup = XFASTINT (val);
5680               }
5681         }
5682     }
5683   return translation_table;
5684 }
5685
/* Look up the character C in TABLE, which is a char-table or a list
   of char-tables (anything else is ignored).  If the value found is
   a character, C is set to that character and TRANS is set to nil.
   Any other value is left in TRANS.  For a list, the tables are
   consulted in order with the possibly updated C, stopping at the
   first value that is neither nil nor a character.

   C and TRANS must be lvalues; the arguments are evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail = table;                               \
                                                                \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
5710
5711
5712 static Lisp_Object
5713 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5714      Lisp_Object val;
5715      int *buf, *buf_end;
5716      int last_block;
5717      int *from_nchars, *to_nchars;
5718 {
5719   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5720      [TO-CHAR ...].  */
5721   if (CONSP (val))
5722     {
5723       Lisp_Object from, tail;
5724       int i, len;
5725
5726       for (tail = val; CONSP (tail); tail = XCDR (tail))
5727         {
5728           val = XCAR (tail);
5729           from = XCAR (val);
5730           len = ASIZE (from);
5731           for (i = 0; i < len; i++)
5732             {
5733               if (buf + i == buf_end)
5734                 {
5735                   if (! last_block)
5736                     return Qt;
5737                   break;
5738                 }
5739               if (XINT (AREF (from, i)) != buf[i])
5740                 break;
5741             }
5742           if (i == len)
5743             {
5744               val = XCDR (val);
5745               *from_nchars = len;
5746               break;
5747             }
5748         }
5749       if (! CONSP (tail))
5750         return Qnil;
5751     }
5752   if (VECTORP (val))
5753     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5754   else
5755     *buf = XINT (val);
5756   return val;
5757 }
5758
5759
5760 static int
5761 produce_chars (coding, translation_table, last_block)
5762      struct coding_system *coding;
5763      Lisp_Object translation_table;
5764      int last_block;
5765 {
5766   unsigned char *dst = coding->destination + coding->produced;
5767   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5768   int produced;
5769   int produced_chars = 0;
5770   int carryover = 0;
5771
5772   if (! coding->chars_at_source)
5773     {
5774       /* Characters are in coding->charbuf.  */
5775       int *buf = coding->charbuf;
5776       int *buf_end = buf + coding->charbuf_used;
5777
5778       if (BUFFERP (coding->src_object)
5779           && EQ (coding->src_object, coding->dst_object))
5780         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5781
5782       while (buf < buf_end)
5783         {
5784           int c = *buf, i;
5785
5786           if (c >= 0)
5787             {
5788               int from_nchars = 1, to_nchars = 1;
5789               Lisp_Object trans = Qnil;
5790
5791               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5792               if (! NILP (trans))
5793                 {
5794                   trans = get_translation (trans, buf, buf_end, last_block,
5795                                            &from_nchars, &to_nchars);
5796                   if (EQ (trans, Qt))
5797                     break;
5798                   c = *buf;
5799                 }
5800
5801               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5802                 {
5803                   dst = alloc_destination (coding,
5804                                            buf_end - buf
5805                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5806                                            dst);
5807                   dst_end = coding->destination + coding->dst_bytes;
5808                 }
5809
5810               for (i = 0; i < to_nchars; i++)
5811                 {
5812                   if (i > 0)
5813                     c = XINT (AREF (trans, i));
5814                   if (coding->dst_multibyte
5815                       || ! CHAR_BYTE8_P (c))
5816                     CHAR_STRING_ADVANCE (c, dst);
5817                   else
5818                     *dst++ = CHAR_TO_BYTE8 (c);
5819                 }
5820               produced_chars += to_nchars;
5821               *buf++ = to_nchars;
5822               while (--from_nchars > 0)
5823                 *buf++ = 0;
5824             }
5825           else
5826             /* This is an annotation datum.  (-C) is the length.  */
5827             buf += -c;
5828         }
5829       carryover = buf_end - buf;
5830     }
5831   else
5832     {
5833       const unsigned char *src = coding->source;
5834       const unsigned char *src_end = src + coding->src_bytes;
5835       Lisp_Object eol_type;
5836
5837       eol_type = CODING_ID_EOL_TYPE (coding->id);
5838
5839       if (coding->src_multibyte != coding->dst_multibyte)
5840         {
5841           if (coding->src_multibyte)
5842             {
5843               int multibytep = 1;
5844               int consumed_chars;
5845
5846               while (1)
5847                 {
5848                   const unsigned char *src_base = src;
5849                   int c;
5850
5851                   ONE_MORE_BYTE (c);
5852                   if (c == '\r')
5853                     {
5854                       if (EQ (eol_type, Qdos))
5855                         {
5856                           if (src == src_end)
5857                             {
5858                               record_conversion_result
5859                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5860                               goto no_more_source;
5861                             }
5862                           if (*src == '\n')
5863                             c = *src++;
5864                         }
5865                       else if (EQ (eol_type, Qmac))
5866                         c = '\n';
5867                     }
5868                   if (dst == dst_end)
5869                     {
5870                       coding->consumed = src - coding->source;
5871
5872                     if (EQ (coding->src_object, coding->dst_object))
5873                       dst_end = (unsigned char *) src;
5874                     if (dst == dst_end)
5875                       {
5876                         dst = alloc_destination (coding, src_end - src + 1,
5877                                                  dst);
5878                         dst_end = coding->destination + coding->dst_bytes;
5879                         coding_set_source (coding);
5880                         src = coding->source + coding->consumed;
5881                         src_end = coding->source + coding->src_bytes;
5882                       }
5883                     }
5884                   *dst++ = c;
5885                   produced_chars++;
5886                 }
5887             no_more_source:
5888               ;
5889             }
5890           else
5891             while (src < src_end)
5892               {
5893                 int multibytep = 1;
5894                 int c = *src++;
5895
5896                 if (c == '\r')
5897                   {
5898                     if (EQ (eol_type, Qdos))
5899                       {
5900                         if (src < src_end
5901                             && *src == '\n')
5902                           c = *src++;
5903                       }
5904                     else if (EQ (eol_type, Qmac))
5905                       c = '\n';
5906                   }
5907                 if (dst >= dst_end - 1)
5908                   {
5909                     coding->consumed = src - coding->source;
5910
5911                     if (EQ (coding->src_object, coding->dst_object))
5912                       dst_end = (unsigned char *) src;
5913                     if (dst >= dst_end - 1)
5914                       {
5915                         dst = alloc_destination (coding, src_end - src + 2,
5916                                                  dst);
5917                         dst_end = coding->destination + coding->dst_bytes;
5918                         coding_set_source (coding);
5919                         src = coding->source + coding->consumed;
5920                         src_end = coding->source + coding->src_bytes;
5921                       }
5922                   }
5923                 EMIT_ONE_BYTE (c);
5924               }
5925         }
5926       else
5927         {
5928           if (!EQ (coding->src_object, coding->dst_object))
5929             {
5930               int require = coding->src_bytes - coding->dst_bytes;
5931
5932               if (require > 0)
5933                 {
5934                   EMACS_INT offset = src - coding->source;
5935
5936                   dst = alloc_destination (coding, require, dst);
5937                   coding_set_source (coding);
5938                   src = coding->source + offset;
5939                   src_end = coding->source + coding->src_bytes;
5940                 }
5941             }
5942           produced_chars = coding->src_chars;
5943           while (src < src_end)
5944             {
5945               int c = *src++;
5946
5947               if (c == '\r')
5948                 {
5949                   if (EQ (eol_type, Qdos))
5950                     {
5951                       if (src < src_end
5952                           && *src == '\n')
5953                         c = *src++;
5954                       produced_chars--;
5955                     }
5956                   else if (EQ (eol_type, Qmac))
5957                     c = '\n';
5958                 }
5959               *dst++ = c;
5960             }
5961         }
5962       coding->consumed = coding->src_bytes;
5963       coding->consumed_char = coding->src_chars;
5964     }
5965
5966   produced = dst - (coding->destination + coding->produced);
5967   if (BUFFERP (coding->dst_object))
5968     insert_from_gap (produced_chars, produced);
5969   coding->produced += produced;
5970   coding->produced_char += produced_chars;
5971   return carryover;
5972 }
5973
5974 /* Compose text in CODING->object according to the annotation data at
5975    CHARBUF.  CHARBUF is an array:
5976      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5977  */
5978
5979 static INLINE void
5980 produce_composition (coding, charbuf, pos)
5981      struct coding_system *coding;
5982      int *charbuf;
5983      EMACS_INT pos;
5984 {
5985   int len;
5986   EMACS_INT to;
5987   enum composition_method method;
5988   Lisp_Object components;
5989
5990   len = -charbuf[0];
5991   to = pos + charbuf[2];
5992   if (to <= pos)
5993     return;
5994   method = (enum composition_method) (charbuf[3]);
5995
5996   if (method == COMPOSITION_RELATIVE)
5997     components = Qnil;
5998   else if (method >= COMPOSITION_WITH_RULE
5999            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6000     {
6001       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6002       int i;
6003
6004       len -= 4;
6005       charbuf += 4;
6006       for (i = 0; i < len; i++)
6007         {
6008           args[i] = make_number (charbuf[i]);
6009           if (args[i] < 0)
6010             return;
6011         }
6012       components = (method == COMPOSITION_WITH_ALTCHARS
6013                     ? Fstring (len, args) : Fvector (len, args));
6014     }
6015   else
6016     return;
6017   compose_text (pos, to, components, Qnil, coding->dst_object);
6018 }
6019
6020
6021 /* Put `charset' property on text in CODING->object according to
6022    the annotation data at CHARBUF.  CHARBUF is an array:
6023      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6024  */
6025
6026 static INLINE void
6027 produce_charset (coding, charbuf, pos)
6028      struct coding_system *coding;
6029      int *charbuf;
6030      EMACS_INT pos;
6031 {
6032   EMACS_INT from = pos - charbuf[2];
6033   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6034
6035   Fput_text_property (make_number (from), make_number (pos),
6036                       Qcharset, CHARSET_NAME (charset),
6037                       coding->dst_object);
6038 }
6039
6040
/* The number of elements first tried for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate a work area for CODING->charbuf by alloca, and record its
   number of elements in CODING->charbuf_size.  If the allocation of
   CHARBUF_SIZE elements yields NULL, retry with half the size while
   the size is larger than 1024.

   If no work area can be allocated, record
   CODING_RESULT_INSUFFICIENT_MEM and RETURN CODING->result FROM THE
   FUNCTION that uses this macro.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6062
6063
6064 static void
6065 produce_annotation (coding, pos)
6066      struct coding_system *coding;
6067      EMACS_INT pos;
6068 {
6069   int *charbuf = coding->charbuf;
6070   int *charbuf_end = charbuf + coding->charbuf_used;
6071
6072   if (NILP (coding->dst_object))
6073     return;
6074
6075   while (charbuf < charbuf_end)
6076     {
6077       if (*charbuf >= 0)
6078         pos += *charbuf++;
6079       else
6080         {
6081           int len = -*charbuf;
6082           switch (charbuf[1])
6083             {
6084             case CODING_ANNOTATE_COMPOSITION_MASK:
6085               produce_composition (coding, charbuf, pos);
6086               break;
6087             case CODING_ANNOTATE_CHARSET_MASK:
6088               produce_charset (coding, charbuf, pos);
6089               break;
6090             default:
6091               abort ();
6092             }
6093           charbuf += len;
6094         }
6095     }
6096 }
6097
6098 /* Decode the data at CODING->src_object into CODING->dst_object.
6099    CODING->src_object is a buffer, a string, or nil.
6100    CODING->dst_object is a buffer.
6101
6102    If CODING->src_object is a buffer, it must be the current buffer.
6103    In this case, if CODING->src_pos is positive, it is a position of
6104    the source text in the buffer, otherwise, the source text is in the
6105    gap area of the buffer, and CODING->src_pos specifies the offset of
6106    the text from GPT (which must be the same as PT).  If this is the
6107    same buffer as CODING->dst_object, CODING->src_pos must be
6108    negative.
6109
6110    If CODING->src_object is a string, CODING->src_pos in an index to
6111    that string.
6112
6113    If CODING->src_object is nil, CODING->source must already point to
6114    the non-relocatable memory area.  In this case, CODING->src_pos is
6115    an offset from CODING->source.
6116
6117    The decoded data is inserted at the current point of the buffer
6118    CODING->dst_object.
6119 */
6120
6121 static int
6122 decode_coding (coding)
6123      struct coding_system *coding;
6124 {
6125   Lisp_Object attrs;
6126   Lisp_Object undo_list;
6127   Lisp_Object translation_table;
6128   int carryover;
6129   int i;
6130
6131   if (BUFFERP (coding->src_object)
6132       && coding->src_pos > 0
6133       && coding->src_pos < GPT
6134       && coding->src_pos + coding->src_chars > GPT)
6135     move_gap_both (coding->src_pos, coding->src_pos_byte);
6136
6137   undo_list = Qt;
6138   if (BUFFERP (coding->dst_object))
6139     {
6140       if (current_buffer != XBUFFER (coding->dst_object))
6141         set_buffer_internal (XBUFFER (coding->dst_object));
6142       if (GPT != PT)
6143         move_gap_both (PT, PT_BYTE);
6144       undo_list = current_buffer->undo_list;
6145       current_buffer->undo_list = Qt;
6146     }
6147
6148   coding->consumed = coding->consumed_char = 0;
6149   coding->produced = coding->produced_char = 0;
6150   coding->chars_at_source = 0;
6151   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6152   coding->errors = 0;
6153
6154   ALLOC_CONVERSION_WORK_AREA (coding);
6155
6156   attrs = CODING_ID_ATTRS (coding->id);
6157   translation_table = get_translation_table (attrs, 0, NULL);
6158
6159   carryover = 0;
6160   do
6161     {
6162       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6163
6164       coding_set_source (coding);
6165       coding->annotated = 0;
6166       coding->charbuf_used = carryover;
6167       (*(coding->decoder)) (coding);
6168       coding_set_destination (coding);
6169       carryover = produce_chars (coding, translation_table, 0);
6170       if (coding->annotated)
6171         produce_annotation (coding, pos);
6172       for (i = 0; i < carryover; i++)
6173         coding->charbuf[i]
6174           = coding->charbuf[coding->charbuf_used - carryover + i];
6175     }
6176   while (coding->consumed < coding->src_bytes
6177          && ! coding->result);
6178
6179   if (carryover > 0)
6180     {
6181       coding_set_destination (coding);
6182       coding->charbuf_used = carryover;
6183       produce_chars (coding, translation_table, 1);
6184     }
6185
6186   coding->carryover_bytes = 0;
6187   if (coding->consumed < coding->src_bytes)
6188     {
6189       int nbytes = coding->src_bytes - coding->consumed;
6190       const unsigned char *src;
6191
6192       coding_set_source (coding);
6193       coding_set_destination (coding);
6194       src = coding->source + coding->consumed;
6195
6196       if (coding->mode & CODING_MODE_LAST_BLOCK)
6197         {
6198           /* Flush out unprocessed data as binary chars.  We are sure
6199              that the number of data is less than the size of
6200              coding->charbuf.  */
6201           coding->charbuf_used = 0;
6202           while (nbytes-- > 0)
6203             {
6204               int c = *src++;
6205
6206               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6207             }
6208           produce_chars (coding, Qnil, 1);
6209         }
6210       else
6211         {
6212           /* Record unprocessed bytes in coding->carryover.  We are
6213              sure that the number of data is less than the size of
6214              coding->carryover.  */
6215           unsigned char *p = coding->carryover;
6216
6217           coding->carryover_bytes = nbytes;
6218           while (nbytes-- > 0)
6219             *p++ = *src++;
6220         }
6221       coding->consumed = coding->src_bytes;
6222     }
6223
6224   if (BUFFERP (coding->dst_object))
6225     {
6226       current_buffer->undo_list = undo_list;
6227       record_insert (coding->dst_pos, coding->produced_char);
6228     }
6229   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6230     decode_eol (coding);
6231   return coding->result;
6232 }
6233
6234
6235 /* Extract an annotation datum from a composition starting at POS and
6236    ending before LIMIT of CODING->src_object (buffer or string), store
6237    the data in BUF, set *STOP to a starting position of the next
6238    composition (if any) or to LIMIT, and return the address of the
6239    next element of BUF.
6240
6241    If such an annotation is not found, set *STOP to a starting
6242    position of a composition after POS (if any) or to LIMIT, and
6243    return BUF.  */
6244
6245 static INLINE int *
6246 handle_composition_annotation (pos, limit, coding, buf, stop)
6247      EMACS_INT pos, limit;
6248      struct coding_system *coding;
6249      int *buf;
6250      EMACS_INT *stop;
6251 {
6252   EMACS_INT start, end;
6253   Lisp_Object prop;
6254
6255   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6256       || end > limit)
6257     *stop = limit;
6258   else if (start > pos)
6259     *stop = start;
6260   else
6261     {
6262       if (start == pos)
6263         {
6264           /* We found a composition.  Store the corresponding
6265              annotation data in BUF.  */
6266           int *head = buf;
6267           enum composition_method method = COMPOSITION_METHOD (prop);
6268           int nchars = COMPOSITION_LENGTH (prop);
6269
6270           ADD_COMPOSITION_DATA (buf, nchars, method);
6271           if (method != COMPOSITION_RELATIVE)
6272             {
6273               Lisp_Object components;
6274               int len, i, i_byte;
6275
6276               components = COMPOSITION_COMPONENTS (prop);
6277               if (VECTORP (components))
6278                 {
6279                   len = XVECTOR (components)->size;
6280                   for (i = 0; i < len; i++)
6281                     *buf++ = XINT (AREF (components, i));
6282                 }
6283               else if (STRINGP (components))
6284                 {
6285                   len = SCHARS (components);
6286                   i = i_byte = 0;
6287                   while (i < len)
6288                     {
6289                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6290                       buf++;
6291                     }
6292                 }
6293               else if (INTEGERP (components))
6294                 {
6295                   len = 1;
6296                   *buf++ = XINT (components);
6297                 }
6298               else if (CONSP (components))
6299                 {
6300                   for (len = 0; CONSP (components);
6301                        len++, components = XCDR (components))
6302                     *buf++ = XINT (XCAR (components));
6303                 }
6304               else
6305                 abort ();
6306               *head -= len;
6307             }
6308         }
6309
6310       if (find_composition (end, limit, &start, &end, &prop,
6311                             coding->src_object)
6312           && end <= limit)
6313         *stop = start;
6314       else
6315         *stop = limit;
6316     }
6317   return buf;
6318 }
6319
6320
6321 /* Extract an annotation datum from a text property `charset' at POS of
6322    CODING->src_object (buffer of string), store the data in BUF, set
6323    *STOP to the position where the value of `charset' property changes
6324    (limiting by LIMIT), and return the address of the next element of
6325    BUF.
6326
6327    If the property value is nil, set *STOP to the position where the
6328    property value is non-nil (limiting by LIMIT), and return BUF.  */
6329
6330 static INLINE int *
6331 handle_charset_annotation (pos, limit, coding, buf, stop)
6332      EMACS_INT pos, limit;
6333      struct coding_system *coding;
6334      int *buf;
6335      EMACS_INT *stop;
6336 {
6337   Lisp_Object val, next;
6338   int id;
6339
6340   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6341   if (! NILP (val) && CHARSETP (val))
6342     id = XINT (CHARSET_SYMBOL_ID (val));
6343   else
6344     id = -1;
6345   ADD_CHARSET_DATA (buf, 0, id);
6346   next = Fnext_single_property_change (make_number (pos), Qcharset,
6347                                        coding->src_object,
6348                                        make_number (limit));
6349   *stop = XINT (next);
6350   return buf;
6351 }
6352
6353
6354 static void
6355 consume_chars (coding, translation_table, max_lookup)
6356      struct coding_system *coding;
6357      Lisp_Object translation_table;
6358      int max_lookup;
6359 {
6360   int *buf = coding->charbuf;
6361   int *buf_end = coding->charbuf + coding->charbuf_size;
6362   const unsigned char *src = coding->source + coding->consumed;
6363   const unsigned char *src_end = coding->source + coding->src_bytes;
6364   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6365   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6366   int multibytep = coding->src_multibyte;
6367   Lisp_Object eol_type;
6368   int c;
6369   EMACS_INT stop, stop_composition, stop_charset;
6370   int *lookup_buf = NULL;
6371
6372   if (! NILP (translation_table))
6373     lookup_buf = alloca (sizeof (int) * max_lookup);
6374
6375   eol_type = CODING_ID_EOL_TYPE (coding->id);
6376   if (VECTORP (eol_type))
6377     eol_type = Qunix;
6378
6379   /* Note: composition handling is not yet implemented.  */
6380   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6381
6382   if (NILP (coding->src_object))
6383     stop = stop_composition = stop_charset = end_pos;
6384   else
6385     {
6386       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6387         stop = stop_composition = pos;
6388       else
6389         stop = stop_composition = end_pos;
6390       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6391         stop = stop_charset = pos;
6392       else
6393         stop_charset = end_pos;
6394     }
6395
6396   /* Compensate for CRLF and conversion.  */
6397   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6398   while (buf < buf_end)
6399     {
6400       Lisp_Object trans;
6401
6402       if (pos == stop)
6403         {
6404           if (pos == end_pos)
6405             break;
6406           if (pos == stop_composition)
6407             buf = handle_composition_annotation (pos, end_pos, coding,
6408                                                  buf, &stop_composition);
6409           if (pos == stop_charset)
6410             buf = handle_charset_annotation (pos, end_pos, coding,
6411                                              buf, &stop_charset);
6412           stop = (stop_composition < stop_charset
6413                   ? stop_composition : stop_charset);
6414         }
6415
6416       if (! multibytep)
6417         {
6418           EMACS_INT bytes;
6419
6420           if (coding->encoder == encode_coding_raw_text)
6421             c = *src++, pos++;
6422           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6423             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6424           else
6425             c = BYTE8_TO_CHAR (*src), src++, pos++;
6426         }
6427       else
6428         c = STRING_CHAR_ADVANCE (src), pos++;
6429       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6430         c = '\n';
6431       if (! EQ (eol_type, Qunix))
6432         {
6433           if (c == '\n')
6434             {
6435               if (EQ (eol_type, Qdos))
6436                 *buf++ = '\r';
6437               else
6438                 c = '\r';
6439             }
6440         }
6441
6442       trans = Qnil;
6443       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6444       if (NILP (trans))
6445         *buf++ = c;
6446       else
6447         {
6448           int from_nchars = 1, to_nchars = 1;
6449           int *lookup_buf_end;
6450           const unsigned char *p = src;
6451           int i;
6452
6453           lookup_buf[0] = c;
6454           for (i = 1; i < max_lookup && p < src_end; i++)
6455             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6456           lookup_buf_end = lookup_buf + i;
6457           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6458                                    &from_nchars, &to_nchars);
6459           if (EQ (trans, Qt)
6460               || buf + to_nchars > buf_end)
6461             break;
6462           *buf++ = *lookup_buf;
6463           for (i = 1; i < to_nchars; i++)
6464             *buf++ = XINT (AREF (trans, i));
6465           for (i = 1; i < from_nchars; i++, pos++)
6466             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6467         }
6468     }
6469
6470   coding->consumed = src - coding->source;
6471   coding->consumed_char = pos - coding->src_pos;
6472   coding->charbuf_used = buf - coding->charbuf;
6473   coding->chars_at_source = 0;
6474 }
6475
6476
6477 /* Encode the text at CODING->src_object into CODING->dst_object.
6478    CODING->src_object is a buffer or a string.
6479    CODING->dst_object is a buffer or nil.
6480
6481    If CODING->src_object is a buffer, it must be the current buffer.
6482    In this case, if CODING->src_pos is positive, it is a position of
6483    the source text in the buffer, otherwise. the source text is in the
6484    gap area of the buffer, and coding->src_pos specifies the offset of
6485    the text from GPT (which must be the same as PT).  If this is the
6486    same buffer as CODING->dst_object, CODING->src_pos must be
6487    negative and CODING should not have `pre-write-conversion'.
6488
6489    If CODING->src_object is a string, CODING should not have
6490    `pre-write-conversion'.
6491
6492    If CODING->dst_object is a buffer, the encoded data is inserted at
6493    the current point of that buffer.
6494
6495    If CODING->dst_object is nil, the encoded data is placed at the
6496    memory area specified by CODING->destination.  */
6497
6498 static int
6499 encode_coding (coding)
6500      struct coding_system *coding;
6501 {
6502   Lisp_Object attrs;
6503   Lisp_Object translation_table;
6504   int max_lookup;
6505
6506   attrs = CODING_ID_ATTRS (coding->id);
6507   if (coding->encoder == encode_coding_raw_text)
6508     translation_table = Qnil, max_lookup = 0;
6509   else
6510     translation_table = get_translation_table (attrs, 1, &max_lookup);
6511
6512   if (BUFFERP (coding->dst_object))
6513     {
6514       set_buffer_internal (XBUFFER (coding->dst_object));
6515       coding->dst_multibyte
6516         = ! NILP (current_buffer->enable_multibyte_characters);
6517     }
6518
6519   coding->consumed = coding->consumed_char = 0;
6520   coding->produced = coding->produced_char = 0;
6521   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6522   coding->errors = 0;
6523
6524   ALLOC_CONVERSION_WORK_AREA (coding);
6525
6526   do {
6527     coding_set_source (coding);
6528     consume_chars (coding, translation_table, max_lookup);
6529     coding_set_destination (coding);
6530     (*(coding->encoder)) (coding);
6531   } while (coding->consumed_char < coding->src_chars);
6532
6533   if (BUFFERP (coding->dst_object))
6534     insert_from_gap (coding->produced_char, coding->produced);
6535
6536   return (coding->result);
6537 }
6538
6539
6540 /* Name (or base name) of work buffer for code conversion.  */
6541 static Lisp_Object Vcode_conversion_workbuf_name;
6542
6543 /* A working buffer used by the top level conversion.  Once it is
6544    created, it is never destroyed.  It has the name
6545    Vcode_conversion_workbuf_name.  The other working buffers are
6546    destroyed after the use is finished, and their names are modified
6547    versions of Vcode_conversion_workbuf_name.  */
6548 static Lisp_Object Vcode_conversion_reused_workbuf;
6549
6550 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6551 static int reused_workbuf_in_use;
6552
6553
6554 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6555    multibyteness of returning buffer.  */
6556
6557 static Lisp_Object
6558 make_conversion_work_buffer (multibyte)
6559      int multibyte;
6560 {
6561   Lisp_Object name, workbuf;
6562   struct buffer *current;
6563
6564   if (reused_workbuf_in_use++)
6565     {
6566       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6567       workbuf = Fget_buffer_create (name);
6568     }
6569   else
6570     {
6571       name = Vcode_conversion_workbuf_name;
6572       workbuf = Fget_buffer_create (name);
6573       if (NILP (Vcode_conversion_reused_workbuf))
6574         Vcode_conversion_reused_workbuf = workbuf;
6575     }
6576   current = current_buffer;
6577   set_buffer_internal (XBUFFER (workbuf));
6578   Ferase_buffer ();
6579   current_buffer->undo_list = Qt;
6580   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6581   set_buffer_internal (current);
6582   return workbuf;
6583 }
6584
6585
6586 static Lisp_Object
6587 code_conversion_restore (arg)
6588      Lisp_Object arg;
6589 {
6590   Lisp_Object current, workbuf;
6591
6592   current = XCAR (arg);
6593   workbuf = XCDR (arg);
6594   if (! NILP (workbuf))
6595     {
6596       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6597         reused_workbuf_in_use = 0;
6598       else if (! NILP (Fbuffer_live_p (workbuf)))
6599         Fkill_buffer (workbuf);
6600     }
6601   set_buffer_internal (XBUFFER (current));
6602   return Qnil;
6603 }
6604
6605 Lisp_Object
6606 code_conversion_save (with_work_buf, multibyte)
6607      int with_work_buf, multibyte;
6608 {
6609   Lisp_Object workbuf = Qnil;
6610
6611   if (with_work_buf)
6612     workbuf = make_conversion_work_buffer (multibyte);
6613   record_unwind_protect (code_conversion_restore,
6614                          Fcons (Fcurrent_buffer (), workbuf));
6615   return workbuf;
6616 }
6617
6618 int
6619 decode_coding_gap (coding, chars, bytes)
6620      struct coding_system *coding;
6621      EMACS_INT chars, bytes;
6622 {
6623   int count = specpdl_ptr - specpdl;
6624   Lisp_Object attrs;
6625
6626   code_conversion_save (0, 0);
6627
6628   coding->src_object = Fcurrent_buffer ();
6629   coding->src_chars = chars;
6630   coding->src_bytes = bytes;
6631   coding->src_pos = -chars;
6632   coding->src_pos_byte = -bytes;
6633   coding->src_multibyte = chars < bytes;
6634   coding->dst_object = coding->src_object;
6635   coding->dst_pos = PT;
6636   coding->dst_pos_byte = PT_BYTE;
6637   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6638   coding->mode |= CODING_MODE_LAST_BLOCK;
6639
6640   if (CODING_REQUIRE_DETECTION (coding))
6641     detect_coding (coding);
6642
6643   decode_coding (coding);
6644
6645   attrs = CODING_ID_ATTRS (coding->id);
6646   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6647     {
6648       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6649       Lisp_Object val;
6650
6651       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6652       val = call1 (CODING_ATTR_POST_READ (attrs),
6653                    make_number (coding->produced_char));
6654       CHECK_NATNUM (val);
6655       coding->produced_char += Z - prev_Z;
6656       coding->produced += Z_BYTE - prev_Z_BYTE;
6657     }
6658
6659   unbind_to (count, Qnil);
6660   return coding->result;
6661 }
6662
6663 int
6664 encode_coding_gap (coding, chars, bytes)
6665      struct coding_system *coding;
6666      EMACS_INT chars, bytes;
6667 {
6668   int count = specpdl_ptr - specpdl;
6669
6670   code_conversion_save (0, 0);
6671
6672   coding->src_object = Fcurrent_buffer ();
6673   coding->src_chars = chars;
6674   coding->src_bytes = bytes;
6675   coding->src_pos = -chars;
6676   coding->src_pos_byte = -bytes;
6677   coding->src_multibyte = chars < bytes;
6678   coding->dst_object = coding->src_object;
6679   coding->dst_pos = PT;
6680   coding->dst_pos_byte = PT_BYTE;
6681
6682   encode_coding (coding);
6683
6684   unbind_to (count, Qnil);
6685   return coding->result;
6686 }
6687
6688
6689 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6690    SRC_OBJECT into DST_OBJECT by coding context CODING.
6691
6692    SRC_OBJECT is a buffer, a string, or Qnil.
6693
6694    If it is a buffer, the text is at point of the buffer.  FROM and TO
6695    are positions in the buffer.
6696
6697    If it is a string, the text is at the beginning of the string.
6698    FROM and TO are indices to the string.
6699
6700    If it is nil, the text is at coding->source.  FROM and TO are
6701    indices to coding->source.
6702
6703    DST_OBJECT is a buffer, Qt, or Qnil.
6704
6705    If it is a buffer, the decoded text is inserted at point of the
6706    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6707    is deleted.
6708
6709    If it is Qt, a string is made from the decoded text, and
6710    set in CODING->dst_object.
6711
6712    If it is Qnil, the decoded text is stored at CODING->destination.
6713    The caller must allocate CODING->dst_bytes bytes at
6714    CODING->destination by xmalloc.  If the decoded text is longer than
6715    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6716  */
6717
6718 void
6719 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6720                       dst_object)
6721      struct coding_system *coding;
6722      Lisp_Object src_object;
6723      EMACS_INT from, from_byte, to, to_byte;
6724      Lisp_Object dst_object;
6725 {
6726   int count = specpdl_ptr - specpdl;
6727   unsigned char *destination;
6728   EMACS_INT dst_bytes;
6729   EMACS_INT chars = to - from;
6730   EMACS_INT bytes = to_byte - from_byte;
6731   Lisp_Object attrs;
6732   Lisp_Object buffer;
6733   int saved_pt = -1, saved_pt_byte;
6734
6735   buffer = Fcurrent_buffer ();
6736
6737   if (NILP (dst_object))
6738     {
6739       destination = coding->destination;
6740       dst_bytes = coding->dst_bytes;
6741     }
6742
6743   coding->src_object = src_object;
6744   coding->src_chars = chars;
6745   coding->src_bytes = bytes;
6746   coding->src_multibyte = chars < bytes;
6747
6748   if (STRINGP (src_object))
6749     {
6750       coding->src_pos = from;
6751       coding->src_pos_byte = from_byte;
6752     }
6753   else if (BUFFERP (src_object))
6754     {
6755       set_buffer_internal (XBUFFER (src_object));
6756       if (from != GPT)
6757         move_gap_both (from, from_byte);
6758       if (EQ (src_object, dst_object))
6759         {
6760           saved_pt = PT, saved_pt_byte = PT_BYTE;
6761           TEMP_SET_PT_BOTH (from, from_byte);
6762           del_range_both (from, from_byte, to, to_byte, 1);
6763           coding->src_pos = -chars;
6764           coding->src_pos_byte = -bytes;
6765         }
6766       else
6767         {
6768           coding->src_pos = from;
6769           coding->src_pos_byte = from_byte;
6770         }
6771     }
6772
6773   if (CODING_REQUIRE_DETECTION (coding))
6774     detect_coding (coding);
6775   attrs = CODING_ID_ATTRS (coding->id);
6776
6777   if (EQ (dst_object, Qt)
6778       || (! NILP (CODING_ATTR_POST_READ (attrs))
6779           && NILP (dst_object)))
6780     {
6781       coding->dst_object = code_conversion_save (1, 1);
6782       coding->dst_pos = BEG;
6783       coding->dst_pos_byte = BEG_BYTE;
6784       coding->dst_multibyte = 1;
6785     }
6786   else if (BUFFERP (dst_object))
6787     {
6788       code_conversion_save (0, 0);
6789       coding->dst_object = dst_object;
6790       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6791       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6792       coding->dst_multibyte
6793         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6794     }
6795   else
6796     {
6797       code_conversion_save (0, 0);
6798       coding->dst_object = Qnil;
6799       coding->dst_multibyte = 1;
6800     }
6801
6802   decode_coding (coding);
6803
6804   if (BUFFERP (coding->dst_object))
6805     set_buffer_internal (XBUFFER (coding->dst_object));
6806
6807   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6808     {
6809       struct gcpro gcpro1, gcpro2;
6810       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6811       Lisp_Object val;
6812
6813       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6814       GCPRO2 (coding->src_object, coding->dst_object);
6815       val = call1 (CODING_ATTR_POST_READ (attrs),
6816                    make_number (coding->produced_char));
6817       UNGCPRO;
6818       CHECK_NATNUM (val);
6819       coding->produced_char += Z - prev_Z;
6820       coding->produced += Z_BYTE - prev_Z_BYTE;
6821     }
6822
6823   if (EQ (dst_object, Qt))
6824     {
6825       coding->dst_object = Fbuffer_string ();
6826     }
6827   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6828     {
6829       set_buffer_internal (XBUFFER (coding->dst_object));
6830       if (dst_bytes < coding->produced)
6831         {
6832           destination
6833             = (unsigned char *) xrealloc (destination, coding->produced);
6834           if (! destination)
6835             {
6836               record_conversion_result (coding,
6837                                         CODING_RESULT_INSUFFICIENT_DST);
6838               unbind_to (count, Qnil);
6839               return;
6840             }
6841           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6842             move_gap_both (BEGV, BEGV_BYTE);
6843           bcopy (BEGV_ADDR, destination, coding->produced);
6844           coding->destination = destination;
6845         }
6846     }
6847
6848   if (saved_pt >= 0)
6849     {
6850       /* This is the case of:
6851          (BUFFERP (src_object) && EQ (src_object, dst_object))
6852          As we have moved PT while replacing the original buffer
6853          contents, we must recover it now.  */
6854       set_buffer_internal (XBUFFER (src_object));
6855       if (saved_pt < from)
6856         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6857       else if (saved_pt < from + chars)
6858         TEMP_SET_PT_BOTH (from, from_byte);
6859       else if (! NILP (current_buffer->enable_multibyte_characters))
6860         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6861                           saved_pt_byte + (coding->produced - bytes));
6862       else
6863         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6864                           saved_pt_byte + (coding->produced - bytes));
6865     }
6866
6867   unbind_to (count, coding->dst_object);
6868 }
6869
6870
6871 void
6872 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6873                       dst_object)
6874      struct coding_system *coding;
6875      Lisp_Object src_object;
6876      EMACS_INT from, from_byte, to, to_byte;
6877      Lisp_Object dst_object;
6878 {
6879   int count = specpdl_ptr - specpdl;
6880   EMACS_INT chars = to - from;
6881   EMACS_INT bytes = to_byte - from_byte;
6882   Lisp_Object attrs;
6883   Lisp_Object buffer;
6884   int saved_pt = -1, saved_pt_byte;
6885
6886   buffer = Fcurrent_buffer ();
6887
6888   coding->src_object = src_object;
6889   coding->src_chars = chars;
6890   coding->src_bytes = bytes;
6891   coding->src_multibyte = chars < bytes;
6892
6893   attrs = CODING_ID_ATTRS (coding->id);
6894
6895   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6896     {
6897       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6898       set_buffer_internal (XBUFFER (coding->src_object));
6899       if (STRINGP (src_object))
6900         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6901       else if (BUFFERP (src_object))
6902         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6903       else
6904         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6905
6906       if (EQ (src_object, dst_object))
6907         {
6908           set_buffer_internal (XBUFFER (src_object));
6909           saved_pt = PT, saved_pt_byte = PT_BYTE;
6910           del_range_both (from, from_byte, to, to_byte, 1);
6911           set_buffer_internal (XBUFFER (coding->src_object));
6912         }
6913
6914       call2 (CODING_ATTR_PRE_WRITE (attrs),
6915              make_number (BEG), make_number (Z));
6916       coding->src_object = Fcurrent_buffer ();
6917       if (BEG != GPT)
6918         move_gap_both (BEG, BEG_BYTE);
6919       coding->src_chars = Z - BEG;
6920       coding->src_bytes = Z_BYTE - BEG_BYTE;
6921       coding->src_pos = BEG;
6922       coding->src_pos_byte = BEG_BYTE;
6923       coding->src_multibyte = Z < Z_BYTE;
6924     }
6925   else if (STRINGP (src_object))
6926     {
6927       code_conversion_save (0, 0);
6928       coding->src_pos = from;
6929       coding->src_pos_byte = from_byte;
6930     }
6931   else if (BUFFERP (src_object))
6932     {
6933       code_conversion_save (0, 0);
6934       set_buffer_internal (XBUFFER (src_object));
6935       if (EQ (src_object, dst_object))
6936         {
6937           saved_pt = PT, saved_pt_byte = PT_BYTE;
6938           coding->src_object = del_range_1 (from, to, 1, 1);
6939           coding->src_pos = 0;
6940           coding->src_pos_byte = 0;
6941         }
6942       else
6943         {
6944           if (from < GPT && to >= GPT)
6945             move_gap_both (from, from_byte);
6946           coding->src_pos = from;
6947           coding->src_pos_byte = from_byte;
6948         }
6949     }
6950   else
6951     code_conversion_save (0, 0);
6952
6953   if (BUFFERP (dst_object))
6954     {
6955       coding->dst_object = dst_object;
6956       if (EQ (src_object, dst_object))
6957         {
6958           coding->dst_pos = from;
6959           coding->dst_pos_byte = from_byte;
6960         }
6961       else
6962         {
6963           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6964           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6965         }
6966       coding->dst_multibyte
6967         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6968     }
6969   else if (EQ (dst_object, Qt))
6970     {
6971       coding->dst_object = Qnil;
6972       coding->dst_bytes = coding->src_chars;
6973       if (coding->dst_bytes == 0)
6974         coding->dst_bytes = 1;
6975       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6976       coding->dst_multibyte = 0;
6977     }
6978   else
6979     {
6980       coding->dst_object = Qnil;
6981       coding->dst_multibyte = 0;
6982     }
6983
6984   encode_coding (coding);
6985
6986   if (EQ (dst_object, Qt))
6987     {
6988       if (BUFFERP (coding->dst_object))
6989         coding->dst_object = Fbuffer_string ();
6990       else
6991         {
6992           coding->dst_object
6993             = make_unibyte_string ((char *) coding->destination,
6994                                    coding->produced);
6995           xfree (coding->destination);
6996         }
6997     }
6998
6999   if (saved_pt >= 0)
7000     {
7001       /* This is the case of:
7002          (BUFFERP (src_object) && EQ (src_object, dst_object))
7003          As we have moved PT while replacing the original buffer
7004          contents, we must recover it now.  */
7005       set_buffer_internal (XBUFFER (src_object));
7006       if (saved_pt < from)
7007         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7008       else if (saved_pt < from + chars)
7009         TEMP_SET_PT_BOTH (from, from_byte);
7010       else if (! NILP (current_buffer->enable_multibyte_characters))
7011         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7012                           saved_pt_byte + (coding->produced - bytes));
7013       else
7014         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7015                           saved_pt_byte + (coding->produced - bytes));
7016     }
7017
7018   unbind_to (count, Qnil);
7019 }
7020
7021
7022 Lisp_Object
7023 preferred_coding_system ()
7024 {
7025   int id = coding_categories[coding_priorities[0]].id;
7026
7027   return CODING_ID_NAME (id);
7028 }
7029
7030 \f
7031 #ifdef emacs
7032 /*** 11. Emacs Lisp library functions ***/
7033
7034 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7035        doc: /* Return t if OBJECT is nil or a coding-system.
7036 See the documentation of `define-coding-system' for information
7037 about coding-system objects.  */)
7038      (obj)
7039      Lisp_Object obj;
7040 {
7041   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
7042 }
7043
7044 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7045        Sread_non_nil_coding_system, 1, 1, 0,
7046        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7047      (prompt)
7048      Lisp_Object prompt;
7049 {
7050   Lisp_Object val;
7051   do
7052     {
7053       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7054                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7055     }
7056   while (SCHARS (val) == 0);
7057   return (Fintern (val, Qnil));
7058 }
7059
7060 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7061        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7062 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7063      (prompt, default_coding_system)
7064      Lisp_Object prompt, default_coding_system;
7065 {
7066   Lisp_Object val;
7067   if (SYMBOLP (default_coding_system))
7068     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7069   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7070                           Qt, Qnil, Qcoding_system_history,
7071                           default_coding_system, Qnil);
7072   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7073 }
7074
7075 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7076        1, 1, 0,
7077        doc: /* Check validity of CODING-SYSTEM.
7078 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7079 It is valid if it is nil or a symbol defined as a coding system by the
7080 function `define-coding-system'.  */)
7081   (coding_system)
7082      Lisp_Object coding_system;
7083 {
7084   CHECK_SYMBOL (coding_system);
7085   if (!NILP (Fcoding_system_p (coding_system)))
7086     return coding_system;
7087   while (1)
7088     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7089 }
7090
7091 \f
7092 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7093    HIGHEST is nonzero, return the coding system of the highest
7094    priority among the detected coding systems.  Otherwise return a
7095    list of detected coding systems sorted by their priorities.  If
7096    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7097    multibyte form but contains only ASCII and eight-bit chars.
7098    Otherwise, the bytes are raw bytes.
7099
7100    CODING-SYSTEM controls the detection as below:
7101
7102    If it is nil, detect both text-format and eol-format.  If the
7103    text-format part of CODING-SYSTEM is already specified
7104    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7105    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7106    detect only text-format.  */
7107
7108 Lisp_Object
7109 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7110                       coding_system)
7111      const unsigned char *src;
7112      int src_chars, src_bytes, highest;
7113      int multibytep;
7114      Lisp_Object coding_system;
7115 {
7116   const unsigned char *src_end = src + src_bytes;
7117   Lisp_Object attrs, eol_type;
7118   Lisp_Object val;
7119   struct coding_system coding;
7120   int id;
7121   struct coding_detection_info detect_info;
7122   enum coding_category base_category;
7123
7124   if (NILP (coding_system))
7125     coding_system = Qundecided;
7126   setup_coding_system (coding_system, &coding);
7127   attrs = CODING_ID_ATTRS (coding.id);
7128   eol_type = CODING_ID_EOL_TYPE (coding.id);
7129   coding_system = CODING_ATTR_BASE_NAME (attrs);
7130
7131   coding.source = src;
7132   coding.src_chars = src_chars;
7133   coding.src_bytes = src_bytes;
7134   coding.src_multibyte = multibytep;
7135   coding.consumed = 0;
7136   coding.mode |= CODING_MODE_LAST_BLOCK;
7137
7138   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7139
7140   /* At first, detect text-format if necessary.  */
7141   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7142   if (base_category == coding_category_undecided)
7143     {
7144       enum coding_category category;
7145       struct coding_system *this;
7146       int c, i;
7147
7148       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7149       for (i = 0; src < src_end; i++, src++)
7150         {
7151           c = *src;
7152           if (c & 0x80)
7153             break;
7154           if (c < 0x20
7155               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7156               && inhibit_iso_escape_detection)
7157             {
7158               coding.head_ascii = src - coding.source;
7159               if (detect_coding_iso_2022 (&coding, &detect_info))
7160                 {
7161                   /* We have scanned the whole data.  */
7162                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7163                     /* We didn't find an 8-bit code.  */
7164                     src = src_end;
7165                   break;
7166                 }
7167             }
7168         }
7169       coding.head_ascii = src - coding.source;
7170
7171       if (src < src_end
7172           || detect_info.found)
7173         {
7174           if (src == src_end)
7175             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7176             for (i = 0; i < coding_category_raw_text; i++)
7177               {
7178                 category = coding_priorities[i];
7179                 if (detect_info.found & (1 << category))
7180                   break;
7181               }
7182           else
7183             for (i = 0; i < coding_category_raw_text; i++)
7184               {
7185                 category = coding_priorities[i];
7186                 this = coding_categories + category;
7187
7188                 if (this->id < 0)
7189                   {
7190                     /* No coding system of this category is defined.  */
7191                     detect_info.rejected |= (1 << category);
7192                   }
7193                 else if (category >= coding_category_raw_text)
7194                   continue;
7195                 else if (detect_info.checked & (1 << category))
7196                   {
7197                     if (highest
7198                         && (detect_info.found & (1 << category)))
7199                       break;
7200                   }
7201                 else
7202                   {
7203                     if ((*(this->detector)) (&coding, &detect_info)
7204                         && highest
7205                         && (detect_info.found & (1 << category)))
7206                       {
7207                         if (category == coding_category_utf_16_auto)
7208                           {
7209                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7210                               category = coding_category_utf_16_le;
7211                             else
7212                               category = coding_category_utf_16_be;
7213                           }
7214                         break;
7215                       }
7216                   }
7217               }
7218         }
7219
7220       if (detect_info.rejected == CATEGORY_MASK_ANY)
7221         {
7222           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7223           id = coding_categories[coding_category_raw_text].id;
7224           val = Fcons (make_number (id), Qnil);
7225         }
7226       else if (! detect_info.rejected && ! detect_info.found)
7227         {
7228           detect_info.found = CATEGORY_MASK_ANY;
7229           id = coding_categories[coding_category_undecided].id;
7230           val = Fcons (make_number (id), Qnil);
7231         }
7232       else if (highest)
7233         {
7234           if (detect_info.found)
7235             {
7236               detect_info.found = 1 << category;
7237               val = Fcons (make_number (this->id), Qnil);
7238             }
7239           else
7240             for (i = 0; i < coding_category_raw_text; i++)
7241               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7242                 {
7243                   detect_info.found = 1 << coding_priorities[i];
7244                   id = coding_categories[coding_priorities[i]].id;
7245                   val = Fcons (make_number (id), Qnil);
7246                   break;
7247                 }
7248         }
7249       else
7250         {
7251           int mask = detect_info.rejected | detect_info.found;
7252           int found = 0;
7253           val = Qnil;
7254
7255           for (i = coding_category_raw_text - 1; i >= 0; i--)
7256             {
7257               category = coding_priorities[i];
7258               if (! (mask & (1 << category)))
7259                 {
7260                   found |= 1 << category;
7261                   id = coding_categories[category].id;
7262                   val = Fcons (make_number (id), val);
7263                 }
7264             }
7265           for (i = coding_category_raw_text - 1; i >= 0; i--)
7266             {
7267               category = coding_priorities[i];
7268               if (detect_info.found & (1 << category))
7269                 {
7270                   id = coding_categories[category].id;
7271                   val = Fcons (make_number (id), val);
7272                 }
7273             }
7274           detect_info.found |= found;
7275         }
7276     }
7277   else if (base_category == coding_category_utf_16_auto)
7278     {
7279       if (detect_coding_utf_16 (&coding, &detect_info))
7280         {
7281           struct coding_system *this;
7282
7283           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7284             this = coding_categories + coding_category_utf_16_le;
7285           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7286             this = coding_categories + coding_category_utf_16_be;
7287           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7288             this = coding_categories + coding_category_utf_16_be_nosig;
7289           else
7290             this = coding_categories + coding_category_utf_16_le_nosig;
7291           val = Fcons (make_number (this->id), Qnil);
7292         }
7293     }
7294   else
7295     {
7296       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7297       val = Fcons (make_number (coding.id), Qnil);
7298     }
7299
7300   /* Then, detect eol-format if necessary.  */
7301   {
7302     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7303     Lisp_Object tail;
7304
7305     if (VECTORP (eol_type))
7306       {
7307         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7308           normal_eol = detect_eol (coding.source, src_bytes,
7309                                    coding_category_raw_text);
7310         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7311                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7312           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7313                                       coding_category_utf_16_be);
7314         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7315                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7316           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7317                                       coding_category_utf_16_le);
7318       }
7319     else
7320       {
7321         if (EQ (eol_type, Qunix))
7322           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7323         else if (EQ (eol_type, Qdos))
7324           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7325         else
7326           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7327       }
7328
7329     for (tail = val; CONSP (tail); tail = XCDR (tail))
7330       {
7331         enum coding_category category;
7332         int this_eol;
7333
7334         id = XINT (XCAR (tail));
7335         attrs = CODING_ID_ATTRS (id);
7336         category = XINT (CODING_ATTR_CATEGORY (attrs));
7337         eol_type = CODING_ID_EOL_TYPE (id);
7338         if (VECTORP (eol_type))
7339           {
7340             if (category == coding_category_utf_16_be
7341                 || category == coding_category_utf_16_be_nosig)
7342               this_eol = utf_16_be_eol;
7343             else if (category == coding_category_utf_16_le
7344                      || category == coding_category_utf_16_le_nosig)
7345               this_eol = utf_16_le_eol;
7346             else
7347               this_eol = normal_eol;
7348
7349             if (this_eol == EOL_SEEN_LF)
7350               XSETCAR (tail, AREF (eol_type, 0));
7351             else if (this_eol == EOL_SEEN_CRLF)
7352               XSETCAR (tail, AREF (eol_type, 1));
7353             else if (this_eol == EOL_SEEN_CR)
7354               XSETCAR (tail, AREF (eol_type, 2));
7355             else
7356               XSETCAR (tail, CODING_ID_NAME (id));
7357           }
7358         else
7359           XSETCAR (tail, CODING_ID_NAME (id));
7360       }
7361   }
7362
7363   return (highest ? XCAR (val) : val);
7364 }
7365
7366
7367 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7368        2, 3, 0,
7369        doc: /* Detect coding system of the text in the region between START and END.
7370 Return a list of possible coding systems ordered by priority.
7371
7372 If only ASCII characters are found, it returns a list of single element
7373 `undecided' or its subsidiary coding system according to a detected
7374 end-of-line format.
7375
7376 If optional argument HIGHEST is non-nil, return the coding system of
7377 highest priority.  */)
7378      (start, end, highest)
7379      Lisp_Object start, end, highest;
7380 {
7381   int beg, lim, beg_byte, lim_byte;
7382   int multibytep;
7383
7384   CHECK_NUMBER_COERCE_MARKER (start);
7385   CHECK_NUMBER_COERCE_MARKER (end);
7386   validate_region (&start, &end);
7387   beg = XINT (start);
7388   lim = XINT (end);
7389   beg_byte = CHAR_TO_BYTE (beg);
7390   lim_byte = CHAR_TO_BYTE (lim);
7391
7392   /* The region straddles the gap: move the gap to the region's end,
7393      presumably so the text handed to the detector is contiguous.  */
7394   if (beg < GPT && GPT <= lim)
7395     move_gap_both (lim, lim_byte);
7396
7397   multibytep = ! NILP (current_buffer->enable_multibyte_characters);
7398   return detect_coding_system (BYTE_POS_ADDR (beg_byte), lim - beg,
7399                                lim_byte - beg_byte, ! NILP (highest),
7400                                multibytep, Qnil);
7401 }
7402
7403 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7404        1, 2, 0,
7405        doc: /* Detect coding system of the text in STRING.
7406 Return a list of possible coding systems ordered by priority.
7407
7408 If only ASCII characters are found, it returns a list of single element
7409 `undecided' or its subsidiary coding system according to a detected
7410 end-of-line format.
7411
7412 If optional argument HIGHEST is non-nil, return the coding system of
7413 highest priority.  */)
7414      (string, highest)
7415      Lisp_Object string, highest;
7416 {
7417   int want_highest = ! NILP (highest);
7418
7419   CHECK_STRING (string);
7420   return detect_coding_system (SDATA (string), SCHARS (string),
7421                                SBYTES (string), want_highest,
7422                                STRING_MULTIBYTE (string), Qnil);
7423 }
7424
7425
7426 /* Return nonzero if C, translated by the translation table of ATTRS if
7427    any, satisfies CHAR_CHARSET_P for a charset in ATTRS's charset list.  */
7428 static INLINE int
7429 char_encodable_p (c, attrs)
7430      int c;
7431      Lisp_Object attrs;
7432 {
7433   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
7434   Lisp_Object rest;
7435
7436   if (! NILP (table))
7437     c = translate_char (table, c);
7438   for (rest = CODING_ATTR_CHARSET_LIST (attrs); CONSP (rest);
7439        rest = XCDR (rest))
7440     {
7441       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
7442       if (CHAR_CHARSET_P (c, cs))
7443         return 1;
7444     }
7445   return (! NILP (rest));
7446 }
7447
7448
7449 /* Return a list of coding systems that safely encode the text between
7450    START and END.  If EXCLUDE is non-nil, it is a list of coding
7451    systems not to check.  The returned list doesn't contain any such
7452    coding systems.  In any case, if the text contains only ASCII or is
7453    unibyte, return t.  */
7454
7455 DEFUN ("find-coding-systems-region-internal",
7456        Ffind_coding_systems_region_internal,
7457        Sfind_coding_systems_region_internal, 2, 3, 0,
7458        doc: /* Internal use only.  */)
7459      (start, end, exclude)
7460      Lisp_Object start, end, exclude;
7461 {
7462   Lisp_Object coding_attrs_list, safe_codings;
7463   EMACS_INT start_byte, end_byte;
7464   const unsigned char *p, *pbeg, *pend;
7465   int c;
7466   Lisp_Object tail, elt;
7467
7468   if (STRINGP (start))
7469     {
           /* START is a string to scan; END is not looked at.  A unibyte
              string, or one with as many characters as bytes, yields t
              right away.  */
7470       if (!STRING_MULTIBYTE (start)
7471           || SCHARS (start) == SBYTES (start))
7472         return Qt;
7473       start_byte = 0;
7474       end_byte = SBYTES (start);
7475     }
7476   else
7477     {
7478       CHECK_NUMBER_COERCE_MARKER (start);
7479       CHECK_NUMBER_COERCE_MARKER (end);
7480       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7481         args_out_of_range (start, end);
7482       if (NILP (current_buffer->enable_multibyte_characters))
7483         return Qt;
7484       start_byte = CHAR_TO_BYTE (XINT (start));
7485       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many characters as bytes: nothing multibyte to check.  */
7486       if (XINT (end) - XINT (start) == end_byte - start_byte)
7487         return Qt;
7488
           /* The gap lies strictly inside the region: move it to
              whichever end of the region is nearer.  */
7489       if (XINT (start) < GPT && XINT (end) > GPT)
7490         {
7491           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7492             move_gap_both (XINT (start), start_byte);
7493           else
7494             move_gap_both (XINT (end), end_byte);
7495         }
7496     }
7497
       /* Gather the attribute vectors of the candidate coding systems:
          members of Vcoding_system_list that are not in EXCLUDE, whose
          name is their own base name and whose type is not `undecided'.
          Each one's coding_attr_trans_tbl slot is overwritten with the
          value of get_translation_table.  */
7498   coding_attrs_list = Qnil;
7499   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7500     if (NILP (exclude)
7501         || NILP (Fmemq (XCAR (tail), exclude)))
7502       {
7503         Lisp_Object attrs;
7504
7505         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7506         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7507             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7508           {
7509             ASET (attrs, coding_attr_trans_tbl,
7510                   get_translation_table (attrs, 1, NULL));
7511             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7512           }
7513       }
7514
7515   if (STRINGP (start))
7516     p = pbeg = SDATA (start);
7517   else
7518     p = pbeg = BYTE_POS_ADDR (start_byte);
7519   pend = p + (end_byte - start_byte);
7520
       /* Skip the ASCII bytes at both ends of the text.  */
7521   while (p < pend && ASCII_BYTE_P (*p)) p++;
7522   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7523
7524   while (p < pend)
7525     {
7526       if (ASCII_BYTE_P (*p))
7527         p++;
7528       else
7529         {
7530           c = STRING_CHAR_ADVANCE (p);
7531
               /* Drop from the list every coding system for which
                  char_encodable_p rejects C.  */
7532           charset_map_loaded = 0;
7533           for (tail = coding_attrs_list; CONSP (tail);)
7534             {
7535               elt = XCAR (tail);
7536               if (NILP (elt))
7537                 tail = XCDR (tail);
7538               else if (char_encodable_p (c, elt))
7539                 tail = XCDR (tail);
7540               else if (CONSP (XCDR (tail)))
7541                 {
                       /* Delete this cell in place by copying the next
                          cell over it.  TAIL is not advanced, so the
                          element copied in is examined next.  */
7542                   XSETCAR (tail, XCAR (XCDR (tail)));
7543                   XSETCDR (tail, XCDR (XCDR (tail)));
7544                 }
7545               else
7546                 {
                       /* The last cell cannot be deleted that way; blank
                          its car instead.  */
7547                   XSETCAR (tail, Qnil);
7548                   tail = XCDR (tail);
7549                 }
7550             }
               /* NOTE(review): charset_map_loaded was cleared before the
                  checks above; if it is set now, the text may have been
                  relocated (presumably because a charset map was loaded
                  -- confirm), so rebuild P and PEND from their offsets.  */
7551           if (charset_map_loaded)
7552             {
7553               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7554
7555               if (STRINGP (start))
7556                 pbeg = SDATA (start);
7557               else
7558                 pbeg = BYTE_POS_ADDR (start_byte);
7559               p = pbeg + p_offset;
7560               pend = pbeg + pend_offset;
7561             }
7562         }
7563     }
7564
       /* raw-text and no-conversion are always part of the result; add
          the base name of every entry that survived (is non-nil).  */
7565   safe_codings = list2 (Qraw_text, Qno_conversion);
7566   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7567     if (! NILP (XCAR (tail)))
7568       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7569
7570   return safe_codings;
7571 }
7572
7573
7574 DEFUN ("unencodable-char-position", Funencodable_char_position,
7575        Sunencodable_char_position, 3, 5, 0,
7576        doc: /*
7577 Return position of first un-encodable character in a region.
7578 START and END specify the region and CODING-SYSTEM specifies the
7579 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7580
7581 If optional 4th argument COUNT is non-nil, it specifies at most how
7582 many un-encodable characters to search.  In this case, the value is a
7583 list of positions.
7584
7585 If optional 5th argument STRING is non-nil, it is a string to search
7586 for un-encodable characters.  In that case, START and END are indexes
7587 to the string.  */)
7588      (start, end, coding_system, count, string)
7589      Lisp_Object start, end, coding_system, count, string;
7590 {
7591   int n, from, to, ascii_compatible;
7592   struct coding_system coding;
7593   Lisp_Object attrs, charset_list, translation_table, positions;
7594   const unsigned char *p, *stop, *pend;
7595
7596   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7597   attrs = CODING_ID_ATTRS (coding.id);
7598   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7599     return Qnil;
7600   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7601   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7602   translation_table = get_translation_table (attrs, 1, NULL);
7603
7604   if (NILP (string))
7605     {
7606       validate_region (&start, &end);
7607       from = XINT (start);
7608       to = XINT (end);
7609       if (NILP (current_buffer->enable_multibyte_characters)
7610           || (ascii_compatible
7611               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7612         return Qnil;
7613       p = CHAR_POS_ADDR (from);
7614       pend = CHAR_POS_ADDR (to);
7615       /* Scan up to the gap first; the loop below hops over it.  */
7616       if (from < GPT && to >= GPT)
7617         stop = GPT_ADDR;
7618       else
7619         stop = pend;
7620     }
7621   else
7622     {
7623       CHECK_STRING (string);
7624       CHECK_NATNUM (start);
7625       CHECK_NATNUM (end);
7626       from = XINT (start);
7627       to = XINT (end);
7628       if (from > to || to > SCHARS (string))
7629         args_out_of_range_3 (string, start, end);
7630       if (! STRING_MULTIBYTE (string))
7631         return Qnil;
7632       p = SDATA (string) + string_char_to_byte (string, from);
7633       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7634       if (ascii_compatible && (to - from) == (pend - p))
7635         return Qnil;
7636     }
7637
7638   if (NILP (count))
7639     n = 1;
7640   else
7641     {
7642       CHECK_NATNUM (count);
7643       n = XINT (count);
7644     }
7645
7646   positions = Qnil;
7647   while (1)
7648     {
7649       int c;
7650
7651       if (ascii_compatible)
7652         while (p < stop && ASCII_BYTE_P (*p))
7653           p++, from++;
7654       if (p >= stop)
7655         {
7656           if (p >= pend)
7657             break;
7658           /* Hop over the gap and re-run the checks above instead of
7659              falling through: if nothing of the region lies beyond the
7660              gap, P is now at PEND and must not be decoded.  */
7661           stop = pend;
7662           p = GAP_END_ADDR;
7663           continue;
7664         }
7665
7666       c = STRING_CHAR_ADVANCE (p);
7667       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7668           && ! char_charset (translate_char (translation_table, c),
7669                              charset_list, NULL))
7670         {
7671           positions = Fcons (make_number (from), positions);
7672           n--;
7673           if (n == 0)
7674             break;
7675         }
7676       from++;
7677     }
7678
7679   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7680 }
7681
7682
7683 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7684        Scheck_coding_systems_region, 3, 3, 0,
7685        doc: /* Check if the region is encodable by coding systems.
7686
7687 START and END are buffer positions specifying the region.
7688 CODING-SYSTEM-LIST is a list of coding systems to check.
7689
7690 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7691 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7692 whole region, POS0, POS1, ... are buffer positions where non-encodable
7693 characters are found.
7694
7695 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7696 value is nil.
7697
7698 START may be a string.  In that case, check if the string is
7699 encodable, and the value contains indices to the string instead of
7700 buffer positions.  END is ignored.  */)
7701      (start, end, coding_system_list)
7702      Lisp_Object start, end, coding_system_list;
7703 {
7704   Lisp_Object list, tail, elt, attrs;
7705   EMACS_INT start_byte, end_byte;
7706   const unsigned char *p, *pbeg, *pend;
7707   int pos, c;
7708
7709   if (STRINGP (start))
7710     {
7711       /* As in the buffer case below, a unibyte string or one with as
7712          many characters as bytes is reported as fully encodable.  */
7713       if (!STRING_MULTIBYTE (start) || SCHARS (start) == SBYTES (start))
7714         return Qnil;
7715       start_byte = 0;
7716       end_byte = SBYTES (start);
7717       pos = 0;
7718     }
7719   else
7720     {
7721       CHECK_NUMBER_COERCE_MARKER (start);
7722       CHECK_NUMBER_COERCE_MARKER (end);
7723       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7724         args_out_of_range (start, end);
7725       if (NILP (current_buffer->enable_multibyte_characters))
7726         return Qnil;
7727       start_byte = CHAR_TO_BYTE (XINT (start));
7728       end_byte = CHAR_TO_BYTE (XINT (end));
7729       /* As many characters as bytes: nothing to report, so return the
7730          documented "all encodable" value nil (not t).  */
7731       if (XINT (end) - XINT (start) == end_byte - start_byte)
7732         return Qnil;
7733
7734       if (XINT (start) < GPT && XINT (end) > GPT)
7735         {
7736           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7737             move_gap_both (XINT (start), start_byte);
7738           else
7739             move_gap_both (XINT (end), end_byte);
7740         }
7741       pos = XINT (start);
7742     }
7743
7744   /* Build a list of (CODING-SYSTEM ATTRS) entries; the positions of
7745      unencodable characters are pushed after ATTRS during the scan.  */
7746   list = Qnil;
7747   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7748     {
7749       elt = XCAR (tail);
7750       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7751       ASET (attrs, coding_attr_trans_tbl,
7752             get_translation_table (attrs, 1, NULL));
7753       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7754     }
7755
7756   if (STRINGP (start))
7757     p = pbeg = SDATA (start);
7758   else
7759     p = pbeg = BYTE_POS_ADDR (start_byte);
7760   pend = p + (end_byte - start_byte);
7761   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7762   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7763
7764   while (p < pend)
7765     {
7766       if (ASCII_BYTE_P (*p))
7767         p++;
7768       else
7769         {
7770           c = STRING_CHAR_ADVANCE (p);
7771           charset_map_loaded = 0;
7772           for (tail = list; CONSP (tail); tail = XCDR (tail))
7773             {
7774               elt = XCDR (XCAR (tail));
7775               if (! char_encodable_p (c, XCAR (elt)))
7776                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7777             }
7778           if (charset_map_loaded)
7779             {
7780               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7781               if (STRINGP (start))
7782                 pbeg = SDATA (start);
7783               else
7784                 pbeg = BYTE_POS_ADDR (start_byte);
7785               p = pbeg + p_offset;
7786               pend = pbeg + pend_offset;
7787             }
7788         }
7789       pos++;
7790     }
7791
7792   tail = list;
7793   list = Qnil;
7794   for (; CONSP (tail); tail = XCDR (tail))
7795     {
7796       elt = XCAR (tail);
7797       if (CONSP (XCDR (XCDR (elt))))
7798         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7799                       list);
7800     }
7801
7802   return list;
7803 }
7804
7805
7806 /* Encode (ENCODEP nonzero) or decode the text between START and END of
7807    the current buffer with CODING_SYSTEM (nil means `no-conversion').
7808    DST_OBJECT is nil (the current buffer), t, or a buffer.  Unless
7809    NORECORD is nonzero, Vlast_coding_system_used is set afterwards.  */
7810 Lisp_Object
7811 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7812      Lisp_Object start, end, coding_system, dst_object;
7813      int encodep, norecord;
7814 {
7815   struct coding_system coding;
7816   EMACS_INT beg, beg_byte, lim, lim_byte;
7817   Lisp_Object src_object;
7818
7819   CHECK_NUMBER_COERCE_MARKER (start);
7820   CHECK_NUMBER_COERCE_MARKER (end);
7821   if (NILP (coding_system))
7822     coding_system = Qno_conversion;
7823   else
7824     CHECK_CODING_SYSTEM (coding_system);
7825   src_object = Fcurrent_buffer ();
7826   if (NILP (dst_object))
7827     dst_object = src_object;
7828   else if (! EQ (dst_object, Qt))
7829     CHECK_BUFFER (dst_object);
7830
7831   validate_region (&start, &end);
7832   beg = XFASTINT (start), lim = XFASTINT (end);
7833   beg_byte = CHAR_TO_BYTE (beg);
7834   lim_byte = CHAR_TO_BYTE (lim);
7835   setup_coding_system (coding_system, &coding);
7836   coding.mode |= CODING_MODE_LAST_BLOCK;
7837   if (encodep)
7838     encode_coding_object (&coding, src_object, beg, beg_byte, lim, lim_byte,
7839                           dst_object);
7840   else
7841     decode_coding_object (&coding, src_object, beg, beg_byte, lim, lim_byte,
7842                           dst_object);
7843   if (! norecord)
7844     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7845   if (BUFFERP (dst_object))
7846     return make_number (coding.produced_char);
7847   return coding.dst_object;
7848 }
7849
7850
7851 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7852        3, 4, "r\nzCoding system: ",
7853        doc: /* Decode the current region from the specified coding system.
7854 When called from a program, takes four arguments:
7855         START, END, CODING-SYSTEM, and DESTINATION.
7856 START and END are buffer positions.
7857
7858 Optional 4th argument DESTINATION specifies where the decoded text goes.
7859 If nil, the region between START and END is replaced by the decoded text.
7860 If a buffer, the decoded text is inserted in that buffer.
7861 If t, the decoded text is returned.
7862
7863 This function sets `last-coding-system-used' to the precise coding system
7864 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7865 not fully specified.)
7866 Unless DESTINATION is t, it returns the length of the decoded text.  */)
7867      (start, end, coding_system, destination)
7868      Lisp_Object start, end, coding_system, destination;
7869 {
7870   return code_convert_region (start, end, coding_system, destination, 0, 0);
7871 }
7872
7873 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7874        3, 4, "r\nzCoding system: ",
7875        doc: /* Encode the current region by specified coding system.
7876 When called from a program, takes four arguments: START, END,
7877 CODING-SYSTEM, and DESTINATION.  START and END are buffer positions.
7878
7879 Optional 4th argument DESTINATION specifies where the encoded text goes.
7880 If nil, the region between START and END is replaced by the encoded text.
7881 If a buffer, the encoded text is inserted in that buffer.
7882 If t, the encoded text is returned.
7883
7884 This function sets `last-coding-system-used' to the precise coding system
7885 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7886 not fully specified.)
7887 Unless DESTINATION is t, it returns the length of the encoded text.  */)
7888   (start, end, coding_system, destination)
7889      Lisp_Object start, end, coding_system, destination;
7890 {
7891   return code_convert_region (start, end, coding_system, destination, 1, 0);
7892 }
7893
7894 /* Encode (ENCODEP nonzero) or decode STRING with CODING_SYSTEM (nil is
7895    `no-conversion') into DST_OBJECT: nil (same as t), t, or a buffer.
7896    NORECORD nonzero leaves Vlast_coding_system_used untouched.  */
7897 Lisp_Object
7898 code_convert_string (string, coding_system, dst_object,
7899                      encodep, nocopy, norecord)
7900      Lisp_Object string, coding_system, dst_object;
7901      int encodep, nocopy, norecord;
7902 {
7903   struct coding_system coding;
7904   EMACS_INT chars, bytes;
7905
7906   CHECK_STRING (string);
7907   if (NILP (coding_system))
7908     {
7909       if (! norecord)
7910         Vlast_coding_system_used = Qno_conversion;
7911       /* NOCOPY nonzero is what permits returning STRING itself.  */
7912       if (NILP (dst_object))
7913         return (nocopy ? string : Fcopy_sequence (string));
7914       coding_system = Qno_conversion;
7915     }
7916   else
7917     CHECK_CODING_SYSTEM (coding_system);
7918   if (NILP (dst_object))
7919     dst_object = Qt;
7920   else if (! EQ (dst_object, Qt))
7921     CHECK_BUFFER (dst_object);
7922
7923   setup_coding_system (coding_system, &coding);
7924   coding.mode |= CODING_MODE_LAST_BLOCK;
7925   chars = SCHARS (string);
7926   bytes = SBYTES (string);
7927   if (encodep)
7928     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7929   else
7930     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7931   if (! norecord)
7932     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7933   return (BUFFERP (dst_object)
7934           ? make_number (coding.produced_char) : coding.dst_object);
7935 }
7936
7937
7938 /* Encode (ENCODEP nonzero) or decode STRING according to CODING_SYSTEM.
7939    Qt is passed as the destination, NOCOPY as 0 and NORECORD as 1, so
7940    Vlast_coding_system_used is not set.
7941    This function is called only from macros DECODE_FILE and
7942    ENCODE_FILE, thus we ignore character composition.  */
7943
7944 Lisp_Object
7945 code_convert_string_norecord (string, coding_system, encodep)
7946      Lisp_Object string, coding_system;
7947      int encodep;
7948 {
7949   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7950 }
7951
7952
7953 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7954        2, 4, 0,
7955        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7956
7957 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7958 if the decoding operation is trivial.
7959
7960 Optional fourth arg BUFFER non-nil means that the decoded text is
7961 inserted in BUFFER instead of returned as a string.  In this case,
7962 the return value is the length of the decoded text.
7963
7964 This function sets `last-coding-system-used' to the precise coding system
7965 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7966 not fully specified.)  */)
7967   (string, coding_system, nocopy, buffer)
7968      Lisp_Object string, coding_system, nocopy, buffer;
7969 {
7970   return code_convert_string (string, coding_system, buffer,
7971                               0, ! NILP (nocopy), 0);
7972 }
7973
7974 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7975        2, 4, 0,
7976        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7977
7978 Optional third arg NOCOPY non-nil means it is OK to return STRING
7979 itself if the encoding operation is trivial.
7980
7981 Optional fourth arg BUFFER non-nil means that the encoded text is
7982 inserted in BUFFER instead of returned as a string.  In this case,
7983 the return value is the length of the encoded text.
7984
7985 This function sets `last-coding-system-used' to the precise coding system
7986 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7987 not fully specified.)  */)
7988      (string, coding_system, nocopy, buffer)
7989      Lisp_Object string, coding_system, nocopy, buffer;
7990 {
7991   return code_convert_string (string, coding_system, buffer,
7992                               1, ! NILP (nocopy), 0); /* NORECORD off */
7993 }
7994
7995 \f
7996 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7997        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7998 Return the corresponding character.  */)
7999      (code)
8000      Lisp_Object code;
8001 {
8002   Lisp_Object spec, attrs, val;
8003   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8004   int c, sjis_code;
8005
8006   CHECK_NATNUM (code);
8007   c = sjis_code = XFASTINT (code);
8008   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8009   attrs = AREF (spec, 0);
8010
8011   if (ASCII_BYTE_P (c) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8012     return code;
8013
8014   val = CODING_ATTR_CHARSET_LIST (attrs);
8015   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8016   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8017   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8018
8019   if (c <= 0x7F)
8020     charset = charset_roman;
8021   else if (c >= 0xA0 && c <= 0xDF)
8022     {
8023       /* Single-byte katakana; 0xDF is the last code of that range.  */
8024       charset = charset_kana;
8025       c -= 0x80;
8026     }
8027   else
8028     {
8029       int s1 = c >> 8, s2 = c & 0xFF;
8030       /* Report the integer value, not the Lisp_Object CODE itself.  */
8031       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8032           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8033         error ("Invalid code: %d", sjis_code);
8034       SJIS_TO_JIS (c);
8035       charset = charset_kanji;
8036     }
8037   c = DECODE_CHAR (charset, c);
8038   if (c < 0)
8039     error ("Invalid code: %d", sjis_code);
8040   return make_number (c);
8041 }
8042
8043
8044 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8045        doc: /* Encode a Japanese character CH to shift_jis encoding.
8046 Return the corresponding code in SJIS.  */)
8047      (ch)
8048      Lisp_Object ch;
8049 {
8050   Lisp_Object spec, attrs, charset_list;
8051   int c;
8052   struct charset *charset;
8053   unsigned code;
8054
8055   CHECK_CHARACTER (ch);
8056   c = XFASTINT (ch);
8057   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8058   attrs = AREF (spec, 0);
8059
8060   if (ASCII_CHAR_P (c) && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8061     return ch;
8062
8063   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8064   charset = char_charset (c, charset_list, &code);
8065   /* char_charset's result is tested for null elsewhere in this file,
8066      so check it before handing CHARSET to CHARSET_INVALID_CODE.  */
8067   if (! charset || code == CHARSET_INVALID_CODE (charset))
8068     error ("Can't encode by shift_jis encoding: %d", c);
8069   JIS_TO_SJIS (code);
8070   return make_number (code);
8071 }
8072
8073 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8074        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8075 Return the corresponding character.  */)
8076      (code)
8077      Lisp_Object code;
8078 {
8079   Lisp_Object spec, attrs, val;
8080   struct charset *charset_roman, *charset_big5, *charset;
8081   int c;
8082
8083   CHECK_NATNUM (code);
8084   c = XFASTINT (code);
8085   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8086   attrs = AREF (spec, 0);
8087
8088   if (ASCII_BYTE_P (c)
8089       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8090     return code;
8091
8092   val = CODING_ATTR_CHARSET_LIST (attrs);
8093   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8094   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8095
8096   if (c <= 0x7F)
8097     charset = charset_roman;
8098   else
8099     {
8100       int b1 = c >> 8, b2 = c & 0x7F;
8101       if (b1 < 0xA1 || b1 > 0xFE
8102           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8103         error ("Invalid code: %d", code);
8104       charset = charset_big5;
8105     }
8106   c = DECODE_CHAR (charset, (unsigned )c);
8107   if (c < 0)
8108     error ("Invalid code: %d", code);
8109   return make_number (c);
8110 }
8111
8112 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8113        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8114 Return the corresponding character code in Big5.  */)
8115      (ch)
8116      Lisp_Object ch;
8117 {
8118   Lisp_Object spec, attrs, charset_list;
8119   struct charset *charset;
8120   int c;
8121   unsigned code;
8122
8123   CHECK_CHARACTER (ch);
8124   c = XFASTINT (ch);
8125   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8126   attrs = AREF (spec, 0);
8127   if (ASCII_CHAR_P (c)
8128       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8129     return ch;
8130
8131   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8132   charset = char_charset (c, charset_list, &code);
8133   if (code == CHARSET_INVALID_CODE (charset))
8134     error ("Can't encode by Big5 encoding: %d", c);
8135
8136   return make_number (code);
8137 }
8138
8139 \f
8140 DEFUN ("set-terminal-coding-system-internal",
8141        Fset_terminal_coding_system_internal,
8142        Sset_terminal_coding_system_internal, 1, 1, 0,
8143        doc: /* Internal use only.  */)
8144      (coding_system)
8145      Lisp_Object coding_system;
8146 {
8147   CHECK_SYMBOL (coding_system);
8148   setup_coding_system (Fcheck_coding_system (coding_system),
8149                         &terminal_coding);
8150
8151   /* We had better not send unsafe characters to terminal.  */
8152   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8153   /* Characer composition should be disabled.  */
8154   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8155   terminal_coding.src_multibyte = 1;
8156   terminal_coding.dst_multibyte = 0;
8157   return Qnil;
8158 }
8159
8160 DEFUN ("set-safe-terminal-coding-system-internal",
8161        Fset_safe_terminal_coding_system_internal,
8162        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8163        doc: /* Internal use only.  */)
8164      (coding_system)
8165      Lisp_Object coding_system;
8166 {
8167   CHECK_SYMBOL (coding_system);
8168   setup_coding_system (Fcheck_coding_system (coding_system),
8169                        &safe_terminal_coding);
8170   /* Characer composition should be disabled.  */
8171   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8172   safe_terminal_coding.src_multibyte = 1;
8173   safe_terminal_coding.dst_multibyte = 0;
8174   return Qnil;
8175 }
8176
8177 DEFUN ("terminal-coding-system",
8178        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8179        doc: /* Return coding system specified for terminal output.  */)
8180      ()
8181 {
8182   Lisp_Object coding_system;
8183
8184   coding_system = CODING_ID_NAME (terminal_coding.id);
8185   /* For backward compatibility, return nil if it is `undecided'. */
8186   return (coding_system != Qundecided ? coding_system : Qnil);
8187 }
8188
8189 DEFUN ("set-keyboard-coding-system-internal",
8190        Fset_keyboard_coding_system_internal,
8191        Sset_keyboard_coding_system_internal, 1, 1, 0,
8192        doc: /* Internal use only.  */)
8193      (coding_system)
8194      Lisp_Object coding_system;
8195 {
8196   CHECK_SYMBOL (coding_system);
8197   setup_coding_system (Fcheck_coding_system (coding_system),
8198                        &keyboard_coding);
8199   /* Characer composition should be disabled.  */
8200   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8201   return Qnil;
8202 }
8203
8204 DEFUN ("keyboard-coding-system",
8205        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8206        doc: /* Return coding system specified for decoding keyboard input.  */)
8207      ()
8208 {
8209   return CODING_ID_NAME (keyboard_coding.id);
8210 }
8211
8212 \f
8213 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8214        Sfind_operation_coding_system,  1, MANY, 0,
8215        doc: /* Choose a coding system for an operation based on the target name.
8216 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8217 DECODING-SYSTEM is the coding system to use for decoding
8218 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8219 for encoding (in case OPERATION does encoding).
8220
8221 The first argument OPERATION specifies an I/O primitive:
8222   For file I/O, `insert-file-contents' or `write-region'.
8223   For process I/O, `call-process', `call-process-region', or `start-process'.
8224   For network I/O, `open-network-stream'.
8225
8226 The remaining arguments should be the same arguments that were passed
8227 to the primitive.  Depending on which primitive, one of those arguments
8228 is selected as the TARGET.  For example, if OPERATION does file I/O,
8229 whichever argument specifies the file name is TARGET.
8230
8231 TARGET has a meaning which depends on OPERATION:
8232   For file I/O, TARGET is a file name.
8233   For process I/O, TARGET is a process name.
8234   For network I/O, TARGET is a service name or a port number
8235
8236 This function looks up what specified for TARGET in,
8237 `file-coding-system-alist', `process-coding-system-alist',
8238 or `network-coding-system-alist' depending on OPERATION.
8239 They may specify a coding system, a cons of coding systems,
8240 or a function symbol to call.
8241 In the last case, we call the function with one argument,
8242 which is a list of all the arguments given to this function.
8243
8244 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8245      (nargs, args)
8246      int nargs;
8247      Lisp_Object *args;
8248 {
8249   Lisp_Object operation, target_idx, target, val;
8250   register Lisp_Object chain;
8251
8252   if (nargs < 2)
8253     error ("Too few arguments");
8254   operation = args[0];
8255   if (!SYMBOLP (operation)
8256       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8257     error ("Invalid first arguement");
8258   if (nargs < 1 + XINT (target_idx))
8259     error ("Too few arguments for operation: %s",
8260            SDATA (SYMBOL_NAME (operation)));
8261   target = args[XINT (target_idx) + 1];
8262   if (!(STRINGP (target)
8263         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8264     error ("Invalid %dth argument", XINT (target_idx) + 1);
8265
8266   chain = ((EQ (operation, Qinsert_file_contents)
8267             || EQ (operation, Qwrite_region))
8268            ? Vfile_coding_system_alist
8269            : (EQ (operation, Qopen_network_stream)
8270               ? Vnetwork_coding_system_alist
8271               : Vprocess_coding_system_alist));
8272   if (NILP (chain))
8273     return Qnil;
8274
8275   for (; CONSP (chain); chain = XCDR (chain))
8276     {
8277       Lisp_Object elt;
8278
8279       elt = XCAR (chain);
8280       if (CONSP (elt)
8281           && ((STRINGP (target)
8282                && STRINGP (XCAR (elt))
8283                && fast_string_match (XCAR (elt), target) >= 0)
8284               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8285         {
8286           val = XCDR (elt);
8287           /* Here, if VAL is both a valid coding system and a valid
8288              function symbol, we return VAL as a coding system.  */
8289           if (CONSP (val))
8290             return val;
8291           if (! SYMBOLP (val))
8292             return Qnil;
8293           if (! NILP (Fcoding_system_p (val)))
8294             return Fcons (val, val);
8295           if (! NILP (Ffboundp (val)))
8296             {
8297               val = call1 (val, Flist (nargs, args));
8298               if (CONSP (val))
8299                 return val;
8300               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8301                 return Fcons (val, val);
8302             }
8303           return Qnil;
8304         }
8305     }
8306   return Qnil;
8307 }
8308
8309 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8310        Sset_coding_system_priority, 0, MANY, 0,
8311        doc: /* Assign higher priority to the coding systems given as arguments.
8312 If multiple coding systems belongs to the same category,
8313 all but the first one are ignored.
8314
8315 usage: (set-coding-system-priority ...)  */)
8316      (nargs, args)
8317      int nargs;
8318      Lisp_Object *args;
8319 {
8320   int i, j;
8321   int changed[coding_category_max];
8322   enum coding_category priorities[coding_category_max];
8323
8324   bzero (changed, sizeof changed);
8325
8326   for (i = j = 0; i < nargs; i++)
8327     {
8328       enum coding_category category;
8329       Lisp_Object spec, attrs;
8330
8331       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8332       attrs = AREF (spec, 0);
8333       category = XINT (CODING_ATTR_CATEGORY (attrs));
8334       if (changed[category])
8335         /* Ignore this coding system because a coding system of the
8336            same category already had a higher priority.  */
8337         continue;
8338       changed[category] = 1;
8339       priorities[j++] = category;
8340       if (coding_categories[category].id >= 0
8341           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8342         setup_coding_system (args[i], &coding_categories[category]);
8343       Fset (AREF (Vcoding_category_table, category), args[i]);
8344     }
8345
8346   /* Now we have decided top J priorities.  Reflect the order of the
8347      original priorities to the remaining priorities.  */
8348
8349   for (i = j, j = 0; i < coding_category_max; i++, j++)
8350     {
8351       while (j < coding_category_max
8352              && changed[coding_priorities[j]])
8353         j++;
8354       if (j == coding_category_max)
8355         abort ();
8356       priorities[i] = coding_priorities[j];
8357     }
8358
8359   bcopy (priorities, coding_priorities, sizeof priorities);
8360
8361   /* Update `coding-category-list'.  */
8362   Vcoding_category_list = Qnil;
8363   for (i = coding_category_max - 1; i >= 0; i--)
8364     Vcoding_category_list
8365       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8366                Vcoding_category_list);
8367
8368   return Qnil;
8369 }
8370
8371 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8372        Scoding_system_priority_list, 0, 1, 0,
8373        doc: /* Return a list of coding systems ordered by their priorities.
8374 HIGHESTP non-nil means just return the highest priority one.  */)
8375      (highestp)
8376      Lisp_Object highestp;
8377 {
8378   int i;
8379   Lisp_Object val;
8380
8381   for (i = 0, val = Qnil; i < coding_category_max; i++)
8382     {
8383       enum coding_category category = coding_priorities[i];
8384       int id = coding_categories[category].id;
8385       Lisp_Object attrs;
8386
8387       if (id < 0)
8388         continue;
8389       attrs = CODING_ID_ATTRS (id);
8390       if (! NILP (highestp))
8391         return CODING_ATTR_BASE_NAME (attrs);
8392       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8393     }
8394   return Fnreverse (val);
8395 }
8396
8397 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8398
8399 static Lisp_Object
8400 make_subsidiaries (base)
8401      Lisp_Object base;
8402 {
8403   Lisp_Object subsidiaries;
8404   int base_name_len = SBYTES (SYMBOL_NAME (base));
8405   char *buf = (char *) alloca (base_name_len + 6);
8406   int i;
8407
8408   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8409   subsidiaries = Fmake_vector (make_number (3), Qnil);
8410   for (i = 0; i < 3; i++)
8411     {
8412       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8413       ASET (subsidiaries, i, intern (buf));
8414     }
8415   return subsidiaries;
8416 }
8417
8418
/* Lisp primitive `define-coding-system-internal'.

   ARGS is indexed by the coding_arg_* constants.  The function fills
   a fresh attribute vector from the common arguments, handles the
   extra arguments that depend on the coding type (Qcharset, Qccl,
   Qutf_16, Qiso_2022, Qemacs_mule, Qshift_jis, Qbig5, Qraw_text,
   Qutf_8 or Qundecided), and registers NAME in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.  When no eol-type is given, the three
   subsidiary coding systems made by make_subsidiaries are registered
   as well.  Returns Qnil; signals Qwrong_number_of_arguments when
   NARGS is too small for the coding type.  */
8419 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8420        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8421        doc: /* For internal use only.
8422 usage: (define-coding-system-internal ...)  */)
8423      (nargs, args)
8424      int nargs;
8425      Lisp_Object *args;
8426 {
8427   Lisp_Object name;
8428   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
8429   Lisp_Object attrs;            /* Vector of attributes.  */
8430   Lisp_Object eol_type;
8431   Lisp_Object aliases;
8432   Lisp_Object coding_type, charset_list, safe_charsets;
8433   enum coding_category category;
8434   Lisp_Object tail, val;
8435   int max_charset_id = 0;
8436   int i;
8437
       /* Every coding type needs at least the common arguments.  */
8438   if (nargs < coding_arg_max)
8439     goto short_args;
8440
8441   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8442
8443   name = args[coding_arg_name];
8444   CHECK_SYMBOL (name);
8445   CODING_ATTR_BASE_NAME (attrs) = name;
8446
       /* The mnemonic is either a string or a character.  */
8447   val = args[coding_arg_mnemonic];
8448   if (! STRINGP (val))
8449     CHECK_CHARACTER (val);
8450   CODING_ATTR_MNEMONIC (attrs) = val;
8451
8452   coding_type = args[coding_arg_coding_type];
8453   CHECK_SYMBOL (coding_type);
8454   CODING_ATTR_TYPE (attrs) = coding_type;
8455
8456   charset_list = args[coding_arg_charset_list];
       /* A symbol stands for a predefined charset list and must agree
          with the coding type.  In both branches MAX_CHARSET_ID ends
          up as the largest charset ID in the list.  */
8457   if (SYMBOLP (charset_list))
8458     {
8459       if (EQ (charset_list, Qiso_2022))
8460         {
8461           if (! EQ (coding_type, Qiso_2022))
8462             error ("Invalid charset-list");
8463           charset_list = Viso_2022_charset_list;
8464         }
8465       else if (EQ (charset_list, Qemacs_mule))
8466         {
8467           if (! EQ (coding_type, Qemacs_mule))
8468             error ("Invalid charset-list");
8469           charset_list = Vemacs_mule_charset_list;
8470         }
8471       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8472         if (max_charset_id < XFASTINT (XCAR (tail)))
8473           max_charset_id = XFASTINT (XCAR (tail));
8474     }
8475   else
8476     {
           /* Work on a copy; each element is checked as a charset and
              then replaced by that charset's ID.  */
8477       charset_list = Fcopy_sequence (charset_list);
8478       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8479         {
8480           struct charset *charset;
8481
8482           val = Fcar (tail);
8483           CHECK_CHARSET_GET_CHARSET (val, charset);
8484           if (EQ (coding_type, Qiso_2022)
8485               ? CHARSET_ISO_FINAL (charset) < 0
8486               : EQ (coding_type, Qemacs_mule)
8487               ? CHARSET_EMACS_MULE_ID (charset) < 0
8488               : 0)
8489             error ("Can't handle charset `%s'",
8490                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8491
8492           XSETCAR (tail, make_number (charset->id));
8493           if (max_charset_id < charset->id)
8494             max_charset_id = charset->id;
8495         }
8496     }
8497   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8498
       /* SAFE_CHARSETS has one byte per charset ID up to the maximum:
          0 for a charset in the list, 255 for any other.  */
8499   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8500                                 make_number (255));
8501   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8502     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8503   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8504
8505   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8506
       /* A translation table is a char-table, a cons, or a symbol.  */
8507   val = args[coding_arg_decode_translation_table];
8508   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8509     CHECK_SYMBOL (val);
8510   CODING_ATTR_DECODE_TBL (attrs) = val;
8511
8512   val = args[coding_arg_encode_translation_table];
8513   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8514     CHECK_SYMBOL (val);
8515   CODING_ATTR_ENCODE_TBL (attrs) = val;
8516
8517   val = args[coding_arg_post_read_conversion];
8518   CHECK_SYMBOL (val);
8519   CODING_ATTR_POST_READ (attrs) = val;
8520
8521   val = args[coding_arg_pre_write_conversion];
8522   CHECK_SYMBOL (val);
8523   CODING_ATTR_PRE_WRITE (attrs) = val;
8524
       /* A nil default character means a space.  */
8525   val = args[coding_arg_default_char];
8526   if (NILP (val))
8527     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8528   else
8529     {
8530       CHECK_CHARACTER (val);
8531       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8532     }
8533
       /* Normalize the for-unibyte flag to Qnil or Qt.  */
8534   val = args[coding_arg_for_unibyte];
8535   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8536
8537   val = args[coding_arg_plist];
8538   CHECK_LIST (val);
8539   CODING_ATTR_PLIST (attrs) = val;
8540
       /* From here on, handle what is specific to each coding type and
          decide CATEGORY.  */
8541   if (EQ (coding_type, Qcharset))
8542     {
8543       /* Generate a lisp vector of 256 elements.  Each element is nil,
8544          integer, or a list of charset IDs.
8545
8546          If Nth element is nil, the byte code N is invalid in this
8547          coding system.
8548
8549          If Nth element is a number NUM, N is the first byte of a
8550          charset whose ID is NUM.
8551
8552          If Nth element is a list of charset IDs, N is the first byte
8553          of one of them.  The list is sorted by dimensions of the
8554          charsets.  A charset of smaller dimension comes first.  */
8555       val = Fmake_vector (make_number (256), Qnil);
8556
8557       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8558         {
8559           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8560           int dim = CHARSET_DIMENSION (charset);
               /* Index into code_space of the entry used for the byte
                  range looped over below.  */
8561           int idx = (dim - 1) * 4;
8562
8563           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8564             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8565
8566           for (i = charset->code_space[idx];
8567                i <= charset->code_space[idx + 1]; i++)
8568             {
8569               Lisp_Object tmp, tmp2;
8570               int dim2;
8571
8572               tmp = AREF (val, i);
8573               if (NILP (tmp))
8574                 tmp = XCAR (tail);
8575               else if (NUMBERP (tmp))
8576                 {
                       /* A second charset shares this byte: turn the
                          element into a two-element list ordered by
                          dimension.  */
8577                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8578                   if (dim < dim2)
8579                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8580                   else
8581                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8582                 }
8583               else
8584                 {
                       /* Find the first list element of a larger
                          dimension and insert before it, or append
                          when there is none.  */
8585                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8586                     {
8587                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8588                       if (dim < dim2)
8589                         break;
8590                     }
8591                   if (NILP (tmp2))
8592                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8593                   else
8594                     {
8595                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8596                       XSETCAR (tmp2, XCAR (tail));
8597                     }
8598                 }
8599               ASET (val, i, tmp);
8600             }
8601         }
8602       ASET (attrs, coding_attr_charset_valids, val);
8603       category = coding_category_charset;
8604     }
8605   else if (EQ (coding_type, Qccl))
8606     {
8607       Lisp_Object valids;
8608
8609       if (nargs < coding_arg_ccl_max)
8610         goto short_args;
8611
           /* A CCL program given as a vector is stored as a copy.  */
8612       val = args[coding_arg_ccl_decoder];
8613       CHECK_CCL_PROGRAM (val);
8614       if (VECTORP (val))
8615         val = Fcopy_sequence (val);
8616       ASET (attrs, coding_attr_ccl_decoder, val);
8617
8618       val = args[coding_arg_ccl_encoder];
8619       CHECK_CCL_PROGRAM (val);
8620       if (VECTORP (val))
8621         val = Fcopy_sequence (val);
8622       ASET (attrs, coding_attr_ccl_encoder, val);
8623
           /* Each element of the valids argument is a byte value or a
              cons (FROM . TO) naming an inclusive range within
              0..255; the corresponding bytes of VALIDS are set to 1.  */
8624       val = args[coding_arg_ccl_valids];
8625       valids = Fmake_string (make_number (256), make_number (0));
8626       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8627         {
8628           int from, to;
8629
8630           val = Fcar (tail);
8631           if (INTEGERP (val))
8632             {
8633               from = to = XINT (val);
8634               if (from < 0 || from > 255)
8635                 args_out_of_range_3 (val, make_number (0), make_number (255));
8636             }
8637           else
8638             {
8639               CHECK_CONS (val);
8640               CHECK_NATNUM_CAR (val);
8641               CHECK_NATNUM_CDR (val);
8642               from = XINT (XCAR (val));
8643               if (from > 255)
8644                 args_out_of_range_3 (XCAR (val),
8645                                      make_number (0), make_number (255));
8646               to = XINT (XCDR (val));
8647               if (to < from || to > 255)
8648                 args_out_of_range_3 (XCDR (val),
8649                                      XCAR (val), make_number (255));
8650             }
8651           for (i = from; i <= to; i++)
8652             SSET (valids, i, 1);
8653         }
8654       ASET (attrs, coding_attr_ccl_valids, valids);
8655
8656       category = coding_category_ccl;
8657     }
8658   else if (EQ (coding_type, Qutf_16))
8659     {
8660       Lisp_Object bom, endian;
8661
8662       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8663
8664       if (nargs < coding_arg_utf16_max)
8665         goto short_args;
8666
           /* BOM is nil, t, or a cons of two coding systems.  */
8667       bom = args[coding_arg_utf16_bom];
8668       if (! NILP (bom) && ! EQ (bom, Qt))
8669         {
8670           CHECK_CONS (bom);
8671           val = XCAR (bom);
8672           CHECK_CODING_SYSTEM (val);
8673           val = XCDR (bom);
8674           CHECK_CODING_SYSTEM (val);
8675         }
8676       ASET (attrs, coding_attr_utf_16_bom, bom);
8677
           /* A nil endian defaults to Qbig.  */
8678       endian = args[coding_arg_utf16_endian];
8679       CHECK_SYMBOL (endian);
8680       if (NILP (endian))
8681         endian = Qbig;
8682       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8683         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8684       ASET (attrs, coding_attr_utf_16_endian, endian);
8685
           /* A cons BOM selects the auto category; otherwise the
              category depends on whether BOM is nil and on ENDIAN.  */
8686       category = (CONSP (bom)
8687                   ? coding_category_utf_16_auto
8688                   : NILP (bom)
8689                   ? (EQ (endian, Qbig)
8690                      ? coding_category_utf_16_be_nosig
8691                      : coding_category_utf_16_le_nosig)
8692                   : (EQ (endian, Qbig)
8693                      ? coding_category_utf_16_be
8694                      : coding_category_utf_16_le));
8695     }
8696   else if (EQ (coding_type, Qiso_2022))
8697     {
8698       Lisp_Object initial, reg_usage, request, flags;
           /* NOTE: this I shadows the function-level I; after the
              loop below it holds the integer value of FLAGS.  */
8699       int i;
8700
8701       if (nargs < coding_arg_iso2022_max)
8702         goto short_args;
8703
           /* In a copy of the initial vector, replace each of the
              first four elements by a charset ID, or by -1 for nil.  */
8704       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8705       CHECK_VECTOR (initial);
8706       for (i = 0; i < 4; i++)
8707         {
8708           val = Faref (initial, make_number (i));
8709           if (! NILP (val))
8710             {
8711               struct charset *charset;
8712
8713               CHECK_CHARSET_GET_CHARSET (val, charset);
8714               ASET (initial, i, make_number (CHARSET_ID (charset)));
8715               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8716                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8717             }
8718           else
8719             ASET (initial, i, make_number (-1));
8720         }
8721
8722       reg_usage = args[coding_arg_iso2022_reg_usage];
8723       CHECK_CONS (reg_usage);
8724       CHECK_NUMBER_CAR (reg_usage);
8725       CHECK_NUMBER_CDR (reg_usage);
8726
           /* Each request element is a cons (CHARSET . REG) with REG
              below 4.  Only the list spine is copied, so the car of
              each element cons is replaced by the charset ID in
              place.  */
8727       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8728       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8729         {
8730           int id;
8731           Lisp_Object tmp;
8732
8733           val = Fcar (tail);
8734           CHECK_CONS (val);
8735           tmp = XCAR (val);
8736           CHECK_CHARSET_GET_ID (tmp, id);
8737           CHECK_NATNUM_CDR (val);
8738           if (XINT (XCDR (val)) >= 4)
8739             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8740           XSETCAR (val, make_number (id));
8741         }
8742
           /* The stored flags gain CODING_ISO_FLAG_FULL_SUPPORT when
              the charset-list argument was the symbol Qiso_2022; I
              keeps the flags as passed.  */
8743       flags = args[coding_arg_iso2022_flags];
8744       CHECK_NATNUM (flags);
8745       i = XINT (flags);
8746       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8747         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8748
8749       ASET (attrs, coding_attr_iso_initial, initial);
8750       ASET (attrs, coding_attr_iso_usage, reg_usage);
8751       ASET (attrs, coding_attr_iso_request, request);
8752       ASET (attrs, coding_attr_iso_flags, flags);
8753       setup_iso_safe_charsets (attrs);
8754
8755       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8756         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8757                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8758                     ? coding_category_iso_7_else
8759                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8760                     ? coding_category_iso_7
8761                     : coding_category_iso_7_tight);
8762       else
8763         {
               /* Charset ID stored in element 1 of INITIAL; -1 when
                  that element was nil.  */
8764           int id = XINT (AREF (initial, 1));
8765
8766           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8767                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8768                        || id < 0)
8769                       ? coding_category_iso_8_else
8770                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8771                       ? coding_category_iso_8_1
8772                       : coding_category_iso_8_2);
8773         }
           /* Only the iso_8_1 and iso_8_2 categories keep the ASCII
              compatible attribute.  */
8774       if (category != coding_category_iso_8_1
8775           && category != coding_category_iso_8_2)
8776         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8777     }
8778   else if (EQ (coding_type, Qemacs_mule))
8779     {
8780       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8781         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8782       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8783       category = coding_category_emacs_mule;
8784     }
8785   else if (EQ (coding_type, Qshift_jis))
8786     {
8787
8788       struct charset *charset;
8789
           /* Expect charsets of dimension 1, 1 and 2, optionally
              followed by a fourth of dimension 2.  CHARSET_LIST is
              advanced locally; the list stored in ATTRS is not
              changed by this.  */
8790       if (XINT (Flength (charset_list)) != 3
8791           && XINT (Flength (charset_list)) != 4)
8792         error ("There should be three or four charsets");
8793
8794       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8795       if (CHARSET_DIMENSION (charset) != 1)
8796         error ("Dimension of charset %s is not one",
8797                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8798       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8799         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8800
8801       charset_list = XCDR (charset_list);
8802       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8803       if (CHARSET_DIMENSION (charset) != 1)
8804         error ("Dimension of charset %s is not one",
8805                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8806
8807       charset_list = XCDR (charset_list);
8808       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8809       if (CHARSET_DIMENSION (charset) != 2)
8810         error ("Dimension of charset %s is not two",
8811                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8812
8813       charset_list = XCDR (charset_list);
8814       if (! NILP (charset_list))
8815         {
8816           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8817           if (CHARSET_DIMENSION (charset) != 2)
8818             error ("Dimension of charset %s is not two",
8819                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8820         }
8821
           /* The coding system defined last with this type becomes
              Vsjis_coding_system.  */
8822       category = coding_category_sjis;
8823       Vsjis_coding_system = name;
8824     }
8825   else if (EQ (coding_type, Qbig5))
8826     {
8827       struct charset *charset;
8828
           /* Expect exactly two charsets, of dimension 1 and 2.  */
8829       if (XINT (Flength (charset_list)) != 2)
8830         error ("There should be just two charsets");
8831
8832       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8833       if (CHARSET_DIMENSION (charset) != 1)
8834         error ("Dimension of charset %s is not one",
8835                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8836       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8837         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8838
8839       charset_list = XCDR (charset_list);
8840       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8841       if (CHARSET_DIMENSION (charset) != 2)
8842         error ("Dimension of charset %s is not two",
8843                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8844
           /* The coding system defined last with this type becomes
              Vbig5_coding_system.  */
8845       category = coding_category_big5;
8846       Vbig5_coding_system = name;
8847     }
8848   else if (EQ (coding_type, Qraw_text))
8849     {
8850       category = coding_category_raw_text;
8851       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8852     }
8853   else if (EQ (coding_type, Qutf_8))
8854     {
8855       category = coding_category_utf_8;
8856       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8857     }
8858   else if (EQ (coding_type, Qundecided))
8859     category = coding_category_undecided;
8860   else
8861     error ("Invalid coding system type: %s",
8862            SDATA (SYMBOL_NAME (coding_type)));
8863
       /* Record the category, and put it in front of the property
          list under QCcategory.  */
8864   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8865   CODING_ATTR_PLIST (attrs)
8866     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8867                                 CODING_ATTR_PLIST (attrs)));
8868
8869   eol_type = args[coding_arg_eol_type];
8870   if (! NILP (eol_type)
8871       && ! EQ (eol_type, Qunix)
8872       && ! EQ (eol_type, Qdos)
8873       && ! EQ (eol_type, Qmac))
8874     error ("Invalid eol-type");
8875
8876   aliases = Fcons (name, Qnil);
8877
       /* With no eol-type, EOL_TYPE becomes the vector of three
          subsidiary names, and each subsidiary is registered with a
          spec that shares ATTRS with the base coding system.  */
8878   if (NILP (eol_type))
8879     {
8880       eol_type = make_subsidiaries (name);
8881       for (i = 0; i < 3; i++)
8882         {
8883           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8884
8885           this_name = AREF (eol_type, i);
8886           this_aliases = Fcons (this_name, Qnil);
8887           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8888           this_spec = Fmake_vector (make_number (3), attrs);
8889           ASET (this_spec, 1, this_aliases);
8890           ASET (this_spec, 2, this_eol_type);
8891           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8892           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8893           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8894                                         Vcoding_system_alist);
8895         }
8896     }
8897
       /* Fmake_vector fills all three slots with ATTRS; slots 1 and 2
          are then overwritten with the aliases and the eol-type.  */
8898   spec_vec = Fmake_vector (make_number (3), attrs);
8899   ASET (spec_vec, 1, aliases);
8900   ASET (spec_vec, 2, eol_type);
8901
8902   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8903   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8904   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8905                                 Vcoding_system_alist);
8906
       /* Set up coding_categories[category] from NAME when it is not
          set up yet (negative id) or is already named NAME.  */
8907   {
8908     int id = coding_categories[category].id;
8909
8910     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8911       setup_coding_system (name, &coding_categories[category]);
8912   }
8913
8914   return Qnil;
8915
       /* Reached when NARGS is smaller than the coding type needs.  */
8916  short_args:
8917   return Fsignal (Qwrong_number_of_arguments,
8918                   Fcons (intern ("define-coding-system-internal"),
8919                          make_number (nargs)));
8920 }
8921
8922
8923 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8924        3, 3, 0,
8925        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8926   (coding_system, prop, val)
8927      Lisp_Object coding_system, prop, val;
8928 {
8929   Lisp_Object spec, attrs;
8930
8931   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8932   attrs = AREF (spec, 0);
8933   if (EQ (prop, QCmnemonic))
8934     {
8935       if (! STRINGP (val))
8936         CHECK_CHARACTER (val);
8937       CODING_ATTR_MNEMONIC (attrs) = val;
8938     }
8939   else if (EQ (prop, QCdefalut_char))
8940     {
8941       if (NILP (val))
8942         val = make_number (' ');
8943       else
8944         CHECK_CHARACTER (val);
8945       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8946     }
8947   else if (EQ (prop, QCdecode_translation_table))
8948     {
8949       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8950         CHECK_SYMBOL (val);
8951       CODING_ATTR_DECODE_TBL (attrs) = val;
8952     }
8953   else if (EQ (prop, QCencode_translation_table))
8954     {
8955       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8956         CHECK_SYMBOL (val);
8957       CODING_ATTR_ENCODE_TBL (attrs) = val;
8958     }
8959   else if (EQ (prop, QCpost_read_conversion))
8960     {
8961       CHECK_SYMBOL (val);
8962       CODING_ATTR_POST_READ (attrs) = val;
8963     }
8964   else if (EQ (prop, QCpre_write_conversion))
8965     {
8966       CHECK_SYMBOL (val);
8967       CODING_ATTR_PRE_WRITE (attrs) = val;
8968     }
8969
8970   CODING_ATTR_PLIST (attrs)
8971     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8972   return val;
8973 }
8974
8975
8976 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8977        Sdefine_coding_system_alias, 2, 2, 0,
8978        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8979      (alias, coding_system)
8980      Lisp_Object alias, coding_system;
8981 {
8982   Lisp_Object spec, aliases, eol_type;
8983
8984   CHECK_SYMBOL (alias);
8985   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8986   aliases = AREF (spec, 1);
8987   /* ALISES should be a list of length more than zero, and the first
8988      element is a base coding system.  Append ALIAS at the tail of the
8989      list.  */
8990   while (!NILP (XCDR (aliases)))
8991     aliases = XCDR (aliases);
8992   XSETCDR (aliases, Fcons (alias, Qnil));
8993
8994   eol_type = AREF (spec, 2);
8995   if (VECTORP (eol_type))
8996     {
8997       Lisp_Object subsidiaries;
8998       int i;
8999
9000       subsidiaries = make_subsidiaries (alias);
9001       for (i = 0; i < 3; i++)
9002         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9003                                      AREF (eol_type, i));
9004     }
9005
9006   Fputhash (alias, spec, Vcoding_system_hash_table);
9007   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9008   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9009                                 Vcoding_system_alist);
9010
9011   return Qnil;
9012 }
9013
9014 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9015        1, 1, 0,
9016        doc: /* Return the base of CODING-SYSTEM.
9017 Any alias or subsidiary coding system is not a base coding system.  */)
9018   (coding_system)
9019      Lisp_Object coding_system;
9020 {
9021   Lisp_Object spec, attrs;
9022
9023   if (NILP (coding_system))
9024     return (Qno_conversion);
9025   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9026   attrs = AREF (spec, 0);
9027   return CODING_ATTR_BASE_NAME (attrs);
9028 }
9029
9030 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9031        1, 1, 0,
9032        doc: "Return the property list of CODING-SYSTEM.")
9033      (coding_system)
9034      Lisp_Object coding_system;
9035 {
9036   Lisp_Object spec, attrs;
9037
9038   if (NILP (coding_system))
9039     coding_system = Qno_conversion;
9040   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9041   attrs = AREF (spec, 0);
9042   return CODING_ATTR_PLIST (attrs);
9043 }
9044
9045
9046 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9047        1, 1, 0,
9048        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9049      (coding_system)
9050      Lisp_Object coding_system;
9051 {
9052   Lisp_Object spec;
9053
9054   if (NILP (coding_system))
9055     coding_system = Qno_conversion;
9056   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9057   return AREF (spec, 1);
9058 }
9059
9060 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9061        Scoding_system_eol_type, 1, 1, 0,
9062        doc: /* Return eol-type of CODING-SYSTEM.
9063 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9064
9065 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9066 and CR respectively.
9067
9068 A vector value indicates that a format of end-of-line should be
9069 detected automatically.  Nth element of the vector is the subsidiary
9070 coding system whose eol-type is N.  */)
9071      (coding_system)
9072      Lisp_Object coding_system;
9073 {
9074   Lisp_Object spec, eol_type;
9075   int n;
9076
9077   if (NILP (coding_system))
9078     coding_system = Qno_conversion;
9079   if (! CODING_SYSTEM_P (coding_system))
9080     return Qnil;
9081   spec = CODING_SYSTEM_SPEC (coding_system);
9082   eol_type = AREF (spec, 2);
9083   if (VECTORP (eol_type))
9084     return Fcopy_sequence (eol_type);
9085   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9086   return make_number (n);
9087 }
9088
9089 #endif /* emacs */
9090
9091 \f
/*** 12. Postamble ***/
9093
9094 void
9095 init_coding_once ()
9096 {
9097   int i;
9098
9099   for (i = 0; i < coding_category_max; i++)
9100     {
9101       coding_categories[i].id = -1;
9102       coding_priorities[i] = i;
9103     }
9104
9105   /* ISO2022 specific initialize routine.  */
9106   for (i = 0; i < 0x20; i++)
9107     iso_code_class[i] = ISO_control_0;
9108   for (i = 0x21; i < 0x7F; i++)
9109     iso_code_class[i] = ISO_graphic_plane_0;
9110   for (i = 0x80; i < 0xA0; i++)
9111     iso_code_class[i] = ISO_control_1;
9112   for (i = 0xA1; i < 0xFF; i++)
9113     iso_code_class[i] = ISO_graphic_plane_1;
9114   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9115   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9116   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9117   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9118   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9119   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9120   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9121   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9122   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9123
9124   for (i = 0; i < 256; i++)
9125     {
9126       emacs_mule_bytes[i] = 1;
9127     }
9128   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9129   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9130   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9131   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9132 }
9133
9134 #ifdef emacs
9135
9136 void
9137 syms_of_coding ()
9138 {
9139   staticpro (&Vcoding_system_hash_table);
9140   {
9141     Lisp_Object args[2];
9142     args[0] = QCtest;
9143     args[1] = Qeq;
9144     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9145   }
9146
9147   staticpro (&Vsjis_coding_system);
9148   Vsjis_coding_system = Qnil;
9149
9150   staticpro (&Vbig5_coding_system);
9151   Vbig5_coding_system = Qnil;
9152
9153   staticpro (&Vcode_conversion_reused_workbuf);
9154   Vcode_conversion_reused_workbuf = Qnil;
9155
9156   staticpro (&Vcode_conversion_workbuf_name);
9157   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9158
9159   reused_workbuf_in_use = 0;
9160
9161   DEFSYM (Qcharset, "charset");
9162   DEFSYM (Qtarget_idx, "target-idx");
9163   DEFSYM (Qcoding_system_history, "coding-system-history");
9164   Fset (Qcoding_system_history, Qnil);
9165
9166   /* Target FILENAME is the first argument.  */
9167   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9168   /* Target FILENAME is the third argument.  */
9169   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9170
9171   DEFSYM (Qcall_process, "call-process");
9172   /* Target PROGRAM is the first argument.  */
9173   Fput (Qcall_process, Qtarget_idx, make_number (0));
9174
9175   DEFSYM (Qcall_process_region, "call-process-region");
9176   /* Target PROGRAM is the third argument.  */
9177   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9178
9179   DEFSYM (Qstart_process, "start-process");
9180   /* Target PROGRAM is the third argument.  */
9181   Fput (Qstart_process, Qtarget_idx, make_number (2));
9182
9183   DEFSYM (Qopen_network_stream, "open-network-stream");
9184   /* Target SERVICE is the fourth argument.  */
9185   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9186
9187   DEFSYM (Qcoding_system, "coding-system");
9188   DEFSYM (Qcoding_aliases, "coding-aliases");
9189
9190   DEFSYM (Qeol_type, "eol-type");
9191   DEFSYM (Qunix, "unix");
9192   DEFSYM (Qdos, "dos");
9193
9194   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9195   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9196   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9197   DEFSYM (Qdefault_char, "default-char");
9198   DEFSYM (Qundecided, "undecided");
9199   DEFSYM (Qno_conversion, "no-conversion");
9200   DEFSYM (Qraw_text, "raw-text");
9201
9202   DEFSYM (Qiso_2022, "iso-2022");
9203
9204   DEFSYM (Qutf_8, "utf-8");
9205   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9206
9207   DEFSYM (Qutf_16, "utf-16");
9208   DEFSYM (Qbig, "big");
9209   DEFSYM (Qlittle, "little");
9210
9211   DEFSYM (Qshift_jis, "shift-jis");
9212   DEFSYM (Qbig5, "big5");
9213
9214   DEFSYM (Qcoding_system_p, "coding-system-p");
9215
9216   DEFSYM (Qcoding_system_error, "coding-system-error");
9217   Fput (Qcoding_system_error, Qerror_conditions,
9218         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9219   Fput (Qcoding_system_error, Qerror_message,
9220         build_string ("Invalid coding system"));
9221
9222   /* Intern this now in case it isn't already done.
9223      Setting this variable twice is harmless.
9224      But don't staticpro it here--that is done in alloc.c.  */
9225   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9226
9227   DEFSYM (Qtranslation_table, "translation-table");
9228   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9229   DEFSYM (Qtranslation_table_id, "translation-table-id");
9230   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9231   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9232
9233   DEFSYM (Qvalid_codes, "valid-codes");
9234
9235   DEFSYM (Qemacs_mule, "emacs-mule");
9236
9237   DEFSYM (QCcategory, ":category");
9238   DEFSYM (QCmnemonic, ":mnemonic");
9239   DEFSYM (QCdefalut_char, ":default-char");
9240   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9241   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9242   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9243   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9244
9245   Vcoding_category_table
9246     = Fmake_vector (make_number (coding_category_max), Qnil);
9247   staticpro (&Vcoding_category_table);
9248   /* Followings are target of code detection.  */
9249   ASET (Vcoding_category_table, coding_category_iso_7,
9250         intern ("coding-category-iso-7"));
9251   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9252         intern ("coding-category-iso-7-tight"));
9253   ASET (Vcoding_category_table, coding_category_iso_8_1,
9254         intern ("coding-category-iso-8-1"));
9255   ASET (Vcoding_category_table, coding_category_iso_8_2,
9256         intern ("coding-category-iso-8-2"));
9257   ASET (Vcoding_category_table, coding_category_iso_7_else,
9258         intern ("coding-category-iso-7-else"));
9259   ASET (Vcoding_category_table, coding_category_iso_8_else,
9260         intern ("coding-category-iso-8-else"));
9261   ASET (Vcoding_category_table, coding_category_utf_8,
9262         intern ("coding-category-utf-8"));
9263   ASET (Vcoding_category_table, coding_category_utf_16_be,
9264         intern ("coding-category-utf-16-be"));
9265   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9266         intern ("coding-category-utf-16-auto"));
9267   ASET (Vcoding_category_table, coding_category_utf_16_le,
9268         intern ("coding-category-utf-16-le"));
9269   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9270         intern ("coding-category-utf-16-be-nosig"));
9271   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9272         intern ("coding-category-utf-16-le-nosig"));
9273   ASET (Vcoding_category_table, coding_category_charset,
9274         intern ("coding-category-charset"));
9275   ASET (Vcoding_category_table, coding_category_sjis,
9276         intern ("coding-category-sjis"));
9277   ASET (Vcoding_category_table, coding_category_big5,
9278         intern ("coding-category-big5"));
9279   ASET (Vcoding_category_table, coding_category_ccl,
9280         intern ("coding-category-ccl"));
9281   ASET (Vcoding_category_table, coding_category_emacs_mule,
9282         intern ("coding-category-emacs-mule"));
9283   /* Followings are NOT target of code detection.  */
9284   ASET (Vcoding_category_table, coding_category_raw_text,
9285         intern ("coding-category-raw-text"));
9286   ASET (Vcoding_category_table, coding_category_undecided,
9287         intern ("coding-category-undecided"));
9288
9289   DEFSYM (Qinsufficient_source, "insufficient-source");
9290   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9291   DEFSYM (Qinvalid_source, "invalid-source");
9292   DEFSYM (Qinterrupted, "interrupted");
9293   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9294
9295   defsubr (&Scoding_system_p);
9296   defsubr (&Sread_coding_system);
9297   defsubr (&Sread_non_nil_coding_system);
9298   defsubr (&Scheck_coding_system);
9299   defsubr (&Sdetect_coding_region);
9300   defsubr (&Sdetect_coding_string);
9301   defsubr (&Sfind_coding_systems_region_internal);
9302   defsubr (&Sunencodable_char_position);
9303   defsubr (&Scheck_coding_systems_region);
9304   defsubr (&Sdecode_coding_region);
9305   defsubr (&Sencode_coding_region);
9306   defsubr (&Sdecode_coding_string);
9307   defsubr (&Sencode_coding_string);
9308   defsubr (&Sdecode_sjis_char);
9309   defsubr (&Sencode_sjis_char);
9310   defsubr (&Sdecode_big5_char);
9311   defsubr (&Sencode_big5_char);
9312   defsubr (&Sset_terminal_coding_system_internal);
9313   defsubr (&Sset_safe_terminal_coding_system_internal);
9314   defsubr (&Sterminal_coding_system);
9315   defsubr (&Sset_keyboard_coding_system_internal);
9316   defsubr (&Skeyboard_coding_system);
9317   defsubr (&Sfind_operation_coding_system);
9318   defsubr (&Sset_coding_system_priority);
9319   defsubr (&Sdefine_coding_system_internal);
9320   defsubr (&Sdefine_coding_system_alias);
9321   defsubr (&Scoding_system_put);
9322   defsubr (&Scoding_system_base);
9323   defsubr (&Scoding_system_plist);
9324   defsubr (&Scoding_system_aliases);
9325   defsubr (&Scoding_system_eol_type);
9326   defsubr (&Scoding_system_priority_list);
9327
9328   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9329                doc: /* List of coding systems.
9330
9331 Do not alter the value of this variable manually.  This variable should be
9332 updated by the functions `define-coding-system' and
9333 `define-coding-system-alias'.  */);
9334   Vcoding_system_list = Qnil;
9335
9336   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9337                doc: /* Alist of coding system names.
9338 Each element is one element list of coding system name.
9339 This variable is given to `completing-read' as TABLE argument.
9340
9341 Do not alter the value of this variable manually.  This variable should be
9342 updated by the functions `make-coding-system' and
9343 `define-coding-system-alias'.  */);
9344   Vcoding_system_alist = Qnil;
9345
9346   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9347                doc: /* List of coding-categories (symbols) ordered by priority.
9348
9349 On detecting a coding system, Emacs tries code detection algorithms
9350 associated with each coding-category one by one in this order.  When
9351 one algorithm agrees with a byte sequence of source text, the coding
9352 system bound to the corresponding coding-category is selected.
9353
9354 When you modify this variable, `update-coding-systems-internal' must
9355 be called.  */);
9356   {
9357     int i;
9358
9359     Vcoding_category_list = Qnil;
9360     for (i = coding_category_max - 1; i >= 0; i--)
9361       Vcoding_category_list
9362         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9363                  Vcoding_category_list);
9364   }
9365
9366   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9367                doc: /* Specify the coding system for read operations.
9368 It is useful to bind this variable with `let', but do not set it globally.
9369 If the value is a coding system, it is used for decoding on read operation.
9370 If not, an appropriate element is used from one of the coding system alists:
9371 There are three such tables, `file-coding-system-alist',
9372 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9373   Vcoding_system_for_read = Qnil;
9374
9375   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9376                doc: /* Specify the coding system for write operations.
9377 Programs bind this variable with `let', but you should not set it globally.
9378 If the value is a coding system, it is used for encoding of output,
9379 when writing it to a file and when sending it to a file or subprocess.
9380
9381 If this does not specify a coding system, an appropriate element
9382 is used from one of the coding system alists:
9383 There are three such tables, `file-coding-system-alist',
9384 `process-coding-system-alist', and `network-coding-system-alist'.
9385 For output to files, if the above procedure does not specify a coding system,
9386 the value of `buffer-file-coding-system' is used.  */);
9387   Vcoding_system_for_write = Qnil;
9388
9389   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9390                doc: /*
9391 Coding system used in the latest file or process I/O.  */);
9392   Vlast_coding_system_used = Qnil;
9393
9394   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9395                doc: /*
9396 Error status of the last code conversion.
9397
9398 When an error was detected in the last code conversion, this variable
9399 is set to one of the following symbols.
9400   `insufficient-source'
9401   `inconsistent-eol'
9402   `invalid-source'
9403   `interrupted'
9404   `insufficient-memory'
9405 When no error was detected, the value doesn't change.  So, to check
9406 the error status of a code conversion by this variable, you must
9407 explicitly set this variable to nil before performing code
9408 conversion.  */);
9409   Vlast_code_conversion_error = Qnil;
9410
9411   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9412                doc: /*
9413 *Non-nil means always inhibit code conversion of end-of-line format.
9414 See info node `Coding Systems' and info node `Text and Binary' concerning
9415 such conversion.  */);
9416   inhibit_eol_conversion = 0;
9417
9418   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9419                doc: /*
9420 Non-nil means process buffer inherits coding system of process output.
9421 Bind it to t if the process output is to be treated as if it were a file
9422 read from some filesystem.  */);
9423   inherit_process_coding_system = 0;
9424
9425   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9426                doc: /*
9427 Alist to decide a coding system to use for a file I/O operation.
9428 The format is ((PATTERN . VAL) ...),
9429 where PATTERN is a regular expression matching a file name,
9430 VAL is a coding system, a cons of coding systems, or a function symbol.
9431 If VAL is a coding system, it is used for both decoding and encoding
9432 the file contents.
9433 If VAL is a cons of coding systems, the car part is used for decoding,
9434 and the cdr part is used for encoding.
9435 If VAL is a function symbol, the function must return a coding system
9436 or a cons of coding systems which are used as above.  The function gets
9437 the arguments with which `find-operation-coding-systems' was called.
9438
9439 See also the function `find-operation-coding-system'
9440 and the variable `auto-coding-alist'.  */);
9441   Vfile_coding_system_alist = Qnil;
9442
9443   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9444                doc: /*
9445 Alist to decide a coding system to use for a process I/O operation.
9446 The format is ((PATTERN . VAL) ...),
9447 where PATTERN is a regular expression matching a program name,
9448 VAL is a coding system, a cons of coding systems, or a function symbol.
9449 If VAL is a coding system, it is used for both decoding what received
9450 from the program and encoding what sent to the program.
9451 If VAL is a cons of coding systems, the car part is used for decoding,
9452 and the cdr part is used for encoding.
9453 If VAL is a function symbol, the function must return a coding system
9454 or a cons of coding systems which are used as above.
9455
9456 See also the function `find-operation-coding-system'.  */);
9457   Vprocess_coding_system_alist = Qnil;
9458
9459   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9460                doc: /*
9461 Alist to decide a coding system to use for a network I/O operation.
9462 The format is ((PATTERN . VAL) ...),
9463 where PATTERN is a regular expression matching a network service name
9464 or is a port number to connect to,
9465 VAL is a coding system, a cons of coding systems, or a function symbol.
9466 If VAL is a coding system, it is used for both decoding what received
9467 from the network stream and encoding what sent to the network stream.
9468 If VAL is a cons of coding systems, the car part is used for decoding,
9469 and the cdr part is used for encoding.
9470 If VAL is a function symbol, the function must return a coding system
9471 or a cons of coding systems which are used as above.
9472
9473 See also the function `find-operation-coding-system'.  */);
9474   Vnetwork_coding_system_alist = Qnil;
9475
9476   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9477                doc: /* Coding system to use with system messages.
9478 Also used for decoding keyboard input on X Window system.  */);
9479   Vlocale_coding_system = Qnil;
9480
9481   /* The eol mnemonics are reset in startup.el system-dependently.  */
9482   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9483                doc: /*
9484 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9485   eol_mnemonic_unix = build_string (":");
9486
9487   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9488                doc: /*
9489 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9490   eol_mnemonic_dos = build_string ("\\");
9491
9492   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9493                doc: /*
9494 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9495   eol_mnemonic_mac = build_string ("/");
9496
9497   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9498                doc: /*
9499 *String displayed in mode line when end-of-line format is not yet determined.  */);
9500   eol_mnemonic_undecided = build_string (":");
9501
9502   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9503                doc: /*
9504 *Non-nil enables character translation while encoding and decoding.  */);
9505   Venable_character_translation = Qt;
9506
9507   DEFVAR_LISP ("standard-translation-table-for-decode",
9508                &Vstandard_translation_table_for_decode,
9509                doc: /* Table for translating characters while decoding.  */);
9510   Vstandard_translation_table_for_decode = Qnil;
9511
9512   DEFVAR_LISP ("standard-translation-table-for-encode",
9513                &Vstandard_translation_table_for_encode,
9514                doc: /* Table for translating characters while encoding.  */);
9515   Vstandard_translation_table_for_encode = Qnil;
9516
9517   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9518                doc: /* Alist of charsets vs revision numbers.
9519 While encoding, if a charset (car part of an element) is found,
9520 designate it with the escape sequence identifying revision (cdr part
9521 of the element).  */);
9522   Vcharset_revision_table = Qnil;
9523
9524   DEFVAR_LISP ("default-process-coding-system",
9525                &Vdefault_process_coding_system,
9526                doc: /* Cons of coding systems used for process I/O by default.
9527 The car part is used for decoding a process output,
9528 the cdr part is used for encoding a text to be sent to a process.  */);
9529   Vdefault_process_coding_system = Qnil;
9530
9531   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9532                doc: /*
9533 Table of extra Latin codes in the range 128..159 (inclusive).
9534 This is a vector of length 256.
9535 If Nth element is non-nil, the existence of code N in a file
9536 \(or output of subprocess) doesn't prevent it to be detected as
9537 a coding system of ISO 2022 variant which has a flag
9538 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9539 or reading output of a subprocess.
9540 Only 128th through 159th elements has a meaning.  */);
9541   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9542
9543   DEFVAR_LISP ("select-safe-coding-system-function",
9544                &Vselect_safe_coding_system_function,
9545                doc: /*
9546 Function to call to select safe coding system for encoding a text.
9547
9548 If set, this function is called to force a user to select a proper
9549 coding system which can encode the text in the case that a default
9550 coding system used in each operation can't encode the text.
9551
9552 The default value is `select-safe-coding-system' (which see).  */);
9553   Vselect_safe_coding_system_function = Qnil;
9554
9555   DEFVAR_BOOL ("coding-system-require-warning",
9556                &coding_system_require_warning,
9557                doc: /* Internal use only.
9558 If non-nil, on writing a file, `select-safe-coding-system-function' is
9559 called even if `coding-system-for-write' is non-nil.  The command
9560 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9561   coding_system_require_warning = 0;
9562
9563
9564   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9565                &inhibit_iso_escape_detection,
9566                doc: /*
9567 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9568
9569 By default, on reading a file, Emacs tries to detect how the text is
9570 encoded.  This code detection is sensitive to escape sequences.  If
9571 the sequence is valid as ISO2022, the code is determined as one of
9572 the ISO2022 encodings, and the file is decoded by the corresponding
9573 coding system (e.g. `iso-2022-7bit').
9574
9575 However, there may be a case that you want to read escape sequences in
9576 a file as is.  In such a case, you can set this variable to non-nil.
9577 Then, as the code detection ignores any escape sequences, no file is
9578 detected as encoded in some ISO2022 encoding.  The result is that all
9579 escape sequences become visible in a buffer.
9580
9581 The default value is nil, and it is strongly recommended not to change
9582 it.  That is because many Emacs Lisp source files that contain
9583 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9584 in Emacs's distribution, and they won't be decoded correctly on
9585 reading if you suppress escape sequence detection.
9586
9587 The other way to read escape sequences in a file without decoding is
9588 to explicitly specify some coding system that doesn't use ISO2022's
9589 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9590   inhibit_iso_escape_detection = 0;
9591
9592   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9593                doc: /* Char table for translating self-inserting characters.
9594 This is applied to the result of input methods, not their input.  See also
9595 `keyboard-translate-table'.  */);
9596     Vtranslation_table_for_input = Qnil;
9597
9598   {
9599     Lisp_Object args[coding_arg_max];
9600     Lisp_Object plist[16];
9601     int i;
9602
9603     for (i = 0; i < coding_arg_max; i++)
9604       args[i] = Qnil;
9605
9606     plist[0] = intern (":name");
9607     plist[1] = args[coding_arg_name] = Qno_conversion;
9608     plist[2] = intern (":mnemonic");
9609     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9610     plist[4] = intern (":coding-type");
9611     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9612     plist[6] = intern (":ascii-compatible-p");
9613     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9614     plist[8] = intern (":default-char");
9615     plist[9] = args[coding_arg_default_char] = make_number (0);
9616     plist[10] = intern (":for-unibyte");
9617     plist[11] = args[coding_arg_for_unibyte] = Qt;
9618     plist[12] = intern (":docstring");
9619     plist[13] = build_string ("Do no conversion.\n\
9620 \n\
9621 When you visit a file with this coding, the file is read into a\n\
9622 unibyte buffer as is, thus each byte of a file is treated as a\n\
9623 character.");
9624     plist[14] = intern (":eol-type");
9625     plist[15] = args[coding_arg_eol_type] = Qunix;
9626     args[coding_arg_plist] = Flist (16, plist);
9627     Fdefine_coding_system_internal (coding_arg_max, args);
9628
9629     plist[1] = args[coding_arg_name] = Qundecided;
9630     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9631     plist[5] = args[coding_arg_coding_type] = Qundecided;
9632     /* This is already set.
9633     /*plist[7] = args[coding_arg_ascii_compatible_p] = Qt;*/
9634     plist[8] = intern (":charset-list");
9635     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9636     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9637     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9638     plist[15] = args[coding_arg_eol_type] = Qnil;
9639     args[coding_arg_plist] = Flist (16, plist);
9640     Fdefine_coding_system_internal (coding_arg_max, args);
9641   }
9642
9643   setup_coding_system (Qno_conversion, &keyboard_coding);
9644   setup_coding_system (Qundecided, &terminal_coding);
9645   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9646
9647   {
9648     int i;
9649
9650     for (i = 0; i < coding_category_max; i++)
9651       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9652   }
9653 }
9654
9655 char *
9656 emacs_strerror (error_number)
9657      int error_number;
9658 {
9659   char *str;
9660
9661   synchronize_system_messages_locale ();
9662   str = strerror (error_number);
9663
9664   if (! NILP (Vlocale_coding_system))
9665     {
9666       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9667                                                       Vlocale_coding_system,
9668                                                       0);
9669       str = (char *) SDATA (dec);
9670     }
9671
9672   return str;
9673 }
9674
9675 #endif /* emacs */
9676
9677 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9678    (do not change this comment) */