1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 1995, 2003 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
5 ;; Copyright (C) 1995, 2001 Free Software Foundation, Inc.
7 ;; Keywords: mule, multilingual, Chinese
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
30 ;; Hz/ZW/EUC-TW encoding stuff
32 ;; HZ is an encoding method for Chinese character set GB2312 used
33 ;; widely in Internet. It is very similar to 7-bit environment of
34 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and
35 ;; "~}" for designating GB2312 and ASCII respectively, hence, it
36 ;; doesn't uses ESC (0x1B) code.
38 ;; ZW is another encoding method for Chinese character set GB2312. It
39 ;; encodes Chinese characters line by line by starting each line with
40 ;; the sequence "zW". It also uses only 7-bit as HZ.
42 ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
43 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
44 ;; a single shift escape followed by three bytes: the first gives the
45 ;; plane, the second and third the character code. Note that characters
46 ;; of plane 1 are (redundantly) accessible with a single shift escape
49 ;; ISO-2022 escape sequence to designate GB2312.
50 (defvar iso2022-gb-designation "\e$A")
51 ;; HZ escape sequence to designate GB2312.
52 (defvar hz-gb-designnation "~{")
53 ;; ISO-2022 escape sequence to designate ASCII.
54 (defvar iso2022-ascii-designation "\e(B")
55 ;; HZ escape sequence to designate ASCII.
56 (defvar hz-ascii-designnation "~}")
57 ;; Regexp of ZW sequence to start GB2312.
58 (defvar zw-start-gb "^zW")
59 ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
60 (defvar hz/zw-start-gb
61 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
63 (defvar decode-hz-line-continuation nil
64 "Flag to tell if we should care line continuation convention of Hz.")
66 (defconst hz-set-msb-table
74 (push (+ i 128) chars)
76 (apply 'string (nreverse chars)))))
79 (defun decode-hz-region (beg end)
80 "Decode HZ/ZW encoded text in the current region.
81 Return the length of resulting text."
86 (narrow-to-region beg end)
88 ;; We, at first, convert HZ/ZW to `euc-china',
91 ;; "~\n" -> "\n", "~~" -> "~"
92 (goto-char (point-min))
93 (while (search-forward "~" nil t)
94 (setq ch (following-char))
95 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
97 ;; "^zW...\n" -> Chinese GB2312
98 ;; "~{...~}" -> Chinese GB2312
99 (goto-char (point-min))
101 (while (re-search-forward hz/zw-start-gb nil t)
102 (setq pos (match-beginning 0)
104 ;; Record the first position to start conversion.
105 (or beg (setq beg pos))
108 (if (>= ch 128) ; 8bit GB2312
113 (if (= ch ?z) ; ZW -> euc-china
115 (translate-region (point) end hz-set-msb-table)
117 (if (search-forward hz-ascii-designnation
118 (if decode-hz-line-continuation nil end)
122 (translate-region pos (point) hz-set-msb-table))))
124 (decode-coding-region beg end 'euc-china)))
125 (- (point-max) (point-min)))))
128 (defun decode-hz-buffer ()
129 "Decode HZ/ZW encoded text in the current buffer."
131 (decode-hz-region (point-min) (point-max)))
134 (defun encode-hz-region (beg end)
135 "Encode the text in the current region to HZ.
136 Return the length of resulting text."
140 (narrow-to-region beg end)
143 (goto-char (point-min))
144 (while (search-forward "~" nil t) (insert ?~))
146 ;; Chinese GB2312 -> "~{...~}"
147 (goto-char (point-min))
148 (if (re-search-forward "\\cc" nil t)
150 (goto-char (setq pos (match-beginning 0)))
151 (encode-coding-region pos (point-max) 'iso-2022-7bit)
153 (while (search-forward iso2022-gb-designation nil t)
155 (insert hz-gb-designnation))
157 (while (search-forward iso2022-ascii-designation nil t)
159 (insert hz-ascii-designnation))))
160 (- (point-max) (point-min)))))
163 (defun encode-hz-buffer ()
164 "Encode the text in the current buffer to HZ."
166 (encode-hz-region (point-min) (point-max)))
168 ;; The following sets up a translation table (big5-to-cns) from Big 5
169 ;; to CNS encoding, using some auxiliary functions to make the code
172 ;; Many kudos to Himi! The used code has been adapted from his
176 (defun big5-to-flat-code (num)
177 "Convert NUM in Big 5 encoding to a `flat code'.
178 0xA140 will be mapped to position 0, 0xA141 to position 1, etc.
179 There are no gaps in the flat code."
181 (let ((hi (/ num 256))
183 (+ (* 157 (- hi #xa1))
184 (- lo (if (>= lo #xa1) 98 64)))))
186 (defun flat-code-to-big5 (num)
187 "Convert NUM from a `flat code' to Big 5 encoding.
188 This is the inverse function of `big5-to-flat-code'."
190 (let ((hi (/ num 157))
192 (+ (* 256 (+ hi #xa1))
193 (+ lo (if (< lo 63) 64 98)))))
195 (defun euc-to-flat-code (num)
196 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
197 0x2121 will be mapped to position 0, 0x2122 to position 1, etc.
198 There are no gaps in the flat code."
200 (let ((hi (/ num 256))
202 (+ (* 94 (- hi #x21))
205 (defun flat-code-to-euc (num)
206 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
207 The inverse function of `euc-to-flat-code'. The high and low bytes are
210 (let ((hi (/ num 94))
212 (list (+ hi #x21) (+ lo #x21))))
214 (defun expand-euc-big5-alist (alist)
215 "Create a translation table and fills it with data given in ALIST.
216 Elements of ALIST can be either given as
218 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
222 (euc-character . big5-charcode)
224 The former maps a range of glyphs in an EUC charset (where STARTCHAR
225 is in GL representation) to a certain range of Big 5 encoded
226 characters, the latter maps a single glyph. Glyphs which can't be
227 mapped will be represented with the byte 0xFF.
229 The return value is the filled translation table."
231 (let ((chartable (make-char-table 'translation-table #xFF))
239 (setq char (car elem)
241 (cond ((and (consp char)
243 (setq i (big5-to-flat-code (car big5))
244 end (big5-to-flat-code (cdr big5))
245 codepoint (euc-to-flat-code (cdr char))
249 (decode-big5-char (flat-code-to-big5 i))
250 (apply (function make-char)
252 (flat-code-to-euc codepoint)))
254 codepoint (1+ codepoint))))
255 ((and (char-valid-p char)
257 (setq i (decode-big5-char big5))
258 (aset chartable i char))
260 (error "Unknown slot type: %S" elem))))
264 ;; All non-CNS encodings are commented out.
266 (define-translation-table 'big5-to-cns
268 (expand-euc-big5-alist
271 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
272 (?
\e$(G"X
\e(B . #xA1F6)
273 (?
\e$(G"W
\e(B . #xA1F7)
274 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
275 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
276 ;; Control codes (vendor dependent)
277 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
278 ;; Level 1 Ideographs
279 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
280 (?
\e$(GWS
\e(B . #xACFE)
281 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
282 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
283 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
284 (?
\e$(GkP
\e(B . #xBE52)
285 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
286 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
287 (?
\e$(Gu5
\e(B . #xC2CB)
288 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
289 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
290 (?
\e$(Gxe
\e(B . #xC3B9)
291 (?
\e$(Gxd
\e(B . #xC3BA)
292 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
293 (?
\e$(Gx-
\e(B . #xC456)
294 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
296 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
298 (?
\e$(G'#
\e(B . #xC6BF)
299 (?
\e$(G'$
\e(B . #xC6C0)
300 (?
\e$(G'&
\e(B . #xC6C1)
301 (?
\e$(G'(
\e(B . #xC6C2)
302 (?
\e$(G'-
\e(B . #xC6C3)
303 (?
\e$(G'.
\e(B . #xC6C4)
304 (?
\e$(G'/
\e(B . #xC6C5)
305 (?
\e$(G'4
\e(B . #xC6C6)
306 (?
\e$(G'7
\e(B . #xC6C7)
307 (?
\e$(G':
\e(B . #xC6C8)
308 (?
\e$(G'<
\e(B . #xC6C9)
309 (?
\e$(G'B
\e(B . #xC6CA)
310 (?
\e$(G'G
\e(B . #xC6CB)
311 (?
\e$(G'N
\e(B . #xC6CC)
312 (?
\e$(G'S
\e(B . #xC6CD)
313 (?
\e$(G'T
\e(B . #xC6CE)
314 (?
\e$(G'U
\e(B . #xC6CF)
315 (?
\e$(G'Y
\e(B . #xC6D0)
316 (?
\e$(G'Z
\e(B . #xC6D1)
317 (?
\e$(G'a
\e(B . #xC6D2)
318 (?
\e$(G'f
\e(B . #xC6D3)
319 (?
\e$(G()
\e(B . #xC6D4)
320 (?
\e$(G(*
\e(B . #xC6D5)
321 (?
\e$(G(c
\e(B . #xC6D6)
322 (?
\e$(G(l
\e(B . #xC6D7)
324 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
325 ;; Japanese Kana Supplement
326 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
328 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
330 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
331 ;; Cyrillic Characters
332 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
333 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
334 ;; Special Chinese Characters
335 (?
\e$(J!#
\e(B . #xC879)
336 (?
\e$(J!$
\e(B . #xC87B)
337 (?
\e$(J!*
\e(B . #xC87D)
338 (?
\e$(J!R
\e(B . #xC8A2)
340 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
341 ; (?
\e$B"L
\e(B . #xC8CD)
342 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
343 ; (?
\e$(D"C
\e(B . #xC8CE)
345 ;; GB 2312 characters
346 ; (?
\e$A!d
\e(B . #xC8CF)
347 ; (?
\e$A!e
\e(B . #xC8D0)
348 ;;;;; C8D1 - Japanese `(
\e$B3t
\e(B)'
349 ; (?
\e$A!m
\e(B . #xC8D2)
352 ;; Level 2 Ideographs
353 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
354 (?
\e$(GDB
\e(B . #xC94A);; a duplicate of #xA461
355 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
356 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
357 (?
\e$(H!L
\e(B . #xC9BE)
358 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
359 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
360 (?
\e$(H"M
\e(B . #xCAF7)
361 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
362 (?
\e$(H>c
\e(B . #xD6CC)
363 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
364 (?
\e$(H?j
\e(B . #xD77A)
365 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
366 (?
\e$(H7o
\e(B . #xDADF)
367 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
368 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
369 (?
\e$(HAv
\e(B . #xDDFC);; a duplicate of #xDCD1
370 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
371 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
372 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
373 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
374 (?
\e$(HUK
\e(B . #xEBF1)
375 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
376 (?
\e$(HW"
\e(B . #xECDE)
377 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
378 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
379 (?
\e$(Hd/
\e(B . #xEEEB)
380 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
381 (?
\e$(H]t
\e(B . #xF056)
382 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
383 (?
\e$(HZ(
\e(B . #xF0CB)
384 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
385 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
386 (?
\e$(Hga
\e(B . #xF16B)
387 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
388 (?
\e$(Hi4
\e(B . #xF268)
389 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
390 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
391 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
392 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
393 (?
\e$(HfM
\e(B . #xF4B5)
394 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
395 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
396 (?
\e$(HjK
\e(B . #xF663)
397 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
398 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
399 (?
\e$(Hqf
\e(B . #xF9C4)
400 (?
\e$(Hr4
\e(B . #xF9C5)
401 (?
\e$(Hr@
\e(B . #xF9C6)
402 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
403 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
405 ;; Additional Ideographs
406 (?
\e$(IC7
\e(B . #xF9D6)
407 (?
\e$(IOP
\e(B . #xF9D7)
408 (?
\e$(IDN
\e(B . #xF9D8)
409 (?
\e$(IPJ
\e(B . #xF9D9)
410 (?
\e$(I,]
\e(B . #xF9DA)
411 (?
\e$(I=~
\e(B . #xF9DB)
412 (?
\e$(IK\
\e(B . #xF9DC)
418 (provide 'china-util)
420 ;;; china-util.el ends here