]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Remove prefix "coding-system-" from coding system symbol names.
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1995 Free Software Foundation, Inc.
4 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
6 ;; Keywords: multibyte character, character set, syntax, category
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; This file contains multibyte characters. Save this file always in
28 ;; the coding system `iso-2022-7'.
29
30 ;;; Predefined categories.
31
;; For each character set.

;; Each element is (MNEMONIC . DOCSTRING).  The elements are handed to
;; `define-category' one by one, in list order.
(let ((charset-categories
       '((?a . "ASCII")
         (?l . "Latin")
         (?t . "Thai")
         (?g . "Greek")
         (?b . "Arabic")
         (?w . "Hebrew")
         (?y . "Cyrillic")
         (?k . "Japanese katakana")
         (?r . "Japanese roman")
         (?c . "Chinese")
         (?j . "Japanese")
         (?h . "Korean")
         (?e . "Ethiopic (Ge'ez)")
         (?v . "Vietnamese")
         (?i . "Indian")
         (?o . "Lao")))
      entry)
  (while charset-categories
    (setq entry (car charset-categories)
          charset-categories (cdr charset-categories))
    (define-category (car entry) (cdr entry))))
50
;; For each group (row) of 2-byte character sets.

;; The second argument of each call is the category's docstring; the
;; wording is kept uniform ("... characters of 2-byte character sets").
(define-category ?A "Alpha numeric characters of 2-byte character sets")
(define-category ?C "Chinese (Han) characters of 2-byte character sets")
(define-category ?G "Greek characters of 2-byte character sets")
(define-category ?H "Japanese Hiragana characters of 2-byte character sets")
(define-category ?K "Japanese Katakana characters of 2-byte character sets")
(define-category ?N "Korean Hangul characters of 2-byte character sets")
(define-category ?Y "Cyrillic characters of 2-byte character sets")
(define-category ?I "Indian Glyphs")
61
;; For phonetic classifications.

;; Each element is (MNEMONIC . DOCSTRING); the mnemonics are the digit
;; characters `0' through `9'.
(let ((phonetic-categories
       '((?0 . "consonant")
         (?1 . "base vowel")
         (?2 . "upper diacritical mark (including upper vowel)")
         (?3 . "lower diacritical mark (including lower vowel)")
         (?4 . "tone mark")
         (?5 . "vowel")
         (?6 . "digit")
         (?7 . "vowel-modifying diacritical mark")
         (?8 . "vowel-signs")
         (?9 . "semivowel lower")))
      entry)
  (while phonetic-categories
    (setq entry (car phonetic-categories)
          phonetic-categories (cdr phonetic-categories))
    (define-category (car entry) (cdr entry))))
74
;; Categories that control where a line may be broken.
;; Each element is (MNEMONIC . DOCSTRING).
(let ((line-break-categories
       '(;; For filling.
         (?| . "While filling, we can break a line at this character.")
         ;; Keep the following two for `kinsoku' processing.  See
         ;; comments in kinsoku.el.
         (?> . "A character which can't be placed at beginning of line.")
         (?< . "A character which can't be placed at end of line.")))
      entry)
  (while line-break-categories
    (setq entry (car line-break-categories)
          line-break-categories (cdr line-break-categories))
    (define-category (car entry) (cdr entry))))
82
83 \f
84 ;;; Setting syntax and category.
85
;; ASCII

;; Codes 32 through 126 get both the category `a' (ASCII) and the
;; category `l' (Latin).
(let ((code 32)
      (limit 127))
  (while (< code limit)
    (modify-category-entry code ?a)
    (modify-category-entry code ?l)
    (setq code (1+ code))))
93
;; Arabic character set

;; The generic character of every Arabic charset becomes a word
;; constituent and is put into category `b' (Arabic).
(mapcar (function
         (lambda (charset)
           (let ((generic-char (make-char charset)))
             (modify-syntax-entry generic-char "w")
             (modify-category-entry generic-char ?b))))
        '(arabic-iso8859-6
          arabic-digit
          arabic-1-column
          arabic-2-column))
104
;; Chinese character set (GB2312)

;; Syntax: the whole charset is word constituent by default; rows 33,
;; 34 and 41 are then overridden to symbol constituent.
(modify-syntax-entry (make-char 'chinese-gb2312) "w")
(modify-syntax-entry (make-char 'chinese-gb2312 33) "_")
(modify-syntax-entry (make-char 'chinese-gb2312 34) "_")
(modify-syntax-entry (make-char 'chinese-gb2312 41) "_")
;; Opening brackets, each paired with its closing bracket ...
(modify-syntax-entry ?\\e$A!2\e(B "(\e$A!3\e(B")
(modify-syntax-entry ?\\e$A!4\e(B "(\e$A!5\e(B")
(modify-syntax-entry ?\\e$A!6\e(B "(\e$A!7\e(B")
(modify-syntax-entry ?\\e$A!8\e(B "(\e$A!9\e(B")
(modify-syntax-entry ?\\e$A!:\e(B "(\e$A!;\e(B")
(modify-syntax-entry ?\\e$A!<\e(B "(\e$A!=\e(B")
(modify-syntax-entry ?\\e$A!>\e(B "(\e$A!?\e(B")
;; ... and the closing brackets, each paired with its opening bracket.
(modify-syntax-entry ?\\e$A!3\e(B ")\e$A!2\e(B")
(modify-syntax-entry ?\\e$A!5\e(B ")\e$A!4\e(B")
(modify-syntax-entry ?\\e$A!7\e(B ")\e$A!6\e(B")
(modify-syntax-entry ?\\e$A!9\e(B ")\e$A!8\e(B")
(modify-syntax-entry ?\\e$A!;\e(B ")\e$A!:\e(B")
(modify-syntax-entry ?\\e$A!=\e(B ")\e$A!<\e(B")
(modify-syntax-entry ?\\e$A!?\e(B ")\e$A!>\e(B")

;; Categories: the whole charset is `c' (Chinese) and `|' (line
;; breakable); individual rows additionally get a 2-byte group category.
(modify-category-entry (make-char 'chinese-gb2312) ?c)
(modify-category-entry (make-char 'chinese-gb2312) ?\|)
(modify-category-entry (make-char 'chinese-gb2312 35) ?A) ; Alpha numeric
(modify-category-entry (make-char 'chinese-gb2312 36) ?H) ; Hiragana
(modify-category-entry (make-char 'chinese-gb2312 37) ?K) ; Katakana
(modify-category-entry (make-char 'chinese-gb2312 38) ?G) ; Greek
(modify-category-entry (make-char 'chinese-gb2312 39) ?Y) ; Cyrillic
;; Rows 48 through 126 are `C' (Chinese Han).
(let ((row 48))
  (while (< row 127)
    (modify-category-entry (make-char 'chinese-gb2312 row) ?C)
    (setq row (1+ row))))
138
;; Chinese character set (BIG5)

;; Both BIG5 charsets are treated alike: word constituent, and members
;; of the categories `c', `C' and `|'.
(let ((big5-charsets '(chinese-big5-1 chinese-big5-2))
      generic-char)
  (while big5-charsets
    (setq generic-char (make-char (car big5-charsets))
          big5-charsets (cdr big5-charsets))
    (modify-syntax-entry generic-char "w")
    (modify-category-entry generic-char ?c)
    (modify-category-entry generic-char ?C)
    (modify-category-entry generic-char ?|)))
154
155
;; Chinese character set (CNS11643)

;; All seven CNS11643 charsets are word constituent and belong to the
;; categories `c', `C' and `|'.
(mapcar (function
         (lambda (charset)
           (let ((generic-char (make-char charset)))
             (modify-syntax-entry generic-char "w")
             (modify-category-entry generic-char ?c)
             (modify-category-entry generic-char ?C)
             (modify-category-entry generic-char ?|))))
        '(chinese-cns11643-1
          chinese-cns11643-2
          chinese-cns11643-3
          chinese-cns11643-4
          chinese-cns11643-5
          chinese-cns11643-6
          chinese-cns11643-7))
173
;; Cyrillic character set (ISO-8859-5)

(modify-category-entry (make-char 'cyrillic-iso8859-5) ?y)

;; Codes 160 through 255 are made word constituents first ...
(let ((code 160))
  (while (<= code 255)
    (modify-syntax-entry (make-char 'cyrillic-iso8859-5 code) "w")
    (setq code (1+ code))))
;; ... then these three characters are set to punctuation.
(mapcar (function (lambda (ch) (modify-syntax-entry ch ".")))
        '(?\e,L-\e(B ?\e,Lp\e(B ?\e,L}\e(B))
185
;; Ethiopic character set

;; The generic character of the charset goes into category `e'.
(let ((generic-char (make-char 'ethiopic)))
  (modify-category-entry generic-char ?e))
189
;; European character set (Latin-1,2,3,4,5)

;; Every Latin charset listed here goes into category `l' (Latin).
(mapcar (function
         (lambda (charset)
           (modify-category-entry (make-char charset) ?l)))
        '(latin-iso8859-1
          latin-iso8859-2
          latin-iso8859-3
          latin-iso8859-4
          latin-iso8859-9))
197
;; ISO-8859-1 (Latin-1)
;; NOTE(review): this block passes the positions 64..127 and 32 to
;; `make-char', whereas the other ISO-8859 blocks in this file pass
;; codes such as 190..254 and 160 -- confirm that both forms address
;; the intended characters.
(let ((pos 64))
  (while (<= pos 127)
    (modify-syntax-entry (make-char 'latin-iso8859-1 pos) "w")
    (setq pos (1+ pos))))
(modify-syntax-entry (make-char 'latin-iso8859-1 32) "w") ; NBSP
;; Set after the loop, so these two symbol-constituent entries are the
;; ones that stick.
(modify-syntax-entry ?\e,AW\e(B "_")
(modify-syntax-entry ?\e,Aw\e(B "_")
207
;; ISO-8859-2 (Latin-2)

;; Codes 190 through 254 are word constituents.
(let ((code 190))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'latin-iso8859-2 code) "w")
    (setq code (1+ code))))
;; So are these individually listed characters.
(mapcar (function (lambda (ch) (modify-syntax-entry ch "w")))
        '(?\e,B!\e(B ?\e,B#\e(B ?\e,B%\e(B ?\e,B&\e(B ?\e,B)\e(B ?\e,B*\e(B
          ?\e,B+\e(B ?\e,B,\e(B ?\e,B.\e(B ?\e,B/\e(B ?\e,B1\e(B ?\e,B3\e(B
          ?\e,B5\e(B ?\e,B6\e(B ?\e,B9\e(B ?\e,B:\e(B ?\e,B;\e(B ?\e,B<\e(B))
(modify-syntax-entry (make-char 'latin-iso8859-2 160) "w") ; NBSP
;; Set last, so these two punctuation entries are the ones that stick.
(modify-syntax-entry ?\e,BW\e(B ".")
(modify-syntax-entry ?\e,Bw\e(B ".")
220
;; Greek character set (ISO-8859-7)

(modify-category-entry (make-char 'greek-iso8859-7) ?g)

;; Codes 182 through 254 are word constituents.
(let ((code 182))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'greek-iso8859-7 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'greek-iso8859-7 160) "w") ; NBSP
;; Set last, so these three punctuation entries are the ones that stick.
(mapcar (function (lambda (ch) (modify-syntax-entry ch ".")))
        '(?\e,F7\e(B ?\e,F;\e(B ?\e,F=\e(B))
233
;; Hebrew character set (ISO-8859-8)

(modify-category-entry (make-char 'hebrew-iso8859-8) ?w)

;; Codes 224 through 250 are word constituents.
(let ((code 224))
  (while (<= code 250)
    (modify-syntax-entry (make-char 'hebrew-iso8859-8 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'hebrew-iso8859-8 160) "w") ; NBSP
243
;; Indian character set (IS 13194 and other Emacs original Indian charsets)

;; Each element is (CHARSET . CATEGORY): `i' is "Indian" and `I' is
;; "Indian Glyphs".
(let ((indian-categories '((indian-is13194 . ?i)
                           (indian-2-column . ?I)
                           (indian-1-column . ?I)))
      pair)
  (while indian-categories
    (setq pair (car indian-categories)
          indian-categories (cdr indian-categories))
    (modify-category-entry (make-char (car pair)) (cdr pair))))
249
;; Japanese character set (JISX0201-kana, JISX0201-roman, JISX0208, JISX0212)

;; Each element is (CHARSET . CATEGORY).  JISX0208 appears twice: once
;; for `j' (Japanese) and once for `|' (line breakable while filling).
(let ((japanese-categories '((katakana-jisx0201 . ?k)
                             (latin-jisx0201 . ?r)
                             (japanese-jisx0208 . ?j)
                             (japanese-jisx0212 . ?j)
                             (japanese-jisx0208 . ?|)))
      pair)
  (while japanese-categories
    (setq pair (car japanese-categories)
          japanese-categories (cdr japanese-categories))
    (modify-category-entry (make-char (car pair)) (cdr pair))))
257
;; JISX0208

;; Syntax: word constituent by default, then rows 33, 34 and 40 are
;; overridden to symbol constituent.
(modify-syntax-entry (make-char 'japanese-jisx0208) "w")
(mapcar (function
         (lambda (row)
           (modify-syntax-entry (make-char 'japanese-jisx0208 row) "_")))
        '(33 34 40))
;; These individual characters are set back to word constituent.
(mapcar (function (lambda (ch) (modify-syntax-entry ch "w")))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B ?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B
          ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
;; Opening brackets, each paired with its closing bracket ...
(modify-syntax-entry ?\\e$B!J\e(B "(\e$B!K\e(B")
(modify-syntax-entry ?\\e$B!N\e(B "(\e$B!O\e(B")
(modify-syntax-entry ?\\e$B!P\e(B "(\e$B!Q\e(B")
(modify-syntax-entry ?\\e$B!V\e(B "(\e$B!W\e(B")
(modify-syntax-entry ?\\e$B!X\e(B "(\e$B!Y\e(B")
;; ... and the closing brackets, each paired with its opening bracket.
(modify-syntax-entry ?\\e$B!K\e(B ")\e$B!J\e(B")
(modify-syntax-entry ?\\e$B!O\e(B ")\e$B!N\e(B")
(modify-syntax-entry ?\\e$B!Q\e(B ")\e$B!P\e(B")
(modify-syntax-entry ?\\e$B!W\e(B ")\e$B!V\e(B")
(modify-syntax-entry ?\\e$B!Y\e(B ")\e$B!X\e(B")

;; Categories per row; each element is (ROW . CATEGORY).
(let ((row-categories '((35 . ?A)       ; Alpha numeric
                        (36 . ?H)       ; Hiragana
                        (37 . ?K)       ; Katakana
                        (38 . ?G)       ; Greek
                        (39 . ?Y)))     ; Cyrillic
      pair)
  (while row-categories
    (setq pair (car row-categories)
          row-categories (cdr row-categories))
    (modify-category-entry (make-char 'japanese-jisx0208 (car pair))
                           (cdr pair))))
;; Rows 48 through 126 are `C' (Chinese Han).
(let ((row 48))
  (while (<= row 126)
    (modify-category-entry (make-char 'japanese-jisx0208 row) ?C)
    (setq row (1+ row))))
;; These three characters belong to both `K' and `H'.
(mapcar (function
         (lambda (ch)
           (modify-category-entry ch ?K)
           (modify-category-entry ch ?H)))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B))
;; These characters belong to `C'.
(mapcar (function (lambda (ch) (modify-category-entry ch ?C)))
        '(?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B
          ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
296
;; JISX0212

;; Word constituent by default; rows 33 through 35 are then overridden
;; to symbol constituent.
(modify-syntax-entry (make-char 'japanese-jisx0212) "w")
(let ((row 33))
  (while (<= row 35)
    (modify-syntax-entry (make-char 'japanese-jisx0212 row) "_")
    (setq row (1+ row))))

;; The whole charset is category `C'.
(modify-category-entry (make-char 'japanese-jisx0212) ?C)
304
;; JISX0201-Kana

;; Word constituent by default; the five characters listed below are
;; then set to punctuation.
(modify-syntax-entry (make-char 'katakana-jisx0201) "w")
(mapcar (function (lambda (ch) (modify-syntax-entry ch ".")))
        '(?\e(I!\e(B ?\e(I"\e(B ?\e(I#\e(B ?\e(I$\e(B ?\e(I%\e(B))
311
;; Korean character set (KSC5601)

;; Syntax: word constituent by default; the rows listed below are then
;; overridden to symbol constituent.
(modify-syntax-entry (make-char 'korean-ksc5601) "w")
(mapcar (function
         (lambda (row)
           (modify-syntax-entry (make-char 'korean-ksc5601 row) "_")))
        '(33 34 38 39 40 41))

;; Categories: the whole charset is `h' (Korean); individual rows
;; additionally get a 2-byte group category, given as (ROW . CATEGORY).
(modify-category-entry (make-char 'korean-ksc5601) ?h)
(let ((row-categories '((35 . ?A)       ; Alpha numeric
                        (37 . ?G)       ; Greek
                        (42 . ?H)       ; Hiragana
                        (43 . ?K)       ; Katakana
                        (44 . ?Y)))     ; Cyrillic
      pair)
  (while row-categories
    (setq pair (car row-categories)
          row-categories (cdr row-categories))
    (modify-category-entry (make-char 'korean-ksc5601 (car pair))
                           (cdr pair))))
328
;; Lao character set

(modify-category-entry (make-char 'lao) ?o)

;; Each table element is (CHARS SYNTAX CATEGORY).  CHARS is scanned
;; from left to right: a plain character gets SYNTAX and CATEGORY by
;; itself; a `-' extends the assignment from the character following
;; the previous one up to and including the character after the `-'.
;; NOTE(review): the "digit and misc" and "symbol" rows use category
;; `0', which this file defines as "consonant", while `6' is defined as
;; "digit" -- confirm that this is intended.
(let ((table '(;; chars syntax category
               ("\e(1!\e(B-\e(1N\e(B" "w" ?0) ; consonant
               ("\e(1PRS]`\e(B-\e(1d\e(B" "w" ?1) ; vowel base
               ("\e(1QT\e(B-\e(1W[m\e(B" "w" ?2) ; vowel upper
               ("\e(1XY\e(B" "w" ?3) ; vowel lower
               ("\e(1h\e(B-\e(1l\e(B" "w" ?4) ; tone mark
               ("\e(1\\e(B" "w" ?9) ; semivowel lower
               ("\e(1p\e(B-\e(1y\e(B" "w" ?0) ; digit and misc
               ("\e(1Of\e(B" "_" ?0) ; symbol
               ))
      spec str end descriptor cat last cur pos)
  (while table
    (setq spec (car table)
          table (cdr table))
    (setq str (car spec)
          end (length str)
          descriptor (nth 1 spec)
          cat (nth 2 spec)
          pos 0)
    (while (< pos end)
      (cond ((= (aref str pos) ?-)
             ;; Range: CUR still holds the character after the previous
             ;; one; only the upper bound is read here.
             (setq pos (1+ pos))
             (setq last (sref str pos)))
            (t
             ;; Single character: a range of length one.
             (setq cur (sref str pos))
             (setq last cur)))
      (while (<= cur last)
        (modify-syntax-entry cur descriptor)
        (modify-category-entry cur cat)
        (setq cur (1+ cur)))
      ;; Step over the character just consumed.
      (setq pos (+ pos (char-bytes last))))))
363
;; Thai character set (TIS620)

(modify-category-entry (make-char 'thai-tis620) ?t)

;; Each table element is (CHARS SYNTAX CATEGORY).  CHARS is scanned
;; from left to right: a plain character gets SYNTAX and CATEGORY by
;; itself; a `-' extends the assignment from the character following
;; the previous one up to and including the character after the `-'.
;; NOTE(review): the "digit and misc" and "symbol" rows use category
;; `0', which this file defines as "consonant", while `6' is defined as
;; "digit" -- confirm that this is intended.
(let ((table '(;; chars syntax category
               ("\e,T!\e(B-\e,TCEG\e(B-\e,TN\e(B" "w" ?0) ; consonant
               ("\e,TDFPRS`\e(B-\e,Te\e(B" "w" ?1) ; vowel base
               ("\e,TQT\e(B-\e,TWgn\e(B" "w" ?2) ; vowel upper
               ("\e,TX\e(B-\e,TZ\e(B" "w" ?3) ; vowel lower
               ("\e,Th\e(B-\e,Tm\e(B" "w" ?4) ; tone mark
               ("\e,TOfp\e(B-\e,Ty\e(B" "w" ?0) ; digit and misc
               ("\e,T_oz{\e(B" "_" ?0) ; symbol
               ))
      spec str end descriptor cat last cur pos)
  (while table
    (setq spec (car table)
          table (cdr table))
    (setq str (car spec)
          end (length str)
          descriptor (nth 1 spec)
          cat (nth 2 spec)
          pos 0)
    (while (< pos end)
      (cond ((= (aref str pos) ?-)
             ;; Range: CUR still holds the character after the previous
             ;; one; only the upper bound is read here.
             (setq pos (1+ pos))
             (setq last (sref str pos)))
            (t
             ;; Single character: a range of length one.
             (setq cur (sref str pos))
             (setq last cur)))
      (while (<= cur last)
        (modify-syntax-entry cur descriptor)
        (modify-category-entry cur cat)
        (setq cur (1+ cur)))
      ;; Step over the character just consumed.
      (setq pos (+ pos (char-bytes last))))))
397
;; Vietnamese character set

;; Both VISCII charsets are word constituent and belong to category
;; `v' (Vietnamese).
(mapcar (function
         (lambda (charset)
           (let ((generic-char (make-char charset)))
             (modify-syntax-entry generic-char "w")
             (modify-category-entry generic-char ?v)
             ;; Category `l' as well, to make a word with Latin
             ;; characters.
             (modify-category-entry generic-char ?l))))
        '(vietnamese-viscii-lower
          vietnamese-viscii-upper))
409
410 \f
;;; Setting word boundary.

;; Both variables hold lists of (CATEGORY . CATEGORY) pairs.
(setq word-combining-categories
      (list (cons ?l ?l)))              ; Latin - Latin

(setq word-separating-categories        ; (2-byte character sets)
      (list (cons ?A ?K)                ; Alpha numeric - Katakana
            (cons ?A ?C)                ; Alpha numeric - Chinese
            (cons ?H ?A)                ; Hiragana - Alpha numeric
            (cons ?H ?K)                ; Hiragana - Katakana
            (cons ?H ?C)                ; Hiragana - Chinese
            (cons ?K ?A)                ; Katakana - Alpha numeric
            (cons ?K ?C)                ; Katakana - Chinese
            (cons ?C ?A)                ; Chinese - Alpha numeric
            (cons ?C ?K)))              ; Chinese - Katakana