]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Initial revision
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1995 Free Software Foundation, Inc.
4 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
6 ;; Keywords: multibyte character, character set, syntax, category
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to
22 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 ;;; Commentary:
25
26 ;; This file contains multibyte characters. Save this file always in
27 ;; `coding-system-iso-2022-7'.
28
29 ;;; Predefined categories.
30
31 ;; For each character set.
32
;; One mnemonic category per character set.  Each element of the
;; table is (MNEMONIC . DESCRIPTION); the categories are defined in
;; the order listed.
(let ((charset-categories
       '((?a . "ASCII")
         (?l . "Latin")
         (?t . "Thai")
         (?g . "Greek")
         (?b . "Arabic")
         (?w . "Hebrew")
         (?y . "Cyrillic")
         (?k . "Japanese katakana")
         (?r . "Japanese roman")
         (?c . "Chinese")
         (?j . "Japanese")
         (?h . "Korean")
         (?e . "Ethiopic (Ge'ez)")
         (?v . "Vietnamese")
         (?i . "Indian"))))
  (while charset-categories
    (define-category (car (car charset-categories))
      (cdr (car charset-categories)))
    (setq charset-categories (cdr charset-categories))))
48
49 ;; For each group (row) of 2-byte character sets.
50
;; Categories distinguishing the groups (rows) inside 2-byte character
;; sets.  The description strings are what the user sees when the
;; categories are listed, so they are kept grammatical and uniform
;; ("... characters of 2-byte character sets").
(define-category ?A "Alpha numeric characters of 2-byte character sets")
(define-category ?C "Chinese (Han) characters of 2-byte character sets")
(define-category ?G "Greek characters of 2-byte character sets")
(define-category ?H "Japanese Hiragana characters of 2-byte character sets")
(define-category ?K "Japanese Katakana characters of 2-byte character sets")
(define-category ?N "Korean Hangul characters of 2-byte character sets")
(define-category ?Y "Cyrillic characters of 2-byte character sets")
(define-category ?I "Indian Glyphs")
59
60 ;; For phonetic classifications.
61
;; Phonetic classes, using the digits `0' through `8' as mnemonics.
;; Each table element is (MNEMONIC . DESCRIPTION).
(mapcar (function
         (lambda (entry)
           (define-category (car entry) (cdr entry))))
        '((?0 . "consonant")
          (?1 . "base vowel")
          (?2 . "upper diacritical mark (including upper vowel)")
          (?3 . "lower diacritical mark (including lower vowel)")
          (?4 . "tone mark")
          (?5 . "vowel")
          (?6 . "digit")
          (?7 . "vowel-modifying diacritical mark")
          (?8 . "vowel-signs.")))
71
;; Categories that control where a line may be broken.
;;   `|'         is used for filling.
;;   `>' and `<' must be kept for `kinsoku' processing; see the
;;               comments in kinsoku.el.
(let ((line-break-categories
       '((?| . "While filling, we can break a line at this character.")
         (?> . "A character which can't be placed at beginning of line.")
         (?< . "A character which can't be placed at end of line."))))
  (while line-break-categories
    (define-category (car (car line-break-categories))
      (cdr (car line-break-categories)))
    (setq line-break-categories (cdr line-break-categories))))
79
80 \f
81 ;;; Setting syntax and category.
82
83 ;; ASCII
84
;; Codes 32 through 126 each receive two categories: `a' (ASCII) and
;; `l' (Latin).  The counter is bumped in the loop test, so it starts
;; one below the first code to be handled.
(let ((code 31))
  (while (< (setq code (1+ code)) 127)
    (modify-category-entry code ?a)
    (modify-category-entry code ?l)))
90
91 ;; Arabic character set
92
;; For each Arabic character set, give the character returned by
;; `make-char' (called with the charset only) word syntax and the
;; `b' (Arabic) category.
(mapcar (function
         (lambda (charset)
           (let ((generic (make-char charset)))
             (modify-syntax-entry generic "w")
             (modify-category-entry generic ?b))))
        '(arabic-iso8859-6
          arabic-digit
          arabic-1-column
          arabic-2-column))
101
102 ;; Chinese character set (GB2312)
103
;; GB2312 syntax.  First the charset-only character gets word syntax,
;; then rows 33, 34 and 41 are given symbol syntax, and finally each
;; bracket character is given open or close syntax naming its partner.
(modify-syntax-entry (make-char 'chinese-gb2312) "w")
(let ((symbol-rows '(33 34 41)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'chinese-gb2312 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))
;; Each element is (CHARACTER SYNTAX-DESCRIPTOR): the seven opening
;; brackets come first, followed by the seven closing ones.
(let ((bracket-syntax
       '((?\\e$A!2\e(B "(\e$A!3\e(B")
         (?\\e$A!4\e(B "(\e$A!5\e(B")
         (?\\e$A!6\e(B "(\e$A!7\e(B")
         (?\\e$A!8\e(B "(\e$A!9\e(B")
         (?\\e$A!:\e(B "(\e$A!;\e(B")
         (?\\e$A!<\e(B "(\e$A!=\e(B")
         (?\\e$A!>\e(B "(\e$A!?\e(B")
         (?\\e$A!3\e(B ")\e$A!2\e(B")
         (?\\e$A!5\e(B ")\e$A!4\e(B")
         (?\\e$A!7\e(B ")\e$A!6\e(B")
         (?\\e$A!9\e(B ")\e$A!8\e(B")
         (?\\e$A!;\e(B ")\e$A!:\e(B")
         (?\\e$A!=\e(B ")\e$A!<\e(B")
         (?\\e$A!?\e(B ")\e$A!>\e(B"))))
  (while bracket-syntax
    (modify-syntax-entry (car (car bracket-syntax))
                         (nth 1 (car bracket-syntax)))
    (setq bracket-syntax (cdr bracket-syntax))))
122
;; GB2312 categories.  The charset-only character gets `c' (Chinese)
;; and `|' (line may be broken here when filling).
(modify-category-entry (make-char 'chinese-gb2312) ?c)
(modify-category-entry (make-char 'chinese-gb2312) ?\|)
;; Individual rows are then tagged with the 2-byte group categories:
;; 35 -> A, 36 -> H, 37 -> K, 38 -> G, 39 -> Y.  Row 35 is listed once
;; only; a second, identical `A' entry for it would add nothing.
(modify-category-entry (make-char 'chinese-gb2312 35) ?A)
(modify-category-entry (make-char 'chinese-gb2312 36) ?H)
(modify-category-entry (make-char 'chinese-gb2312 37) ?K)
(modify-category-entry (make-char 'chinese-gb2312 38) ?G)
(modify-category-entry (make-char 'chinese-gb2312 39) ?Y)
;; Rows 48 through 126 get category `C'.
(let ((row 48))
  (while (< row 127)
    (modify-category-entry (make-char 'chinese-gb2312 row) ?C)
    (setq row (1+ row))))
135
136 ;; Chinese character set (BIG5)
137
;; Both Big5 character sets are treated alike: word syntax plus the
;; categories `c', `C' and `|'.
(let ((big5-generics (list (make-char 'chinese-big5-1)
                           (make-char 'chinese-big5-2)))
      generic)
  (while big5-generics
    (setq generic (car big5-generics))
    (modify-syntax-entry generic "w")
    (modify-category-entry generic ?c)
    (modify-category-entry generic ?C)
    (modify-category-entry generic ?\|)
    (setq big5-generics (cdr big5-generics))))
151
152
153 ;; Chinese character set (CNS11643)
154
;; All seven CNS11643 planes get the same treatment: word syntax and
;; the categories `c', `C' and `|'.
(mapcar (function
         (lambda (plane)
           (let ((generic (make-char plane)))
             (modify-syntax-entry generic "w")
             (modify-category-entry generic ?c)
             (modify-category-entry generic ?C)
             (modify-category-entry generic ?|))))
        '(chinese-cns11643-1
          chinese-cns11643-2
          chinese-cns11643-3
          chinese-cns11643-4
          chinese-cns11643-5
          chinese-cns11643-6
          chinese-cns11643-7))
170
171 ;; Cyrillic character set (ISO-8859-5)
172
;; Category `y' (Cyrillic) for the whole character set.
(modify-category-entry (make-char 'cyrillic-iso8859-5) ?y)

;; Codes 160 through 255 become word constituents ...
(let ((code 160))
  (while (<= code 255)
    (modify-syntax-entry (make-char 'cyrillic-iso8859-5 code) "w")
    (setq code (1+ code))))
;; ... and afterwards three of them are switched to punctuation.  This
;; has to follow the loop above, which would otherwise overwrite it.
(let ((punctuation '(?\e,L-\e(B ?\e,Lp\e(B ?\e,L}\e(B)))
  (while punctuation
    (modify-syntax-entry (car punctuation) ".")
    (setq punctuation (cdr punctuation))))
182
183 ;; Ethiopic character set
184
(let ((ethiopic-generic (make-char 'ethiopic)))
  (modify-category-entry ethiopic-generic ?e))

;; European character sets (Latin-1,2,3,4,5): every one of them gets
;; the category `l' (Latin).
(let ((latin-charsets '(latin-iso8859-1
                        latin-iso8859-2
                        latin-iso8859-3
                        latin-iso8859-4
                        latin-iso8859-9)))
  (while latin-charsets
    (modify-category-entry (make-char (car latin-charsets)) ?l)
    (setq latin-charsets (cdr latin-charsets))))
194
195 ;; ISO-8859-1 (Latin-1)
;; Position codes 64 through 127 of Latin-1 are word constituents.
(let ((code 63))
  (while (< (setq code (1+ code)) 128)
    (modify-syntax-entry (make-char 'latin-iso8859-1 code) "w")))
(modify-syntax-entry (make-char 'latin-iso8859-1 32) "w") ; NBSP
;; Two characters are then switched to symbol syntax; this must come
;; after the loop above has run.
(let ((symbols '(?\e,AW\e(B ?\e,Aw\e(B)))
  (while symbols
    (modify-syntax-entry (car symbols) "_")
    (setq symbols (cdr symbols))))
204
205 ;; ISO-8859-2 (Latin-2)
;; Codes 190 through 254 of Latin-2 are word constituents ...
(let ((code 190))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'latin-iso8859-2 code) "w")
    (setq code (1+ code))))
;; ... as are the characters enumerated here ...
(mapcar (function
         (lambda (letter)
           (modify-syntax-entry letter "w")))
        '(?\e,B!\e(B ?\e,B#\e(B ?\e,B%\e(B ?\e,B&\e(B ?\e,B)\e(B ?\e,B*\e(B ?\e,B+\e(B ?\e,B,\e(B ?\e,B.\e(B ?\e,B/\e(B ?\e,B1\e(B ?\e,B3\e(B ?\e,B5\e(B ?\e,B6\e(B ?\e,B9\e(B ?\e,B:\e(B ?\e,B;\e(B ?\e,B<\e(B))
;; ... and the no-break space.
(modify-syntax-entry (make-char 'latin-iso8859-2 160) "w") ; NBSP
;; Finally two characters are made punctuation; this must come after
;; the word-syntax assignments above.
(let ((punctuation '(?\e,BW\e(B ?\e,Bw\e(B)))
  (while punctuation
    (modify-syntax-entry (car punctuation) ".")
    (setq punctuation (cdr punctuation))))
217
218 ;; Greek character set (ISO-8859-7)
219
;; Category `g' (Greek) for the whole character set.
(let ((greek-generic (make-char 'greek-iso8859-7)))
  (modify-category-entry greek-generic ?g))

;; Codes 182 through 254 and the no-break space are word constituents.
(let ((code 182))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'greek-iso8859-7 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'greek-iso8859-7 160) "w") ; NBSP
;; Three characters are then made punctuation; this must come after
;; the loop above.
(mapcar (function
         (lambda (mark)
           (modify-syntax-entry mark ".")))
        '(?\e,F7\e(B ?\e,F;\e(B ?\e,F=\e(B))
230
231 ;; Hebrew character set (ISO-8859-8)
232
;; Category `w' (Hebrew) for the whole character set.
(let ((hebrew-generic (make-char 'hebrew-iso8859-8)))
  (modify-category-entry hebrew-generic ?w))

;; Codes 224 through 250 and the no-break space are word constituents.
(let ((code 223))
  (while (< (setq code (1+ code)) 251)
    (modify-syntax-entry (make-char 'hebrew-iso8859-8 code) "w")))
(modify-syntax-entry (make-char 'hebrew-iso8859-8 160) "w") ; NBSP
240
241 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
242
;; IS 13194 characters get category `i'; the two glyph character sets
;; both get category `I'.  Each element is (CHARSET . CATEGORY).
(let ((indian-categories '((indian-is13194 . ?i)
                           (indian-2-column . ?I)
                           (indian-1-column . ?I))))
  (while indian-categories
    (modify-category-entry (make-char (car (car indian-categories)))
                           (cdr (car indian-categories)))
    (setq indian-categories (cdr indian-categories))))
246
247 ;; Japanese character set (JISX0201-kana, JISX0201-roman, JISX0208, JISX0212)
248
;; Per-charset categories for the Japanese character sets.  Each
;; element is (CHARSET . CATEGORY); japanese-jisx0208 appears twice
;; because it receives both `j' and `|'.
(mapcar (function
         (lambda (entry)
           (modify-category-entry (make-char (car entry)) (cdr entry))))
        '((katakana-jisx0201 . ?k)
          (latin-jisx0201 . ?r)
          (japanese-jisx0208 . ?j)
          (japanese-jisx0212 . ?j)
          (japanese-jisx0208 . ?\|)))
254
255 ;; JISX0208
;; JISX0208 syntax.  Word syntax for the charset-only character, then
;; symbol syntax for rows 33, 34 and 40.
(modify-syntax-entry (make-char 'japanese-jisx0208) "w")
(let ((symbol-rows '(33 34 40)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'japanese-jisx0208 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))
;; The characters listed here are set back to word syntax after the
;; row-wide assignments above.
(mapcar (function
         (lambda (letter)
           (modify-syntax-entry letter "w")))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B ?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
;; Bracket pairs.  Each element is (CHARACTER SYNTAX-DESCRIPTOR): the
;; five opening brackets first, then the five closing ones.
(let ((bracket-syntax
       '((?\\e$B!J\e(B "(\e$B!K\e(B")
         (?\\e$B!N\e(B "(\e$B!O\e(B")
         (?\\e$B!P\e(B "(\e$B!Q\e(B")
         (?\\e$B!V\e(B "(\e$B!W\e(B")
         (?\\e$B!X\e(B "(\e$B!Y\e(B")
         (?\\e$B!K\e(B ")\e$B!J\e(B")
         (?\\e$B!O\e(B ")\e$B!N\e(B")
         (?\\e$B!Q\e(B ")\e$B!P\e(B")
         (?\\e$B!W\e(B ")\e$B!V\e(B")
         (?\\e$B!Y\e(B ")\e$B!X\e(B"))))
  (while bracket-syntax
    (modify-syntax-entry (car (car bracket-syntax))
                         (nth 1 (car bracket-syntax)))
    (setq bracket-syntax (cdr bracket-syntax))))
274
;; Row-level categories: each element is (ROW . CATEGORY).
(let ((row-categories '((35 . ?A) (36 . ?H) (37 . ?K) (38 . ?G) (39 . ?Y))))
  (while row-categories
    (modify-category-entry
     (make-char 'japanese-jisx0208 (car (car row-categories)))
     (cdr (car row-categories)))
    (setq row-categories (cdr row-categories))))
;; Rows 48 through 126 get category `C'.
(let ((row 47))
  (while (< (setq row (1+ row)) 127)
    (modify-category-entry (make-char 'japanese-jisx0208 row) ?C)))
;; These three characters belong to both `K' and `H'.
(mapcar (function
         (lambda (kana-mark)
           (modify-category-entry kana-mark ?K)
           (modify-category-entry kana-mark ?H)))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B))
;; These nine characters get category `C'.
(mapcar (function
         (lambda (han-mark)
           (modify-category-entry han-mark ?C)))
        '(?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
293
294 ;; JISX0212
;; JISX0212: word syntax for the charset-only character, symbol syntax
;; for rows 33, 34 and 35, and category `C' for the character set.
(let ((jisx0212-generic (make-char 'japanese-jisx0212))
      (symbol-rows '(33 34 35)))
  (modify-syntax-entry jisx0212-generic "w")
  (while symbol-rows
    (modify-syntax-entry (make-char 'japanese-jisx0212 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows)))
  (modify-category-entry jisx0212-generic ?C))
301
302 ;; JISX0201-Kana
;; JISX0201 katakana are word constituents, apart from the five
;; characters listed, which are made punctuation afterwards.
(modify-syntax-entry (make-char 'katakana-jisx0201) "w")
(mapcar (function
         (lambda (mark)
           (modify-syntax-entry mark ".")))
        '(?\e(I!\e(B ?\e(I"\e(B ?\e(I#\e(B ?\e(I$\e(B ?\e(I%\e(B))
308
309 ;; Korean character set (KSC5601)
310
;; KSC5601 syntax: word syntax for the charset-only character, then
;; symbol syntax for rows 33, 34 and 38 through 41.
(modify-syntax-entry (make-char 'korean-ksc5601) "w")
(let ((symbol-rows '(33 34 38 39 40 41)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'korean-ksc5601 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))

;; KSC5601 categories: `h' (Korean) for the character set, then the
;; 2-byte group categories per row.  Each element is (ROW . CATEGORY).
(modify-category-entry (make-char 'korean-ksc5601) ?h)
(let ((row-categories '((35 . ?A) (37 . ?G) (42 . ?H) (43 . ?K) (44 . ?Y))))
  (while row-categories
    (modify-category-entry
     (make-char 'korean-ksc5601 (car (car row-categories)))
     (cdr (car row-categories)))
    (setq row-categories (cdr row-categories))))
325
326 ;; Thai character set (TIS620)
327
;; Category `t' (Thai) for the whole character set.
(let ((thai-generic (make-char 'thai-tis620)))
  (modify-category-entry thai-generic ?t))

;; Table-driven syntax and category assignment.  Each table element
;; is (CHARS SYNTAX CATEGORY).  CHARS is scanned left to right: a
;; character standing alone is processed by itself, while a `-'
;; followed by a character extends the run from just after the
;; previously processed character up to and including that character.
;; The scan position is a byte index: `aref' tests for the `-', `sref'
;; fetches the character, and `char-bytes' gives the step to the next
;; character.
(let ((specs '(;; chars syntax category
               ("\e,T!\e(B-\e,TCEG\e(B-\e,TN\e(B" "w" ?0) ; consonant
               ("\e,TDFPRS`\e(B-\e,Te\e(B" "w" ?1) ; vowel base
               ("\e,TQT\e(B-\e,TWgn\e(B" "w" ?2) ; vowel upper
               ("\e,TX\e(B-\e,TZ\e(B" "w" ?3) ; vowel lower
               ("\e,Th\e(B-\e,Tm\e(B" "w" ?4) ; tone mark
               ("\e,TOfp\e(B-\e,Ty\e(B" "w" ?0) ; digit and misc
               ("\e,T_oz{\e(B" "_" ?0) ; symbol
               ))
      ;; The cursor and the run limit are bound outside the loop so
      ;; that the cursor keeps its value between characters.
      cursor upto)
  (while specs
    (let* ((spec (car specs))
           (text (car spec))
           (nbytes (length text))
           (descriptor (nth 1 spec))
           (mnemonic (nth 2 spec))
           (pos 0))
      (while (< pos nbytes)
        (if (= (aref text pos) ?-)
            ;; Range: skip the `-'; the cursor already points just
            ;; past the previously processed character.
            (progn
              (setq pos (1+ pos))
              (setq upto (sref text pos)))
          ;; Single character: a run of length one.
          (setq cursor (sref text pos))
          (setq upto cursor))
        (while (<= cursor upto)
          (modify-syntax-entry cursor descriptor)
          (modify-category-entry cursor mnemonic)
          (setq cursor (1+ cursor)))
        (setq pos (+ pos (char-bytes upto)))))
    (setq specs (cdr specs))))
359
360 ;; Vietnamese character set
361
;; Both VISCII halves are word constituents and carry the category
;; `v' (Vietnamese).  They also carry `l' (Latin) so that they form a
;; word together with Latin characters.
(let ((viscii-generics (list (make-char 'vietnamese-viscii-lower)
                             (make-char 'vietnamese-viscii-upper)))
      generic)
  (while viscii-generics
    (setq generic (car viscii-generics))
    (modify-syntax-entry generic "w")
    (modify-category-entry generic ?v)
    (modify-category-entry generic ?l)
    (setq viscii-generics (cdr viscii-generics))))
371
372 \f
373 ;;; Setting word boundary.
374
;; Both word-boundary tables are lists of (CATEGORY . CATEGORY) pairs.
;; The combining table holds the single pair Latin - Latin; the
;; separating table lists pairs of 2-byte character set groups.
(setq word-combining-categories
      '((?l . ?l))                      ; Latin - Latin

      word-separating-categories        ; (2-byte character sets)
      '(;; Alpha numeric followed by ...
        (?A . ?K)                       ; ... Katakana
        (?A . ?C)                       ; ... Chinese
        ;; Hiragana followed by ...
        (?H . ?A)                       ; ... Alpha numeric
        (?H . ?K)                       ; ... Katakana
        (?H . ?C)                       ; ... Chinese
        ;; Katakana followed by ...
        (?K . ?A)                       ; ... Alpha numeric
        (?K . ?C)                       ; ... Chinese
        ;; Chinese followed by ...
        (?C . ?A)                       ; ... Alpha numeric
        (?C . ?K)))                     ; ... Katakana