]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Fix FSF address in comment.
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1995 Free Software Foundation, Inc.
4 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
6 ;; Keywords: multibyte character, character set, syntax, category
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Commentary:
26
27 ;; This file contains multibyte characters. Save this file always in
28 ;; `coding-system-iso-2022-7'.
29
30 ;;; Predefined categories.
31
32 ;; For each character set.
33
;; Categories naming the character set (or script) a character belongs
;; to.  Each entry is (MNEMONIC . DESCRIPTION); entries are defined in
;; the order listed.
(let ((charset-categories
       '((?a . "ASCII")
         (?l . "Latin")
         (?t . "Thai")
         (?g . "Greek")
         (?b . "Arabic")
         (?w . "Hebrew")
         (?y . "Cyrillic")
         (?k . "Japanese katakana")
         (?r . "Japanese roman")
         (?c . "Chinese")
         (?j . "Japanese")
         (?h . "Korean")
         (?e . "Ethiopic (Ge'ez)")
         (?v . "Vietnamese")
         (?i . "Indian"))))
  (while charset-categories
    (define-category (car (car charset-categories))
      (cdr (car charset-categories)))
    (setq charset-categories (cdr charset-categories))))
49
50 ;; For each group (row) of 2-byte character sets.
51
;; Upper-case mnemonics classify characters by the kind of row they
;; occupy inside a 2-byte character set (plus `I' for Indian glyphs).
;; The description strings are what the user sees when categories are
;; listed, so they are kept grammatically consistent with each other.
(define-category ?A "Alpha numeric characters of 2-byte character sets")
(define-category ?C "Chinese (Han) characters of 2-byte character sets")
(define-category ?G "Greek characters of 2-byte character sets")
(define-category ?H "Japanese Hiragana characters of 2-byte character sets")
(define-category ?K "Japanese Katakana characters of 2-byte character sets")
(define-category ?N "Korean Hangul characters of 2-byte character sets")
(define-category ?Y "Cyrillic characters of 2-byte character sets")
(define-category ?I "Indian Glyphs")
60
61 ;; For phonetic classifications.
62
;; Digit mnemonics classify characters phonetically.  Each entry is
;; (MNEMONIC . DESCRIPTION); entries are defined in the order listed.
(mapcar (function
         (lambda (entry)
           (define-category (car entry) (cdr entry))))
        '((?0 . "consonant")
          (?1 . "base vowel")
          (?2 . "upper diacritical mark (including upper vowel)")
          (?3 . "lower diacritical mark (including lower vowel)")
          (?4 . "tone mark")
          (?5 . "vowel")
          (?6 . "digit")
          (?7 . "vowel-modifying diacritical mark")
          (?8 . "vowel-signs.")))
72
73 ;; For filling.
;; `|' marks characters at which a line may be broken while filling.
;; `>' and `<' must be kept for `kinsoku' processing; see the comments
;; in kinsoku.el.
(let ((line-break-categories
       '((?| . "While filling, we can break a line at this character.")
         (?> . "A character which can't be placed at beginning of line.")
         (?< . "A character which can't be placed at end of line."))))
  (while line-break-categories
    (define-category (car (car line-break-categories))
      (cdr (car line-break-categories)))
    (setq line-break-categories (cdr line-break-categories))))
80
81 \f
82 ;;; Setting syntax and category.
83
84 ;; ASCII
85
;; Every character code from 32 through 126 gets both the category
;; `a' (ASCII) and the category `l' (Latin).
(let ((code 32))
  (while (<= code 126)
    (mapcar (function
             (lambda (category)
               (modify-category-entry code category)))
            '(?a ?l))
    (setq code (1+ code))))
91
92 ;; Arabic character set
93
;; For each Arabic charset, the character returned by `make-char' with
;; no position codes gets word syntax and the category `b' (Arabic).
(mapcar (function
         (lambda (charset)
           (let ((generic (make-char charset)))
             (modify-syntax-entry generic "w")
             (modify-category-entry generic ?b))))
        '(arabic-iso8859-6
          arabic-digit
          arabic-1-column
          arabic-2-column))
102
103 ;; Chinese character set (GB2312)
104
;; GB2312 syntax: word syntax for the charset as a whole, then symbol
;; syntax for rows 33, 34 and 41, then matching open/close delimiter
;; syntax for the bracket characters.  The order matters because the
;; later, more specific entries override the earlier ones.
(modify-syntax-entry (make-char 'chinese-gb2312) "w")
(let ((symbol-rows '(33 34 41)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'chinese-gb2312 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))
;; Each entry is (CHAR . SYNTAX-DESCRIPTOR); the descriptor names the
;; matching delimiter.  All openers come first, then all closers.
(let ((paren-syntax
       '((?\\e$A!2\e(B . "(\e$A!3\e(B")
         (?\\e$A!4\e(B . "(\e$A!5\e(B")
         (?\\e$A!6\e(B . "(\e$A!7\e(B")
         (?\\e$A!8\e(B . "(\e$A!9\e(B")
         (?\\e$A!:\e(B . "(\e$A!;\e(B")
         (?\\e$A!<\e(B . "(\e$A!=\e(B")
         (?\\e$A!>\e(B . "(\e$A!?\e(B")
         (?\\e$A!3\e(B . ")\e$A!2\e(B")
         (?\\e$A!5\e(B . ")\e$A!4\e(B")
         (?\\e$A!7\e(B . ")\e$A!6\e(B")
         (?\\e$A!9\e(B . ")\e$A!8\e(B")
         (?\\e$A!;\e(B . ")\e$A!:\e(B")
         (?\\e$A!=\e(B . ")\e$A!<\e(B")
         (?\\e$A!?\e(B . ")\e$A!>\e(B"))))
  (while paren-syntax
    (modify-syntax-entry (car (car paren-syntax)) (cdr (car paren-syntax)))
    (setq paren-syntax (cdr paren-syntax))))
123
;; GB2312 categories.  The charset as a whole is `c' (Chinese) and
;; `|' (a line may be broken at these characters while filling).
(modify-category-entry (make-char 'chinese-gb2312) ?c)
(modify-category-entry (make-char 'chinese-gb2312) ?\|)
;; Rows 35 through 39 get the 2-byte row categories: alpha numeric,
;; Hiragana, Katakana, Greek and Cyrillic.  Each row is set exactly
;; once (the row 35 entry used to be repeated after row 39).
(modify-category-entry (make-char 'chinese-gb2312 35) ?A)
(modify-category-entry (make-char 'chinese-gb2312 36) ?H)
(modify-category-entry (make-char 'chinese-gb2312 37) ?K)
(modify-category-entry (make-char 'chinese-gb2312 38) ?G)
(modify-category-entry (make-char 'chinese-gb2312 39) ?Y)
;; Rows 48 through 126 are `C' (Chinese (Han) characters).
(let ((row 48))
  (while (< row 127)
    (modify-category-entry (make-char 'chinese-gb2312 row) ?C)
    (setq row (1+ row))))
136
137 ;; Chinese character set (BIG5)
138
;; Both Big5 charsets are treated alike: word syntax, plus the
;; categories `c' (Chinese), `C' (Han characters of 2-byte character
;; sets) and `|' (line may be broken here while filling).
(mapcar (function
         (lambda (big5-charset)
           (let ((generic (make-char big5-charset))
                 (categories '(?c ?C ?\|)))
             (modify-syntax-entry generic "w")
             (while categories
               (modify-category-entry generic (car categories))
               (setq categories (cdr categories))))))
        '(chinese-big5-1 chinese-big5-2))
152
153
154 ;; Chinese character set (CNS11643)
155
;; Each CNS11643 plane gets word syntax and the categories `c', `C'
;; and `|', applied to the character `make-char' returns for the
;; charset with no position codes.
(mapcar (function
         (lambda (plane)
           (let ((generic (make-char plane)))
             (modify-syntax-entry generic "w")
             (modify-category-entry generic ?c)
             (modify-category-entry generic ?C)
             (modify-category-entry generic ?|))))
        '(chinese-cns11643-1
          chinese-cns11643-2
          chinese-cns11643-3
          chinese-cns11643-4
          chinese-cns11643-5
          chinese-cns11643-6
          chinese-cns11643-7))
171
172 ;; Cyrillic character set (ISO-8859-5)
173
;; The whole charset is category `y' (Cyrillic).
(modify-category-entry (make-char 'cyrillic-iso8859-5) ?y)

;; Position codes 160 through 255 get word syntax first ...
(let ((code 160))
  (while (<= code 255)
    (modify-syntax-entry (make-char 'cyrillic-iso8859-5 code) "w")
    (setq code (1+ code))))
;; ... and then three individual characters are switched to
;; punctuation syntax.
(mapcar (function
         (lambda (punct)
           (modify-syntax-entry punct ".")))
        '(?\e,L-\e(B ?\e,Lp\e(B ?\e,L}\e(B))
183
184 ;; Ethiopic character set
185
;; The Ethiopic charset as a whole is category `e'; no syntax entry
;; is set for it here.
(let ((ethiopic-generic (make-char 'ethiopic)))
  (modify-category-entry ethiopic-generic ?e))
187
188 ;; European character set (Latin-1,2,3,4,5)
189
;; All five European Latin charsets share the category `l' (Latin).
(let ((latin-charsets '(latin-iso8859-1
                        latin-iso8859-2
                        latin-iso8859-3
                        latin-iso8859-4
                        latin-iso8859-9)))
  (while latin-charsets
    (modify-category-entry (make-char (car latin-charsets)) ?l)
    (setq latin-charsets (cdr latin-charsets))))
195
196 ;; ISO-8859-1 (Latin-1)
;; Position codes 64 through 127 of Latin-1 get word syntax.
(let ((code 64))
  (while (<= code 127)
    (modify-syntax-entry (make-char 'latin-iso8859-1 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'latin-iso8859-1 32) "w") ; NBSP
;; Two characters are then given symbol syntax; this has to follow
;; the loop above so that it is not overwritten by it.
(mapcar (function
         (lambda (sym)
           (modify-syntax-entry sym "_")))
        '(?\e,AW\e(B ?\e,Aw\e(B))
205
206 ;; ISO-8859-2 (Latin-2)
;; Codes 190 through 254 of Latin-2 get word syntax.
(let ((code 190))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'latin-iso8859-2 code) "w")
    (setq code (1+ code))))
;; Further individual characters that also get word syntax.
(mapcar (function
         (lambda (letter)
           (modify-syntax-entry letter "w")))
        '(?\e,B!\e(B ?\e,B#\e(B ?\e,B%\e(B ?\e,B&\e(B ?\e,B)\e(B ?\e,B*\e(B ?\e,B+\e(B ?\e,B,\e(B ?\e,B.\e(B ?\e,B/\e(B ?\e,B1\e(B ?\e,B3\e(B ?\e,B5\e(B ?\e,B6\e(B ?\e,B9\e(B ?\e,B:\e(B ?\e,B;\e(B ?\e,B<\e(B))
(modify-syntax-entry (make-char 'latin-iso8859-2 160) "w") ; NBSP
;; Two characters are switched to punctuation syntax last, after the
;; range loop above.
(mapcar (function
         (lambda (punct)
           (modify-syntax-entry punct ".")))
        '(?\e,BW\e(B ?\e,Bw\e(B))
218
219 ;; Greek character set (ISO-8859-7)
220
;; The whole charset is category `g' (Greek).
(modify-category-entry (make-char 'greek-iso8859-7) ?g)

;; Codes 182 through 254 get word syntax.
(let ((code 182))
  (while (<= code 254)
    (modify-syntax-entry (make-char 'greek-iso8859-7 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'greek-iso8859-7 160) "w") ; NBSP
;; Three characters are then switched to punctuation syntax; this
;; must come after the range loop.
(mapcar (function
         (lambda (punct)
           (modify-syntax-entry punct ".")))
        '(?\e,F7\e(B ?\e,F;\e(B ?\e,F=\e(B))
231
232 ;; Hebrew character set (ISO-8859-8)
233
;; The whole charset is category `w' (Hebrew).
(modify-category-entry (make-char 'hebrew-iso8859-8) ?w)

;; Codes 224 through 250 get word syntax, as does code 160.
(let ((code 224))
  (while (<= code 250)
    (modify-syntax-entry (make-char 'hebrew-iso8859-8 code) "w")
    (setq code (1+ code))))
(modify-syntax-entry (make-char 'hebrew-iso8859-8 160) "w") ; NBSP
241
242 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
243
;; (CHARSET . CATEGORY): IS 13194 is `i' (Indian); the two glyph
;; charsets are `I' (Indian Glyphs).
(let ((indian-categories '((indian-is13194 . ?i)
                           (indian-2-column . ?I)
                           (indian-1-column . ?I))))
  (while indian-categories
    (modify-category-entry (make-char (car (car indian-categories)))
                           (cdr (car indian-categories)))
    (setq indian-categories (cdr indian-categories))))
247
248 ;; Japanese character set (JISX0201-kana, JISX0201-roman, JISX0208, JISX0212)
249
;; (CHARSET . CATEGORY) pairs applied to each charset as a whole.
;; JISX0208 appears twice: once for `j' (Japanese) and once for `|'
;; (a line may be broken at these characters while filling).
(mapcar (function
         (lambda (entry)
           (modify-category-entry (make-char (car entry)) (cdr entry))))
        '((katakana-jisx0201 . ?k)
          (latin-jisx0201 . ?r)
          (japanese-jisx0208 . ?j)
          (japanese-jisx0212 . ?j)
          (japanese-jisx0208 . ?\|)))
255
256 ;; JISX0208
;; JISX0208 syntax, from least to most specific so that later entries
;; override earlier ones: word syntax for the whole charset, symbol
;; syntax for rows 33, 34 and 40, word syntax again for a handful of
;; individual characters, and finally open/close delimiter syntax for
;; the bracket characters.
(modify-syntax-entry (make-char 'japanese-jisx0208) "w")
(let ((symbol-rows '(33 34 40)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'japanese-jisx0208 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))
(mapcar (function
         (lambda (word-char)
           (modify-syntax-entry word-char "w")))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B ?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
;; Each entry is (CHAR . SYNTAX-DESCRIPTOR); the descriptor names the
;; matching delimiter.  All openers come first, then all closers.
(let ((paren-syntax
       '((?\\e$B!J\e(B . "(\e$B!K\e(B")
         (?\\e$B!N\e(B . "(\e$B!O\e(B")
         (?\\e$B!P\e(B . "(\e$B!Q\e(B")
         (?\\e$B!V\e(B . "(\e$B!W\e(B")
         (?\\e$B!X\e(B . "(\e$B!Y\e(B")
         (?\\e$B!K\e(B . ")\e$B!J\e(B")
         (?\\e$B!O\e(B . ")\e$B!N\e(B")
         (?\\e$B!Q\e(B . ")\e$B!P\e(B")
         (?\\e$B!W\e(B . ")\e$B!V\e(B")
         (?\\e$B!Y\e(B . ")\e$B!X\e(B"))))
  (while paren-syntax
    (modify-syntax-entry (car (car paren-syntax)) (cdr (car paren-syntax)))
    (setq paren-syntax (cdr paren-syntax))))
275
;; Rows 35 through 39 get the 2-byte row categories, as (ROW . CATEGORY):
;; alpha numeric, Hiragana, Katakana, Greek and Cyrillic.
(let ((row-categories '((35 . ?A) (36 . ?H) (37 . ?K) (38 . ?G) (39 . ?Y))))
  (while row-categories
    (modify-category-entry
     (make-char 'japanese-jisx0208 (car (car row-categories)))
     (cdr (car row-categories)))
    (setq row-categories (cdr row-categories))))
;; Rows 48 through 126 are `C' (Chinese (Han) characters).
(let ((han-row 48))
  (while (<= han-row 126)
    (modify-category-entry (make-char 'japanese-jisx0208 han-row) ?C)
    (setq han-row (1+ han-row))))
;; Three individual characters belong to both the Katakana and the
;; Hiragana category.
(mapcar (function
         (lambda (kana-char)
           (modify-category-entry kana-char ?K)
           (modify-category-entry kana-char ?H)))
        '(?\e$B!<\e(B ?\e$B!+\e(B ?\e$B!,\e(B))
;; Nine further individual characters are added to category `C'.
(mapcar (function
         (lambda (han-char)
           (modify-category-entry han-char ?C)))
        '(?\e$B!3\e(B ?\e$B!4\e(B ?\e$B!5\e(B ?\e$B!6\e(B ?\e$B!7\e(B ?\e$B!8\e(B ?\e$B!9\e(B ?\e$B!:\e(B ?\e$B!;\e(B))
294
295 ;; JISX0212
;; JISX0212: word syntax for the whole charset, overridden with symbol
;; syntax for rows 33, 34 and 35.
(modify-syntax-entry (make-char 'japanese-jisx0212) "w")
(let ((symbol-rows '(33 34 35)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'japanese-jisx0212 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))

;; The whole charset is category `C' (Chinese (Han) characters).
(modify-category-entry (make-char 'japanese-jisx0212) ?C)
302
303 ;; JISX0201-Kana
;; JISX0201 kana: word syntax for the whole charset, then punctuation
;; syntax for five individual characters.
(modify-syntax-entry (make-char 'katakana-jisx0201) "w")
(mapcar (function
         (lambda (punct)
           (modify-syntax-entry punct ".")))
        '(?\e(I!\e(B ?\e(I"\e(B ?\e(I#\e(B ?\e(I$\e(B ?\e(I%\e(B))
309
310 ;; Korean character set (KSC5601)
311
;; KSC5601 syntax: word syntax for the whole charset, overridden with
;; symbol syntax for rows 33, 34 and 38 through 41.
(modify-syntax-entry (make-char 'korean-ksc5601) "w")
(let ((symbol-rows '(33 34 38 39 40 41)))
  (while symbol-rows
    (modify-syntax-entry (make-char 'korean-ksc5601 (car symbol-rows)) "_")
    (setq symbol-rows (cdr symbol-rows))))

;; KSC5601 categories: `h' (Korean) for the whole charset, then the
;; 2-byte row categories as (ROW . CATEGORY) pairs.
(modify-category-entry (make-char 'korean-ksc5601) ?h)
(mapcar (function
         (lambda (entry)
           (modify-category-entry (make-char 'korean-ksc5601 (car entry))
                                  (cdr entry))))
        '((35 . ?A)
          (37 . ?G)
          (42 . ?H)
          (43 . ?K)
          (44 . ?Y)))
326
327 ;; Thai character set (TIS620)
328
;; The Thai charset as a whole is category `t' (Thai).
329 (modify-category-entry (make-char 'thai-tis620) ?t)
330
;; Walk a table of (CHARS SYNTAX CATEGORY) rows and apply SYNTAX and
;; CATEGORY to every character named by CHARS.  Within CHARS a `-'
;; between two characters denotes the whole range between them.
331 (let ((deflist '(;; chars syntax category
332 ("\e,T!\e(B-\e,TCEG\e(B-\e,TN\e(B" "w" ?0) ; consonant
333 ("\e,TDFPRS`\e(B-\e,Te\e(B" "w" ?1) ; vowel base
334 ("\e,TQT\e(B-\e,TWgn\e(B" "w" ?2) ; vowel upper
335 ("\e,TX\e(B-\e,TZ\e(B" "w" ?3) ; vowel lower
336 ("\e,Th\e(B-\e,Tm\e(B" "w" ?4) ; tone mark
337 ("\e,TOfp\e(B-\e,Ty\e(B" "w" ?0) ; digit and misc
338 ("\e,T_oz{\e(B" "_" ?0) ; symbol
339 ))
340 elm chars len syntax category to ch i)
341 (while deflist
;; Unpack the current row and restart the index at the beginning of
;; its CHARS string.
342 (setq elm (car deflist))
343 (setq chars (car elm)
344 len (length chars)
345 syntax (nth 1 elm)
346 category (nth 2 elm)
347 i 0)
348 (while (< i len)
;; On `-', only the upper bound `to' is taken from the next character;
;; `ch' keeps the value left by the previous pass (one past the last
;; character processed), so the loop below continues from there up to
;; `to'.  Otherwise `ch' and `to' both become the character at I and
;; exactly that one character is processed.
349 (if (= (aref chars i) ?-)
350 (setq i (1+ i)
351 to (sref chars i))
352 (setq ch (sref chars i)
353 to ch))
354 (while (<= ch to)
355 (modify-syntax-entry ch syntax)
356 (modify-category-entry ch category)
357 (setq ch (1+ ch)))
;; Step over the character just consumed; the step is `char-bytes'
;; of it, so I is presumably a byte index into CHARS -- confirm
;; against the definition of `sref'.
358 (setq i (+ i (char-bytes to))))
359 (setq deflist (cdr deflist))))
360
361 ;; Vietnamese character set
362
;; Both VISCII charsets get word syntax and the category `v'
;; (Vietnamese).  They also get `l' (Latin) so that they can form a
;; word together with Latin characters.
(mapcar (function
         (lambda (viscii-charset)
           (let ((generic (make-char viscii-charset)))
             (modify-syntax-entry generic "w")
             (modify-category-entry generic ?v)
             (modify-category-entry generic ?l))))
        '(vietnamese-viscii-lower vietnamese-viscii-upper))
372
373 \f
374 ;;; Setting word boundary.
375
;; Both variables hold lists of (CATEGORY1 . CATEGORY2) pairs.
(setq word-combining-categories
      (list (cons ?l ?l))               ; Latin - Latin

      word-separating-categories        ; (2-byte character sets)
      (list (cons ?A ?K)                ; Alpha numeric - Katakana
            (cons ?A ?C)                ; Alpha numeric - Chinese
            (cons ?H ?A)                ; Hiragana - Alpha numeric
            (cons ?H ?K)                ; Hiragana - Katakana
            (cons ?H ?C)                ; Hiragana - Chinese
            (cons ?K ?A)                ; Katakana - Alpha numeric
            (cons ?K ?C)                ; Katakana - Chinese
            (cons ?C ?A)                ; Chinese - Alpha numeric
            (cons ?C ?K)))              ; Chinese - Katakana