]> code.delx.au - gnu-emacs/blob - lisp/international/characters.el
Split XEmacs/Emacs definitions and sample setup code into separate files
[gnu-emacs] / lisp / international / characters.el
1 ;;; characters.el --- set syntax and category for multibyte characters
2
3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
9 ;; Copyright (C) 2003
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
12
13 ;; Keywords: multibyte character, character set, syntax, category
14
15 ;; This file is part of GNU Emacs.
16
17 ;; GNU Emacs is free software; you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation; either version 2, or (at your option)
20 ;; any later version.
21
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
26
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs; see the file COPYING. If not, write to the
29 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
30 ;; Boston, MA 02110-1301, USA.
31
32 ;;; Commentary:
33
34 ;;; Code:
35
36 ;;; Predefined categories.
37
38 ;; For each character set.
39
;; Every predefined category, registered in the order listed.  Each
;; entry is (MNEMONIC . DOCSTRING).
(dolist (spec
         '(;; One category per character set.
           (?a . "ASCII")
           (?l . "Latin")
           (?t . "Thai")
           (?g . "Greek")
           (?b . "Arabic")
           (?w . "Hebrew")
           (?y . "Cyrillic")
           (?k . "Japanese katakana")
           (?r . "Japanese roman")
           (?c . "Chinese")
           (?j . "Japanese")
           (?h . "Korean")
           (?e . "Ethiopic (Ge'ez)")
           (?v . "Vietnamese")
           (?i . "Indian")
           (?o . "Lao")
           (?q . "Tibetan")

           ;; One category per group (row) of 2-byte character sets.
           (?A . "Alpha-numeric characters of 2-byte character sets")
           (?C . "Chinese (Han) characters of 2-byte character sets")
           (?G . "Greek characters of 2-byte character sets")
           (?H . "Japanese Hiragana characters of 2-byte character sets")
           (?K . "Japanese Katakana characters of 2-byte character sets")
           (?N . "Korean Hangul characters of 2-byte character sets")
           (?Y . "Cyrillic characters of 2-byte character sets")
           (?I . "Indian Glyphs")

           ;; Phonetic classifications.
           (?0 . "consonant")
           (?1 . "base (independent) vowel")
           (?2 . "upper diacritical mark (including upper vowel)")
           (?3 . "lower diacritical mark (including lower vowel)")
           (?4 . "tone mark")
           (?5 . "symbol")
           (?6 . "digit")
           (?7 . "vowel-modifying diacritical mark")
           (?8 . "vowel-signs")
           (?9 . "semivowel lower")

           ;; Filling.
           (?| . "While filling, we can break a line at this character.")

           ;; Indentation calculation.
           (?\s . "This character counts as a space for indentation purposes.")

           ;; Keep the following two for `kinsoku' processing.  See the
           ;; comments in kinsoku.el.
           (?> . "A character which can't be placed at beginning of line.")
           (?< . "A character which can't be placed at end of line.")

           ;; Combining.
           (?^ . "Combining diacritic or mark")))
  (define-category (car spec) (cdr spec)))
96 \f
97 ;;; Setting syntax and category.
98
99 ;; ASCII
100
;; Characters 32 through 127 get both the `a' (ASCII) and the `l'
;; (Latin) category.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
104
105 ;; Deal with the CJK charsets first. Since the syntax of blocks is
106 ;; defined per charset, and the charsets may contain e.g. Latin
107 ;; characters, we end up with the wrong syntax definitions if we're
108 ;; not careful.
109
;; Chinese characters (Unicode).
;; Each entry is (RANGE CATEGORY...); the categories are applied to
;; RANGE in the order given.
(dolist (spec '(((#x2E80 . #x312F) ?|)
                ((#x3190 . #x33FF) ?|)
                ((#x3400 . #x9FAF) ?C ?c ?|)
                ((#xF900 . #xFAFF) ?C ?c ?|)
                ((#x20000 . #x2AFFF) ?|)
                ((#x2F800 . #x2FFFF) ?|)))
  (dolist (category (cdr spec))
    (modify-category-entry (car spec) category)))
121
122
;; Chinese character set (GB2312)

;; Rows #x21, #x22 and #x29 get symbol syntax.
(dolist (row '((#x2121 . #x217E)
               (#x2221 . #x227E)
               (#x2921 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_"
                     (car row) (cdr row)))

;; The whole charset is `c'; individual code ranges additionally get a
;; per-script category.  Each entry is (CATEGORY FROM-CODE TO-CODE).
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'chinese-gb2312
                     (car spec) (nth 1 spec) (nth 2 spec)))
138
;; Chinese character set (BIG5)

;; Everything in the charset is `c'; the three code ranges below are
;; additionally `C'.
(map-charset-chars #'modify-category-entry 'big5 ?c)
(dolist (codes '((#xA259 . #xA25F)
                 (#xA440 . #xC67E)
                 (#xC940 . #xF9DF)))
  (map-charset-chars #'modify-category-entry 'big5 ?C
                     (car codes) (cdr codes)))
145
;; Chinese character set (CNS11643)

;; Every plane is `c'.  Plane 1 is `C' only for codes #x4421..#x7E7E;
;; planes 2 through 7 are `C' throughout.
(dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
                 chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
                 chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry plane ?c)
  (cond ((eq plane 'chinese-cns11643-1)
         (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
        (t
         (map-charset-chars #'modify-category-entry plane ?C))))
155
;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)

;; Charset-wide categories: `k' for JISX0201 katakana, `r' for JISX0201
;; roman, then `j' for each of the Japanese charsets listed.
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
(mapc (lambda (charset)
        (map-charset-chars #'modify-category-entry charset ?j))
      '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
        japanese-jisx0213-1 japanese-jisx0213-2))

;; Unicode equivalents of JISX0201-kana
(dolist (category '(?k ?j ?|))
  (modify-category-entry '(#xff61 . #xff9f) category))

;; Katakana block
;; ?K is double width, ?k isn't specified
(dolist (category '(?K ?|))
  (modify-category-entry '(#x30a0 . #x30ff) category))

;; Hiragana block
;; ?H is actually defined to be double width, so only the line-break
;; category is set here; the ?H entry stays disabled:
;;(modify-category-entry '(#x3040 . #x309d) ?H)
(modify-category-entry '(#x3040 . #x309d) ?|)
184
;; JISX0208

;; Rows #x21-#x22 and #x28 get symbol syntax.
(dolist (rows '((#x2121 . #x227E)
                (#x2821 . #x287E)))
  (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_"
                     (car rows) (cdr rows)))

;; The prolonged sound mark, voicing marks, iteration marks and similar
;; characters sit in the symbol rows above but behave like letters, so
;; give each one word syntax.  (The loop must use the loop variable; the
;; earlier `(car chars)' only ever touched the first element.)
(dolist (ch '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-syntax-entry ch "w"))

;; Per-row categories.  Each entry is (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2321 #x237E)
                (?H #x2421 #x247E)
                (?K #x2521 #x257E)
                (?G #x2621 #x267E)
                (?Y #x2721 #x277E)
                (?C #x3021 #x7E7E)))
  (map-charset-chars #'modify-category-entry 'japanese-jisx0208
                     (car spec) (nth 1 spec) (nth 2 spec)))

;; Individual marks: the prolonged sound mark is katakana, the voicing
;; marks are both katakana and hiragana, and the rest are treated as Han.
(modify-category-entry ?ー ?K)
(dolist (ch '(?゛ ?゜))
  (modify-category-entry ch ?K)
  (modify-category-entry ch ?H))
(dolist (ch '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
  (modify-category-entry ch ?C))

;; JISX0212

(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
212
;; JISX0201-Kana

;; The characters are written as code points so that the halfwidth forms
;; cannot be confused with (or silently folded into) their fullwidth
;; counterparts, which belong to JISX0208 and are handled there.

;; Halfwidth full stop, comma and middle dot are punctuation.
(dolist (ch '(#xFF61                    ; HALFWIDTH IDEOGRAPHIC FULL STOP
              #xFF64                    ; HALFWIDTH IDEOGRAPHIC COMMA
              #xFF65))                  ; HALFWIDTH KATAKANA MIDDLE DOT
  (modify-syntax-entry ch "."))

;; Halfwidth corner brackets form a delimiter pair: the left one opens
;; and the right one closes (it was previously given open syntax too).
(modify-syntax-entry #xFF62 (string ?\( #xFF63)) ; HALFWIDTH LEFT CORNER BRACKET
(modify-syntax-entry #xFF63 (string ?\) #xFF62)) ; HALFWIDTH RIGHT CORNER BRACKET
222
;; Korean character set (KSC5601)

;; The whole charset is `h'.
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)

;; Code ranges with symbol syntax.
(dolist (codes '((#x2121 . #x227E)
                 (#x2621 . #x277E)
                 (#x2830 . #x287E)
                 (#x2930 . #x297E)))
  (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_"
                     (car codes) (cdr codes)))

;; Per-range script categories.  Each entry is (CATEGORY FROM-CODE TO-CODE).
(dolist (spec '((?A #x2330 #x2339)
                (?A #x2341 #x235A)
                (?A #x2361 #x237A)
                (?G #x2521 #x257E)
                (?H #x2A21 #x2A7E)
                (?K #x2B21 #x2B7E)
                (?Y #x2C21 #x2C7E)))
  (map-charset-chars #'modify-category-entry 'korean-ksc5601
                     (car spec) (nth 1 spec) (nth 2 spec)))
238
;; These are in more than one charset.
;; PARENS is a flat string of opener/closer pairs: the character at each
;; even index opens and the one right after it closes.  Each is given
;; paren syntax with the other as its matching character.
(let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                      "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                      ;; Fullwidth ( ) [ ] { }, given as code points so
                      ;; they cannot degrade into their ASCII lookalikes.
                      (string #xFF08 #xFF09 #xFF3B #xFF3D #xFF5B #xFF5D)))
      open close)
  (dotimes (i (/ (length parens) 2))
    (setq open (aref parens (* i 2))
          close (aref parens (1+ (* i 2))))
    (modify-syntax-entry open (format "(%c" close))
    (modify-syntax-entry close (format ")%c" open))))
249
;; Arabic character set

;; Category `b' for each Arabic charset, then for three Unicode ranges.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(dolist (range '((#x600 . #x6ff)
                 (#xfb50 . #xfdff)
                 (#xfe70 . #xfefe)))
  (modify-category-entry range ?b))
262
;; Cyrillic character set (ISO-8859-5)

;; The numero sign is punctuation.
(modify-syntax-entry #x2116 ".")        ; № NUMERO SIGN
266
;; Ethiopic character set

(modify-category-entry '(#x1200 . #x1399) ?e)
(modify-category-entry '(#x2d80 . #x2dde) ?e)

;; Ethiopic punctuation U+1361..U+1368 gets punctuation syntax.
;; NOTE(review): the list used to carry six further entries that are
;; blank in this copy of the file (presumably Emacs-specific Ethiopic
;; characters outside Unicode -- TODO confirm and restore them from a
;; pristine copy).  Left in place they would read as `? ' (SPC) and
;; unbalance the list, so they are omitted here.
(dolist (ch '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
  (modify-syntax-entry ch "."))

(map-charset-chars #'modify-category-entry 'ethiopic ?e)
276
;; Hebrew character set (ISO-8859-8)

;; These Hebrew marks are punctuation.
(dolist (ch '(#x5be                     ; MAQAF
              #x5c0                     ; PASEQ
              #x5c3                     ; SOF PASUQ
              #x5f3                     ; GERESH
              #x5f4))                   ; GERSHAYIM
  (modify-syntax-entry ch "."))
284
;; Indian character set (IS 13194 and other Emacs original Indian charsets)

;; Category `i' for U+0901..U+0970 and for both Indian charsets.
(modify-category-entry '(#x901 . #x970) ?i)
(dolist (charset '(indian-is13194 indian-2-column))
  (map-charset-chars #'modify-category-entry charset ?i))
290
;; Lao character set

(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)

;; Each table entry is (CHARS SYNTAX CATEGORY).  CHARS lists single
;; characters; a `-' extends the preceding character into a range that
;; ends at the character after the `-'.  Every character covered gets
;; CATEGORY, and also SYNTAX unless SYNTAX is "w".
(let ((table '(("ກ-ຮ" "w" ?0)            ; consonant
               ("ະາຳຽເ-ໄ" "w" ?1)        ; vowel base
               ("ັິ-ືົໍ" "w" ?2)          ; vowel upper
               ("ຸູ" "w" ?3)             ; vowel lower
               ("່-໋" "w" ?4)            ; tone mark
               ("ຼຽ" "w" ?9)             ; semivowel lower
               ("໐-໙" "w" ?6)            ; digit
               ("ຯໆ" "_" ?5)))           ; symbol
      cur last)
  (dolist (entry table)
    (let* ((chars (car entry))
           (syntax (nth 1 entry))
           (category (nth 2 entry))
           (len (length chars))
           (pos 0))
      (while (< pos len)
        (cond ((= (aref chars pos) ?-)
               ;; Range: CUR already points one past the range start,
               ;; which the previous step handled as a single character.
               (setq pos (1+ pos)
                     last (aref chars pos)))
              (t
               (setq cur (aref chars pos)
                     last cur)))
        (while (<= cur last)
          (or (string-equal syntax "w")
              (modify-syntax-entry cur syntax))
          (modify-category-entry cur category)
          (setq cur (1+ cur)))
        (setq pos (1+ pos))))))
326
;; Thai character set (TIS620)

(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)

;; Rows are (CHARS SYNTAX CATEGORY).  Inside CHARS a `-' turns the
;; character before it and the character after it into an inclusive
;; range.  All covered characters receive CATEGORY; SYNTAX is applied
;; only when it is something other than "w".
(let ((rows '(;; chars syntax category
              ("ก-รลว-ฮ" "w" ?0)         ; consonant
              ("ฤฦะาำเ-ๅ" "w" ?1)        ; vowel base
              ("ัิ-ื็๎" "w" ?2)          ; vowel upper
              ("ุ-ฺ" "w" ?3)             ; vowel lower
              ("่-ํ" "w" ?4)             ; tone mark
              ("๐-๙" "w" ?6)             ; digit
              ("ฯๆ฿๏๚๛" "_" ?5)))        ; symbol
      code end)
  (dolist (row rows)
    (let ((chars (nth 0 row))
          (syntax (nth 1 row))
          (category (nth 2 row))
          (idx 0))
      (while (< idx (length chars))
        (if (/= (aref chars idx) ?-)
            (setq code (aref chars idx)
                  end code)
          ;; After a single-character step CODE is one past the range
          ;; start, so the loop below covers the rest of the range.
          (setq idx (1+ idx)
                end (aref chars idx)))
        (while (<= code end)
          (if (not (string-equal syntax "w"))
              (modify-syntax-entry code syntax))
          (modify-category-entry code category)
          (setq code (1+ code)))
        (setq idx (1+ idx))))))
362
;; Tibetan character set

(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)

;; Each entry is (CHARS SYNTAX CATEGORY).  CHARS lists single characters
;; and inclusive ranges written FROM-TO.  Every covered character gets
;; CATEGORY, and also SYNTAX unless SYNTAX is "w".
;;
;; A `-' that lacks a start or an end character is skipped instead of
;; indexing past the end of the string.
;; NOTE(review): the two "-" entries below appear to have lost both of
;; their endpoint characters in this copy of the file -- TODO restore
;; them from a pristine copy.  They are kept verbatim and are now
;; harmless no-ops.
(let ((deflist '(;; chars syntax category
                 ("ཀ-ཀྵཪ" "w" ?0)        ; consonant
                 ("ྐ-ྐྵྺྻྼ" "w" ?0)      ;
                 ("-" "w" ?0)            ;
                 ("-" "w" ?0)            ;
                 ("ིེཻོཽྀ" "w" ?2)        ; upper vowel
                 ("ཾྂྃ྆྇ྈྉྊྋ" "w" ?2)     ; upper modifier
                 ("྄ཱུ༙༵༷" "w" ?3)        ; lower vowel/modifier
                 ("཰" "w" ?3)            ; invisible vowel a
                 ("༠-༩༪-༳" "w" ?6)       ; digit
                 ("་།-༒༔ཿ" "." ?|)       ; line-break char
                 ("་།༏༐༑༔ཿ" "." ?|)      ;
                 ("༈་།-༒༔ཿ༽༴" "." ?>)    ; prohibition
                 ("་།༏༐༑༔ཿ" "." ?>)      ;
                 ("ༀ-༊༼࿁࿂྅" "." ?<)     ; prohibition
                 ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
                 )))
  (dolist (elm deflist)
    (let* ((chars (car elm))
           (syntax (nth 1 elm))
           (category (nth 2 elm))
           (len (length chars))
           (i 0)
           ch to)
      (while (< i len)
        (setq ch (aref chars i))
        (if (= ch ?-)
            ;; Stray range operator with nothing to attach to: skip it.
            (setq i (1+ i))
          (if (and (< (+ i 2) len)
                   (= (aref chars (1+ i)) ?-))
              ;; CH-TO range: consume all three characters.
              (setq to (aref chars (+ i 2))
                    i (+ i 3))
            ;; Single character.
            (setq to ch
                  i (1+ i)))
          (while (<= ch to)
            (unless (string-equal syntax "w")
              (modify-syntax-entry ch syntax))
            (modify-category-entry ch category)
            (setq ch (1+ ch))))))))
407
;; Vietnamese character set

;; To make a word with Latin characters, both VISCII charsets get the
;; `l' category as well as `v'.
(dolist (charset '(vietnamese-viscii-lower vietnamese-viscii-upper))
  (dolist (category '(?l ?v))
    (map-charset-chars #'modify-category-entry charset category)))

;; For every code point 32..127, pair the upper-case VISCII character
;; with the lower-case one at the same code, and put `v' on whatever
;; `encode-char' returns for each of them under `ucs'.
(let ((tbl (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower tbl)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))
429
430
;; Latin

(modify-category-entry '(#x80 . #x024F) ?l)

;; Set up case pairs and syntax in the standard case table for Latin,
;; Greek, Armenian and Cyrillic letters, plus syntax for general
;; punctuation and several symbol blocks.  TBL is the standard case
;; table; C is a scratch code point shared by the loops below (several
;; consecutive loops deliberately continue from where the previous one
;; left C).
(let ((tbl (standard-case-table)) c)

  ;; Latin-1

  ;; Fixme: Some of the non-word syntaxes here perhaps should be
  ;; reviewed.  (Note that the following all implicitly have word
  ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
  ;; relating Unicode categories to Emacs syntax codes.

  ;; NBSP isn't semantically interchangeable with other whitespace chars,
  ;; so it's more like punctuation.  NBSP and SOFT HYPHEN are invisible
  ;; in source text, so they are given as code points.
  (set-case-syntax #xA0 "." tbl)        ; NO-BREAK SPACE
  (set-case-syntax ?¡ "." tbl)
  (set-case-syntax ?¦ "_" tbl)
  (set-case-syntax ?§ "." tbl)
  (set-case-syntax ?© "_" tbl)
  (set-case-syntax-delims 171 187 tbl)  ; « »
  (set-case-syntax ?¬ "_" tbl)
  (set-case-syntax #xAD "_" tbl)        ; SOFT HYPHEN
  (set-case-syntax ?® "_" tbl)
  (set-case-syntax ?° "_" tbl)
  (set-case-syntax ?± "_" tbl)
  (set-case-syntax ?µ "_" tbl)
  (set-case-syntax ?· "_" tbl)
  (set-case-syntax ?¼ "_" tbl)
  (set-case-syntax ?½ "_" tbl)
  (set-case-syntax ?¾ "_" tbl)
  (set-case-syntax ?¿ "." tbl)
  ;; À..Þ pair with the letter 32 above.  This sweep also pairs × with
  ;; ÷; both are reset to symbol syntax right after.
  (let ((c 192))
    (while (<= c 222)
      (set-case-syntax-pair c (+ c 32) tbl)
      (setq c (1+ c))))
  (set-case-syntax ?× "_" tbl)
  (set-case-syntax ?ß "w" tbl)
  (set-case-syntax ?÷ "_" tbl)
  ;; See below for ÿ.

  ;; Latin Extended-A, Latin Extended-B
  (setq c #x0100)
  (while (<= c #x0233)
    ;; Even code point = upper case, next odd one = lower case.
    (and (or (<= c #x012e)
             (and (>= c #x014a) (<= c #x0177)))
         (zerop (% c 2))
         (set-case-syntax-pair c (1+ c) tbl))
    ;; In U+0139..U+0148 the parity is reversed: odd = upper case.
    (and (>= c #x013a)
         (<= c #x0148)
         (zerop (% c 2))
         (set-case-syntax-pair (1- c) c tbl))
    (setq c (1+ c)))

  ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
  ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
  ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
  ;; SMALL LETTER I.

  ;; We used to set up half of those correspondence unconditionally,
  ;; but that makes searches slow.  So now we don't set up either half
  ;; of these correspondences by default.

  ;; (set-downcase-syntax ?İ ?i tbl)
  ;; (set-upcase-syntax ?I ?ı tbl)

  ;; U+0132/U+0133 (LATIN LIGATURE IJ) are given as code points: written
  ;; as the two ASCII letters "IJ" they are not valid character syntax.
  (set-case-syntax-pair #x0132 #x0133 tbl)
  (set-case-syntax-pair ?Ĵ ?ĵ tbl)
  (set-case-syntax-pair ?Ķ ?ķ tbl)
  (set-case-syntax-pair ?Ÿ ?ÿ tbl)
  (set-case-syntax-pair ?Ź ?ź tbl)
  (set-case-syntax-pair ?Ż ?ż tbl)
  (set-case-syntax-pair ?Ž ?ž tbl)

  ;; Latin Extended-B
  ;; (UPPER . LOWER) pairs, applied in order.  The digraph letters
  ;; U+01C4..U+01CC and U+01F1..U+01F3 are given as code points for the
  ;; same reason as U+0132 above.  The title-case form of each digraph
  ;; is paired with the same lower-case letter as, and after, the
  ;; upper-case form.
  (dolist (pair '((?Ɓ . ?ɓ) (?Ƃ . ?ƃ) (?Ƅ . ?ƅ) (?Ɔ . ?ɔ) (?Ƈ . ?ƈ) (?Ɖ . ?ɖ)
                  (?Ɗ . ?ɗ) (?Ƌ . ?ƌ) (?Ǝ . ?ǝ) (?Ə . ?ə) (?Ɛ . ?ɛ) (?Ƒ . ?ƒ)
                  (?Ɠ . ?ɠ) (?Ɣ . ?ɣ) (?Ɩ . ?ɩ) (?Ɨ . ?ɨ) (?Ƙ . ?ƙ) (?Ɯ . ?ɯ)
                  (?Ɲ . ?ɲ) (?Ɵ . ?ɵ) (?Ơ . ?ơ) (?Ƣ . ?ƣ) (?Ƥ . ?ƥ) (?Ʀ . ?ʀ)
                  (?Ƨ . ?ƨ) (?Ʃ . ?ʃ) (?Ƭ . ?ƭ) (?Ʈ . ?ʈ) (?Ư . ?ư) (?Ʊ . ?ʊ)
                  (?Ʋ . ?ʋ) (?Ƴ . ?ƴ) (?Ƶ . ?ƶ) (?Ʒ . ?ʒ) (?Ƹ . ?ƹ) (?Ƽ . ?ƽ)
                  (#x01C4 . #x01C6)     ; DZ WITH CARON: upper / lower
                  (#x01C5 . #x01C6)     ; DZ WITH CARON: title / lower
                  (#x01C7 . #x01C9)     ; LJ: upper / lower
                  (#x01C8 . #x01C9)     ; LJ: title / lower
                  (#x01CA . #x01CC)     ; NJ: upper / lower
                  (#x01CB . #x01CC)     ; NJ: title / lower
                  (?Ǎ . ?ǎ) (?Ǐ . ?ǐ) (?Ǒ . ?ǒ) (?Ǔ . ?ǔ) (?Ǖ . ?ǖ) (?Ǘ . ?ǘ)
                  (?Ǚ . ?ǚ) (?Ǜ . ?ǜ) (?Ǟ . ?ǟ) (?Ǡ . ?ǡ) (?Ǣ . ?ǣ) (?Ǥ . ?ǥ)
                  (?Ǧ . ?ǧ) (?Ǩ . ?ǩ) (?Ǫ . ?ǫ) (?Ǭ . ?ǭ) (?Ǯ . ?ǯ)
                  ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
                  (#x01F1 . #x01F3)     ; DZ: upper / lower
                  (#x01F2 . #x01F3)     ; DZ: title / lower
                  (?Ǵ . ?ǵ) (?Ƕ . ?ƕ) (?Ƿ . ?ƿ) (?Ǹ . ?ǹ) (?Ǻ . ?ǻ) (?Ǽ . ?ǽ)
                  (?Ǿ . ?ǿ) (?Ȁ . ?ȁ) (?Ȃ . ?ȃ) (?Ȅ . ?ȅ) (?Ȇ . ?ȇ) (?Ȉ . ?ȉ)
                  (?Ȋ . ?ȋ) (?Ȍ . ?ȍ) (?Ȏ . ?ȏ) (?Ȑ . ?ȑ) (?Ȓ . ?ȓ) (?Ȕ . ?ȕ)
                  (?Ȗ . ?ȗ) (?Ș . ?ș) (?Ț . ?ț) (?Ȝ . ?ȝ) (?Ȟ . ?ȟ) (?Ȣ . ?ȣ)
                  (?Ȥ . ?ȥ) (?Ȧ . ?ȧ) (?Ȩ . ?ȩ) (?Ȫ . ?ȫ) (?Ȭ . ?ȭ) (?Ȯ . ?ȯ)
                  (?Ȱ . ?ȱ) (?Ȳ . ?ȳ)))
    (set-case-syntax-pair (car pair) (cdr pair) tbl))

  ;; Latin Extended Additional
  (modify-category-entry '(#x1e00 . #x1ef9) ?l)
  (setq c #x1e00)
  (while (<= c #x1ef9)
    (and (zerop (% c 2))
         (or (<= c #x1e94) (>= c #x1ea0))
         (set-case-syntax-pair c (1+ c) tbl))
    (setq c (1+ c)))

  ;; Greek
  (modify-category-entry '(#x0370 . #x03ff) ?g)
  (setq c #x0370)
  (while (<= c #x03ff)
    (if (or (and (>= c #x0391) (<= c #x03a1))
            (and (>= c #x03a3) (<= c #x03ab)))
        (set-case-syntax-pair c (+ c 32) tbl))
    (and (>= c #x03da)
         (<= c #x03ee)
         (zerop (% c 2))
         (set-case-syntax-pair c (1+ c) tbl))
    (setq c (1+ c)))
  (set-case-syntax-pair ?Ά ?ά tbl)
  (set-case-syntax-pair ?Έ ?έ tbl)
  (set-case-syntax-pair ?Ή ?ή tbl)
  (set-case-syntax-pair ?Ί ?ί tbl)
  (set-case-syntax-pair ?Ό ?ό tbl)
  (set-case-syntax-pair ?Ύ ?ύ tbl)
  (set-case-syntax-pair ?Ώ ?ώ tbl)

  ;; Armenian
  (setq c #x531)
  (while (<= c #x556)
    (set-case-syntax-pair c (+ c #x30) tbl)
    (setq c (1+ c)))

  ;; Greek Extended
  (modify-category-entry '(#x1f00 . #x1fff) ?g)
  (setq c #x1f00)
  (while (<= c #x1fff)
    ;; Up to U+1FA7, a code point whose low nibble is 0..7 is lower case
    ;; and the one 8 above it is upper case.  Exceptions: U+1F50/52/54/56
    ;; and the whole U+1F7x column, which holds only lower-case letters
    ;; whose capitals are listed explicitly below.  (The column test has
    ;; to compare against #x70; comparing the masked value against 7
    ;; never excluded anything.)
    (and (<= (logand c #x000f) 7)
         (<= c #x1fa7)
         (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
         (/= (logand c #x00f0) #x70)
         (set-case-syntax-pair (+ c 8) c tbl))
    (setq c (1+ c)))
  ;; Remaining (UPPER . LOWER) pairs.  Given as code points because the
  ;; oxia letters (e.g. U+1FBB/U+1F71) are canonically equivalent to the
  ;; tonos letters of the basic Greek block and are easily replaced by
  ;; them when the text is normalized.
  (dolist (pair '((#x1fb8 . #x1fb0) (#x1fb9 . #x1fb1) ; alpha vrachy, macron
                  (#x1fba . #x1f70) (#x1fbb . #x1f71) ; alpha varia, oxia
                  (#x1fbc . #x1fb3)                   ; alpha prosgegrammeni
                  (#x1fc8 . #x1f72) (#x1fc9 . #x1f73) ; epsilon varia, oxia
                  (#x1fca . #x1f74) (#x1fcb . #x1f75) ; eta varia, oxia
                  (#x1fcc . #x1fc3)                   ; eta prosgegrammeni
                  (#x1fd8 . #x1fd0) (#x1fd9 . #x1fd1) ; iota vrachy, macron
                  (#x1fda . #x1f76) (#x1fdb . #x1f77) ; iota varia, oxia
                  (#x1fe8 . #x1fe0) (#x1fe9 . #x1fe1) ; upsilon vrachy, macron
                  (#x1fea . #x1f7a) (#x1feb . #x1f7b) ; upsilon varia, oxia
                  (#x1fec . #x1fe5)                   ; rho with dasia
                  (#x1ff8 . #x1f78) (#x1ff9 . #x1f79) ; omicron varia, oxia
                  (#x1ffa . #x1f7c) (#x1ffb . #x1f7d) ; omega varia, oxia
                  (#x1ffc . #x1ff3)))                 ; omega prosgegrammeni
    (set-case-syntax-pair (car pair) (cdr pair) tbl))

  ;; cyrillic
  (modify-category-entry '(#x0400 . #x04FF) ?y)
  (setq c #x0400)
  (while (<= c #x04ff)
    (and (>= c #x0400)
         (<= c #x040f)
         (set-case-syntax-pair c (+ c 80) tbl))
    (and (>= c #x0410)
         (<= c #x042f)
         (set-case-syntax-pair c (+ c 32) tbl))
    (and (zerop (% c 2))
         (or (and (>= c #x0460) (<= c #x0480))
             (and (>= c #x048c) (<= c #x04be))
             (and (>= c #x04d0) (<= c #x04f4)))
         (set-case-syntax-pair c (1+ c) tbl))
    (setq c (1+ c)))
  (set-case-syntax-pair ?Ӂ ?ӂ tbl)
  (set-case-syntax-pair ?Ӄ ?ӄ tbl)
  (set-case-syntax-pair ?Ӈ ?ӈ tbl)
  (set-case-syntax-pair ?Ӌ ?ӌ tbl)
  (set-case-syntax-pair ?Ӹ ?ӹ tbl)

  ;; general punctuation
  ;; The loops in this group run back to back: each continues from the
  ;; value of C left by the previous one unless C is reset explicitly.
  (setq c #x2000)
  (while (<= c #x200b)
    (set-case-syntax c " " tbl)
    (setq c (1+ c)))
  (while (<= c #x200F)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))
  ;; Fixme: These aren't all right:
  (setq c #x2010)
  (while (<= c #x2016)
    (set-case-syntax c "_" tbl)
    (setq c (1+ c)))
  ;; Punctuation syntax for quotation marks (like `)
  (while (<= c #x201f)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))
  ;; Fixme: These aren't all right:
  (while (<= c #x2027)
    (set-case-syntax c "_" tbl)
    (setq c (1+ c)))
  (while (<= c #x206F)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))

  ;; Roman numerals
  (setq c #x2160)
  (while (<= c #x216f)
    (set-case-syntax-pair c (+ c #x10) tbl)
    (setq c (1+ c)))

  ;; Fixme: The following blocks might be better as symbol rather than
  ;; punctuation.
  ;; Arrows
  (setq c #x2190)
  (while (<= c #x21FF)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))
  ;; Mathematical Operators (continues from U+2200)
  (while (<= c #x22FF)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))
  ;; Miscellaneous Technical (continues from U+2300)
  (while (<= c #x23FF)
    (set-case-syntax c "." tbl)
    (setq c (1+ c)))
  ;; Control Pictures (continues from U+2400)
  (while (<= c #x243F)
    (set-case-syntax c "_" tbl)
    (setq c (1+ c)))

  ;; Circled Latin
  (setq c #x24b6)
  (while (<= c #x24cf)
    (set-case-syntax-pair c (+ c 26) tbl)
    (modify-category-entry c ?l)
    (modify-category-entry (+ c 26) ?l)
    (setq c (1+ c)))

  ;; Fullwidth Latin
  (setq c #xff21)
  (while (<= c #xff3a)
    (set-case-syntax-pair c (+ c #x20) tbl)
    (modify-category-entry c ?l)
    (modify-category-entry (+ c #x20) ?l)
    (setq c (1+ c)))

  ;; Combining diacritics
  (modify-category-entry '(#x300 . #x362) ?^)
  ;; Combining marks
  (modify-category-entry '(#x20d0 . #x20e3) ?^)

  ;; Fixme: syntax for symbols &c
  )
768
;; Give paired delimiters matching open/close paren syntax.
;; PAIRS lists (OPEN . CLOSE) code points.  They are spelled numerically
;; rather than as literal characters so that fullwidth and halfwidth
;; forms, and canonically equivalent characters such as U+2329/U+3008,
;; cannot be conflated with their lookalikes.
(let ((pairs
       '((#x2045 . #x2046)              ; square brackets with quill
         (#x207D . #x207E)              ; superscript parentheses
         (#x208D . #x208E)              ; subscript parentheses
         (#x2329 . #x232A)              ; pointing angle brackets
         (#x23B4 . #x23B5)              ; top / bottom square bracket
         (#x2768 . #x2769)              ; ornamental brackets ...
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)              ; mathematical brackets ...
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)              ; CJK brackets ...
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)              ; ornate parentheses
         (#xFE35 . #xFE36)              ; vertical presentation forms ...
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)              ; small form variants ...
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)              ; fullwidth ( )
         (#xFF3B . #xFF3D)              ; fullwidth [ ]
         (#xFF5B . #xFF5D)              ; fullwidth { }
         (#xFF5F . #xFF60)              ; fullwidth white parentheses
         (#xFF62 . #xFF63)              ; halfwidth corner brackets
         )))
  (dolist (pair pairs)
    (let ((open (car pair))
          (close (cdr pair)))
      (modify-syntax-entry open (string ?\( close))
      (modify-syntax-entry close (string ?\) open)))))
826
827 \f
;; For each character set, record the most suitable coding system for
;; encoding it in the charset's `preferred-coding-system' property.

;; Fixme: should this be junked?
(dolist (entry '((latin-iso8859-1 . iso-latin-1)
                 (latin-iso8859-2 . iso-latin-2)
                 (latin-iso8859-3 . iso-latin-3)
                 (latin-iso8859-4 . iso-latin-4)
                 (thai-tis620 . thai-tis620)
                 (greek-iso8859-7 . greek-iso-8bit)
                 (arabic-iso8859-6 . iso-2022-7bit)
                 (hebrew-iso8859-8 . hebrew-iso-8bit)
                 (katakana-jisx0201 . japanese-shift-jis)
                 (latin-jisx0201 . japanese-shift-jis)
                 (cyrillic-iso8859-5 . cyrillic-iso-8bit)
                 (latin-iso8859-9 . iso-latin-5)
                 (japanese-jisx0208-1978 . iso-2022-jp)
                 (chinese-gb2312 . chinese-iso-8bit)
                 (chinese-gbk . chinese-gbk)
                 (gb18030-2-byte . chinese-gb18030)
                 (gb18030-4-byte-bmp . chinese-gb18030)
                 (gb18030-4-byte-smp . chinese-gb18030)
                 (gb18030-4-byte-ext-1 . chinese-gb18030)
                 (gb18030-4-byte-ext-2 . chinese-gb18030)
                 (japanese-jisx0208 . iso-2022-jp)
                 (korean-ksc5601 . iso-2022-kr)
                 (japanese-jisx0212 . iso-2022-jp)
                 (chinese-big5-1 . chinese-big5)
                 (chinese-big5-2 . chinese-big5)
                 (chinese-sisheng . iso-2022-7bit)
                 (ipa . iso-2022-7bit)
                 (vietnamese-viscii-lower . vietnamese-viscii)
                 (vietnamese-viscii-upper . vietnamese-viscii)
                 (arabic-digit . iso-2022-7bit)
                 (arabic-1-column . iso-2022-7bit)
                 (lao . lao)
                 (arabic-2-column . iso-2022-7bit)
                 (indian-is13194 . devanagari)
                 (indian-glyph . devanagari)
                 (tibetan-1-column . tibetan)
                 (ethiopic . iso-2022-7bit)
                 (chinese-cns11643-1 . iso-2022-cn)
                 (chinese-cns11643-2 . iso-2022-cn)
                 (chinese-cns11643-3 . iso-2022-cn)
                 (chinese-cns11643-4 . iso-2022-cn)
                 (chinese-cns11643-5 . iso-2022-cn)
                 (chinese-cns11643-6 . iso-2022-cn)
                 (chinese-cns11643-7 . iso-2022-cn)
                 (indian-2-column . devanagari)
                 (tibetan . tibetan)
                 (latin-iso8859-14 . iso-latin-8)
                 (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
883
884 \f
;; Mark the character ranges that should invoke auto-filling in
;; `auto-fill-chars'.  SPACE and NEWLINE are already set.

(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))
894
895 \f
896 ;;; Setting char-width-table. The default is 1.
897
;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
;; and final characters.
;; Each (FROM . TO) pair below is stored in `char-width-table' with
;; the value 0.
(dolist (range
         '((#x00AD . #x00AD) (#x0300 . #x036F) (#x0483 . #x0489)
           (#x0591 . #x05BD) (#x05BF . #x05BF) (#x05C1 . #x05C2)
           (#x05C4 . #x05C5) (#x05C7 . #x05C7) (#x0600 . #x0603)
           (#x0610 . #x0615) (#x064B . #x065E) (#x0670 . #x0670)
           (#x06D6 . #x06E4) (#x06E7 . #x06E8) (#x06EA . #x06ED)
           (#x070F . #x070F) (#x0711 . #x0711) (#x0730 . #x074A)
           (#x07A6 . #x07B0) (#x07EB . #x07F3) (#x0901 . #x0902)
           (#x093C . #x093C) (#x0941 . #x0948) (#x094D . #x094D)
           (#x0951 . #x0954) (#x0962 . #x0963) (#x0981 . #x0981)
           (#x09BC . #x09BC) (#x09C1 . #x09C4) (#x09CD . #x09CD)
           (#x09E2 . #x09E3) (#x0A01 . #x0A02) (#x0A3C . #x0A3C)
           (#x0A41 . #x0A4D) (#x0A70 . #x0A71) (#x0A81 . #x0A82)
           (#x0ABC . #x0ABC) (#x0AC1 . #x0AC8) (#x0ACD . #x0ACD)
           (#x0AE2 . #x0AE3) (#x0B01 . #x0B01) (#x0B3C . #x0B3C)
           (#x0B3F . #x0B3F) (#x0B41 . #x0B43) (#x0B4D . #x0B56)
           (#x0B82 . #x0B82) (#x0BC0 . #x0BC0) (#x0BCD . #x0BCD)
           (#x0C3E . #x0C40) (#x0C46 . #x0C56) (#x0CBC . #x0CBC)
           (#x0CBF . #x0CBF) (#x0CC6 . #x0CC6) (#x0CCC . #x0CCD)
           (#x0CE2 . #x0CE3) (#x0D41 . #x0D43) (#x0D4D . #x0D4D)
           (#x0DCA . #x0DCA) (#x0DD2 . #x0DD6) (#x0E31 . #x0E31)
           (#x0E34 . #x0E3A) (#x0E47 . #x0E4E) (#x0EB1 . #x0EB1)
           (#x0EB4 . #x0EBC) (#x0EC8 . #x0ECD) (#x0F18 . #x0F19)
           (#x0F35 . #x0F35) (#x0F37 . #x0F37) (#x0F39 . #x0F39)
           (#x0F71 . #x0F7E) (#x0F80 . #x0F84) (#x0F86 . #x0F87)
           (#x0F90 . #x0FBC) (#x0FC6 . #x0FC6) (#x102D . #x1030)
           (#x1032 . #x1037) (#x1039 . #x1039) (#x1058 . #x1059)
           (#x1160 . #x11FF) (#x135F . #x135F) (#x1712 . #x1714)
           (#x1732 . #x1734) (#x1752 . #x1753) (#x1772 . #x1773)
           (#x17B4 . #x17B5) (#x17B7 . #x17BD) (#x17C6 . #x17C6)
           (#x17C9 . #x17D3) (#x17DD . #x17DD) (#x180B . #x180D)
           (#x18A9 . #x18A9) (#x1920 . #x1922) (#x1927 . #x1928)
           (#x1932 . #x1932) (#x1939 . #x193B) (#x1A17 . #x1A18)
           (#x1B00 . #x1B03) (#x1B34 . #x1B34) (#x1B36 . #x1B3A)
           (#x1B3C . #x1B3C) (#x1B42 . #x1B42) (#x1B6B . #x1B73)
           (#x1DC0 . #x1DFF) (#x200B . #x200F) (#x202A . #x202E)
           (#x2060 . #x206F) (#x20D0 . #x20EF) (#x302A . #x302F)
           (#x3099 . #x309A) (#xA806 . #xA806) (#xA80B . #xA80B)
           (#xA825 . #xA826) (#xFB1E . #xFB1E) (#xFE00 . #xFE0F)
           (#xFE20 . #xFE23) (#xFEFF . #xFEFF) (#xFFF9 . #xFFFB)
           (#x10A01 . #x10A0F) (#x10A38 . #x10A3F) (#x1D167 . #x1D169)
           (#x1D173 . #x1D182) (#x1D185 . #x1D18B) (#x1D1AA . #x1D1AD)
           (#x1D242 . #x1D244) (#xE0001 . #xE01EF)))
  (set-char-table-range char-width-table range 0))
1027
;; 2: East Asian Wide and Full-width characters.
;; Each (FROM . TO) pair below is stored in `char-width-table' with
;; the value 2.  This form runs after the width-0 assignments, so its
;; ranges deliberately step around U+302A..U+302F and U+3099..U+309A:
;; those combining marks are given width 0 by the preceding form, and
;; a single range spanning them would overwrite that with 2.
(let ((l '((#x1100 . #x115F)
           (#x2329 . #x232A)
           (#x2E80 . #x3029)            ; stops before U+302A..U+302F
           (#x3030 . #x303E)
           (#x3040 . #x3098)            ; stops before U+3099..U+309A
           (#x309B . #xA4CF)
           (#xAC00 . #xD7A3)
           (#xF900 . #xFAFF)
           (#xFE30 . #xFE6F)
           (#xFF01 . #xFF60)
           (#xFFE0 . #xFFE6)
           (#x20000 . #x2FFFF)
           (#x30000 . #x3FFFF))))
  (dolist (elt l)
    (set-char-table-range char-width-table elt 2)))
1042
;; Other double width
;; The two forms for `ethiopic' and `tibetan' are kept commented out,
;; as in the original.
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'tibetan)
;; Every character range of the charsets listed here gets width 2.
(dolist (cs '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (chars _arg)
     (set-char-table-range char-width-table chars 2))
   cs))
1056
;; Run `optimize-char-table' on the standard case, category and syntax
;; tables, in that order.
(mapc #'optimize-char-table
      (list (standard-case-table)
            (standard-category-table)
            (standard-syntax-table)))
1060
;; Fill `char-script-table' from (FROM TO SCRIPT) triples, and store the
;; list of distinct script symbols, in order of first appearance, in
;; extra slot 0 of that table.
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
(let (seen-scripts)
  (dolist (entry
           '((#x0000 #x007F latin)
             (#x00A0 #x036F latin)
             (#x0370 #x03E1 greek)
             (#x03E2 #x03EF coptic)
             (#x03F0 #x03F3 greek)
             (#x0400 #x04FF cyrillic)
             (#x0530 #x058F armenian)
             (#x0590 #x05FF hebrew)
             (#x0600 #x06FF arabic)
             (#x0700 #x074F syriac)
             (#x07C0 #x07FA nko)
             (#x0780 #x07BF thaana)
             (#x0900 #x097F devanagari)
             (#x0980 #x09FF bengali)
             (#x0A00 #x0A7F gurmukhi)
             (#x0A80 #x0AFF gujarati)
             (#x0B00 #x0B7F oriya)
             (#x0B80 #x0BFF tamil)
             (#x0C00 #x0C7F telugu)
             (#x0C80 #x0CFF kannada)
             (#x0D00 #x0D7F malayalam)
             (#x0D80 #x0DFF sinhala)
             (#x0E00 #x0E5F thai)
             (#x0E80 #x0EDF lao)
             (#x0F00 #x0FFF tibetan)
             (#x1000 #x105F myanmar)
             (#x10A0 #x10FF georgian)
             (#x1100 #x11FF hangul)
             (#x1200 #x139F ethiopic)
             (#x13A0 #x13FF cherokee)
             (#x1400 #x167F canadian-aboriginal)
             (#x1680 #x169F ogham)
             (#x16A0 #x16FF runic)
             (#x1780 #x17FF khmer)
             (#x1800 #x18AF mongolian)
             (#x1E00 #x1EFF latin)
             (#x1F00 #x1FFF greek)
             (#x2000 #x27FF symbol)
             (#x2800 #x28FF braille)
             (#x2D80 #x2DDF ethiopic)
             (#x2E80 #x2FDF han)
             (#x2FF0 #x2FFF ideographic-description)
             (#x3000 #x303F cjk-misc)
             (#x3040 #x30FF kana)
             (#x3100 #x312F bopomofo)
             (#x3130 #x318F hangul)
             (#x3190 #x319F kanbun)
             (#x31A0 #x31BF bopomofo)
             (#x3400 #x9FAF han)
             (#xA000 #xA4CF yi)
             (#xAC00 #xD7AF hangul)
             (#xF900 #xFAFF han)
             (#xFB1D #xFB4F hebrew)
             (#xFB50 #xFDFF arabic)
             (#xFE70 #xFEFC arabic)
             (#xFF00 #xFF5F cjk-misc)
             (#xFF61 #xFF9F kana)
             (#xFFE0 #xFFE6 cjk-misc)
             (#x1D000 #x1D0FF byzantine-musical-symbol)
             (#x1D100 #x1D1FF musical-symbol)
             (#x1D400 #x1D7FF mathematical)
             (#x20000 #x2AFFF han)
             (#x2F800 #x2FFFF han)))
    (let ((from (nth 0 entry))
          (to (nth 1 entry))
          (script (nth 2 entry)))
      (set-char-table-range char-script-table (cons from to) script)
      ;; Scripts are collected newest-first and reversed at the end.
      (unless (memq script seen-scripts)
        (push script seen-scripts))))
  (set-char-table-extra-slot char-script-table 0 (nreverse seen-scripts)))
1134
;; Give every character range of the `tibetan' charset the script
;; `tibetan' in `char-script-table'.
(map-charset-chars
 (lambda (chars _arg)
   (set-char-table-range char-script-table chars 'tibetan))
 'tibetan)
1139
1140 \f
1141 ;;; Setting word boundary.
1142
(defun next-word-boundary-han (pos limit)
  "Return a word boundary position near POS for Han text.
If POS is not greater than LIMIT, search forward: starting at POS,
move over a run of characters in category C and then over a run of
characters in category H, and return the position reached.  If
neither run is present, return POS unchanged.
Otherwise search backward: while POS is greater than LIMIT and the
character before POS has the script `han' in `char-script-table',
step POS back by one; return the resulting position."
  (if (<= pos limit)
      (save-excursion
        (goto-char pos)
        ;; Move only when the regexp actually matched.  After a failed
        ;; `looking-at' the match data still describes some earlier,
        ;; unrelated match, so `match-end' must not be used then.
        (if (looking-at "\\cC+")
            (goto-char (match-end 0)))
        (if (looking-at "\\cH+")
            (goto-char (match-end 0)))
        (point))
    (while (and (> pos limit)
                (eq (aref char-script-table (char-after (1- pos))) 'han))
      (setq pos (1- pos)))
    pos))
1156
(defun next-word-boundary-kana (pos limit)
  "Return a word boundary position near POS for kana text.
If POS is not greater than LIMIT, search forward: starting at POS,
move over a run of category K characters, then a run of category H
characters, then a run of category k characters (each run is
optional), and return the position reached.
Otherwise search backward, never stepping once POS has reached LIMIT
inside a loop.  If the character at POS is in category K or k, step
back over preceding characters in either of those categories.  If
not, step back over preceding category H characters; then, if the
last character examined is in category C, K or A (tested in that
order), step over it and over any preceding characters of that same
category.  Return the resulting position."
  (cond
   ((<= pos limit)
    (save-excursion
      (goto-char pos)
      ;; Consume the three optional runs in a fixed order.
      (dolist (run '("\\cK+" "\\cH+" "\\ck+"))
        (when (looking-at run)
          (goto-char (match-end 0))))
      (point)))
   (t
    (let ((cats (char-category-set (char-after pos)))
          lead)
      (if (or (aref cats ?K) (aref cats ?k))
          ;; POS is on katakana: extend back over K/k characters.
          (while (and (> pos limit)
                      (progn
                        (setq cats (char-category-set
                                    (char-after (1- pos))))
                        (or (aref cats ?K) (aref cats ?k))))
            (setq pos (1- pos)))
        ;; Otherwise extend back over H characters, remembering the
        ;; categories of the last character looked at.
        (while (and (> pos limit)
                    (progn
                      (setq cats (char-category-set
                                  (char-after (1- pos))))
                      (aref cats ?H)))
          (setq pos (1- pos)))
        (setq lead (cond ((aref cats ?C) ?C)
                         ((aref cats ?K) ?K)
                         ((aref cats ?A) ?A)))
        (when lead
          ;; Include the character that ended the H run, then every
          ;; preceding character of the same category.
          (setq pos (1- pos))
          (while (and (> pos limit)
                      (aref (char-category-set (char-after (1- pos)))
                            lead))
            (setq pos (1- pos)))))
      pos))))
1190
;; Walk `char-script-table' and, for every entry whose script is `han'
;; or `kana', store the matching boundary function under the same key
;; in `find-word-boundary-function-table'.  Other scripts are skipped.
(map-char-table
 (lambda (key script)
   (let ((finder (cdr (assq script
                            '((han . next-word-boundary-han)
                              (kana . next-word-boundary-kana))))))
     (when finder
       (set-char-table-range find-word-boundary-function-table
                             key finder))))
 char-script-table)
1200
;; Category pairs installed as `word-combining-categories'.
(setq word-combining-categories
      (list (cons ?l ?l)
            (cons ?C ?C)
            (cons ?C ?H)
            (cons ?C ?K)))
1206
;; Category pairs installed as `word-separating-categories'
;; (2-byte character sets).  Mnemonics used below:
;;   ?A  Alpha numeric    ?H  Hiragana
;;   ?K  Katakana         ?C  Chinese
(setq word-separating-categories
      '((?A . ?K) (?A . ?C)             ; Alpha numeric - Katakana, Chinese
        (?H . ?A) (?H . ?K) (?H . ?C)   ; Hiragana - Alpha numeric, Katakana, Chinese
        (?K . ?A) (?K . ?C)             ; Katakana - Alpha numeric, Chinese
        (?C . ?A) (?C . ?K)))           ; Chinese - Alpha numeric, Katakana
1218
1219 ;;; Local Variables:
1220 ;;; coding: utf-8-emacs
1221 ;;; End:
1222
1223 ;;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1224 ;;; characters.el ends here