1 ;;; thai-word.el -- find Thai word boundaries
3 ;; Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 ;; National Institute of Advanced Industrial Science and Technology (AIST)
5 ;; Registration Number H14PRO021
7 ;; Author: Kenichi HANDA <handa@etl.go.jp>
9 ;; Keywords: thai, word break, emacs
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software: you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation, either version 3 of the License, or
16 ;; (at your option) any later version.
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
26 ;; The used Thai word list has been taken from IBM's ICU4J project
27 ;; (file `thai6.ucs', version 1.4, converted to TIS encoding, with
28 ;; removal of three incorrect entries) to which the following license
31 ;; COPYRIGHT AND PERMISSION NOTICE
34 ;; Copyright (c) 1995-2001 International Business Machines
35 ;; Corporation and others
37 ;; All rights reserved.
40 ;; Permission is hereby granted, free of charge, to any person
41 ;; obtaining a copy of this software and associated documentation
42 ;; files (the "Software"), to deal in the Software without
43 ;; restriction, including without limitation the rights to use,
44 ;; copy, modify, merge, publish, distribute, and/or sell copies of
45 ;; the Software, and to permit persons to whom the Software is
46 ;; furnished to do so, provided that the above copyright notice(s)
47 ;; and this permission notice appear in all copies of the Software
48 ;; and that both the above copyright notice(s) and this permission
49 ;; notice appear in supporting documentation.
51 ;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
52 ;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
53 ;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
54 ;; NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE
55 ;; COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE
56 ;; FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
57 ;; OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
58 ;; PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
59 ;; TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
60 ;; PERFORMANCE OF THIS SOFTWARE.
62 ;; Except as contained in this notice, the name of a copyright
63 ;; holder shall not be used in advertising or otherwise to promote
64 ;; the sale, use or other dealings in this Software without prior
65 ;; written authorization of the copyright holder.
70 ;; This file implements an algorithm to find Thai word breaks using a
71 ;; dictionary. It is based on the C program `cttex' written by
72 ;; Vuthichai Ampornaramveth <vuthi@nii.ac.jp>.
75 ;; Table of Thai words. It is a nested alist (see `nested-alist-p'),
76 ;; which means that you can easily index the list character by
79 (defvar thai-word-table
80 (let ((table (list 'thai-words)))
82 ;;; The following is indented as this to minimize this file size.
6227 "สมบูรณาญาสิทธิราชย์"
10737 (set-nested-alist elt 1 table))
10739 "Nested alist of Thai words.")
10742 (defun thai-update-word-table (file &optional append)
10743 "Update Thai word table by replacing the current word list with
10744 FILE. If called with a prefix argument, FILE is appended instead to
10745 the current word list."
10746 (interactive "FThai word table file: \nP")
10747 (let ((buf (generate-new-buffer "*thai-work*"))
10748 (coding-system-for-read 'thai-tis620)
10749 (table (if append thai-word-table (list 'thai-words))))
10751 (with-current-buffer buf
10752 (insert-file-contents file)
10753 (goto-char (point-min))
10754 (while (re-search-forward "\\ct+" nil t)
10755 (set-nested-alist (match-string 0) 1 table)))
10757 (setq thai-word-table table)))
10760 ;; Two special Thai characters regarded as suffix of words.
10762 (defconst thai-MaiYaMok (make-char 'thai-tis620 ?\xE6))
10763 (defconst thai-PaiYanNoi (make-char 'thai-tis620 ?\xCF))
10766 ;; Find Thai words starting at POS and return a list of positions of
10767 ;; the Thai word ends. It doesn't move point. LIMIT limits the
10768 ;; maximum position. IGNORE is a list of positions to ignore. It is
10769 ;; assumed that all following characters to LIMIT are Thai. If the
10770 ;; following char is not Thai (i.e., POS is equal to LIMIT), return t.
10772 ;; Note that the longest word position comes first.
10774 (defun thai-find-word-ends (pos limit &optional ignore)
10777 (let* ((char (char-after pos))
10778 (this (cdr (assq char (cdr thai-word-table))))
10781 ;; Look up the following character sequence in `thai-word-table'
10782 ;; character by character.
10785 char (or (char-after pos) 0)
10786 category-set (char-category-set char))
10787 ;; If the current sequence is recorded in `thai-word-table'
10788 ;; (i.e. (car THIS) is 1) and the following Thai character is
10789 ;; not an upper-vowel, lower-vowel, or tone-mark, we have
10790 ;; found a possible word ending position.
10791 (if (and (eq (car this) 1)
10792 (not (or (aref category-set ?2)
10793 (aref category-set ?3)
10794 (aref category-set ?4))))
10796 ;; Skip possible Thai suffices.
10797 (while (or (eq char thai-MaiYaMok) (eq char thai-PaiYanNoi))
10799 char (char-after pos)))
10800 ;; Skip character positions in IGNORE list.
10801 (or (memq pos ignore)
10802 (setq positions (cons pos positions)))))
10803 ;; Set up next loop.
10804 (setq this (and (< pos limit) (cdr (assq char this)))))
10808 ;; Move point forward to the end of Thai word which follows point and
10809 ;; update VEC. VEC is a vector of three elements used to cache word
10810 ;; end positions. The Nth element, if non-nil, is a list of end
10811 ;; points of the Nth word, or t indicating that there is no Thai
10812 ;; character. LIMIT limits the point movement.
10814 (defun thai-forward-word-update-info (vec limit)
10815 (let ((pos (point))
10821 ;; If four succeeding Thai words are found, throw t, otherwise
10824 ;; Start with first vector element.
10825 (setq v0 (aref vec 0))
10827 ;; Update VEC if V0 is empty.
10828 (setq v0 (thai-find-word-ends pos limit))
10831 ;; In case we haven't found any wordbreaks resp. point has
10832 ;; reached LIMIT, exit the catch body.
10833 (if (symbolp v0) ; i.e. nil or t?
10835 ;; OK, V0 holds possible word ends for the current position.
10836 ;; We save V0 for later reference.
10838 (setq v1 (aref vec 1))
10839 ;; Now we try all end word positions to find the next word.
10841 (setq pos (car v0))
10843 ;; Update VEC if V1 is empty, ignoring positions already
10844 ;; found -- for Thai, we need the longest match, so if
10847 ;; start(long-word) = start(short-word1)
10848 ;; end(short-word1) = start(short-word2)
10849 ;; end(short-word2) = end(long-word)
10851 ;; only long-word is used.
10852 (setq v1 (thai-find-word-ends pos limit tried))
10855 ;; If point has reached LIMIT, exit the catch body.
10858 ;; Save SECOND-BEST, if this hasn't been done already.
10859 ;; The `second best' solution is the end position of the
10860 ;; longest first word followed by the longest second word.
10863 (setq second-best (cons v0 v1)))
10864 ;; Update the already tried end word positions.
10865 (setq tried (append tried v1))
10866 ;; Now repeat the whole process to find a third word.
10868 (setq v2 (aref vec 2))
10870 (setq pos (car v1))
10872 (setq v2 (thai-find-word-ends pos limit tried))
10876 (setq tried (append tried v2))
10877 ;; And the same for a fourth word.
10879 (setq pos (car v2))
10880 (setq v3 (thai-find-word-ends pos limit tried))
10883 (setq v2 (cdr v2)))
10884 (setq v1 (cdr v1))))
10885 (setq v0 (cdr v0)))
10889 ;; We found four succeeding Thai words (or LIMIT has been
10890 ;; reached). Move to the end of the first word.
10891 (goto-char (car v0))
10892 ;; Update VEC for the next function call. If no larger word
10893 ;; positions have been found, set the corresponding vector
10895 (if (and (consp v1) (< (car v1) (car (aref vec 1))))
10898 (if (and (consp v2) (< (car v2) (car (aref vec 2))))
10901 (aset vec 2 v3)))) ; exit function successfully
10903 ;; We didn't find four consecutive words. If we have found a
10904 ;; `second best' solution and the length of those two words is
10905 ;; longer than the longest word we can see at the current point,
10906 ;; adopt the second best solution. This decision is based on
10907 ;; heuristic tests.
10908 (if (and second-best
10909 (< (car (aref vec 0)) (car (cdr second-best))))
10911 (goto-char (car (car second-best)))
10912 (aset vec 0 (cdr second-best)))
10913 ;; We finally failed to find a word break. For Thai, the best
10914 ;; solution is to extend the first longest word so that the
10915 ;; end point starts a second word.
10916 (setq pos (or (car (aref vec 0)) pos))
10917 (while (and (< pos limit)
10918 (not (setq positions (thai-find-word-ends pos limit))))
10919 (setq pos (1+ pos)))
10921 (aset vec 0 positions))
10926 ;; Return a list of Thai word boundary positions after the current
10927 ;; point. LIMIT, if non-nil, limits the region to check.
10929 (defun thai-find-word-boundaries (&optional limit)
10931 (setq limit (point-max)))
10933 (let ((vec (make-vector 3 nil))
10936 ;; Loop over all (consecutive) Thai regions by using the
10937 ;; character property `t' until LIMIT is reached.
10938 (while (and (< (point) limit)
10939 (re-search-forward "\\ct+" nil t))
10940 (setq this-limit (point))
10941 (goto-char (match-beginning 0))
10942 (fillarray vec nil)
10943 ;; Check the first word, initializing VEC.
10944 (thai-forward-word-update-info vec this-limit)
10945 ;; Then loop over the remaining words in the current Thai
10946 ;; region, collecting the boundaries.
10947 (while (< (point) this-limit)
10948 (setq boundaries (cons (point) boundaries))
10949 (thai-forward-word-update-info vec this-limit)))
10953 (defun thai-break-words (separator &optional limit)
10954 "Break Thai words by inserting a separator string at word boundaries."
10955 (interactive "sSeparator: ")
10957 (let ((boundaries (thai-find-word-boundaries limit)))
10959 (goto-char (car boundaries))
10961 (setq boundaries (cdr boundaries))))))
10964 (defun thai-forward-word (count)
10965 "Move point forward COUNT words considering Thai word boundaries.
10966 If COUNT is negative, move point backward (- COUNT) words."
10970 (skip-syntax-forward "^w")
10971 (if (looking-at "\\ct+")
10972 ;; We have reached a Thai region, so we must do something
10973 ;; special instead of using forward-word.
10974 (let ((start (point))
10975 (limit (match-end 0))
10978 ;; If thai-forward-word has been called within a Thai
10979 ;; region, we must go back until the Thai region starts
10980 ;; to do the contextual analysis for finding word
10982 (while (aref (char-category-set (preceding-char)) ?t)
10984 ;; OK, we ask for the list of word boundaries in
10986 (setq boundaries (nreverse (thai-find-word-boundaries limit)))
10987 ;; Now we search for the next boundary after START.
10988 (while (and boundaries (<= (car boundaries) start))
10989 (setq boundaries (cdr boundaries)))
10990 ;; Adjust loop for next while loop.
10991 (setq count (1- count))
10992 ;; Now we skip Thai words until the BOUNDARIES list is
10993 ;; empty or count-1 words have been passed.
10994 (if (not boundaries)
10996 (while (and (> count 0)
10998 (setq boundaries (cdr boundaries)
11000 ;; If BOUNDARIES is empty, the word counter is not
11001 ;; zero yet (remember that we have decreased COUNT by
11002 ;; one), so we go to LIMIT. Otherwise go to next
11005 (goto-char (car boundaries))
11006 (goto-char limit))))
11009 (setq count (1- count))))
11010 ;; The symmetrical action for negative values.
11013 (skip-syntax-backward "^w")
11014 (if (aref (char-category-set (preceding-char)) ?t)
11015 (let ((start (point))
11016 (limit (if (looking-at "\\ct+") (match-end 0)
11020 (while (aref (char-category-set (preceding-char)) ?t)
11022 (setq boundaries (thai-find-word-boundaries limit))
11023 (while (and boundaries (>= (car boundaries) start))
11024 (setq boundaries (cdr boundaries)))
11025 (setq count (1+ count))
11028 (while (and (< count 0) boundaries)
11029 (setq boundaries (cdr boundaries)
11032 (goto-char (car boundaries))))))
11034 (setq count (1+ count)))))))
11037 (defun thai-backward-word (count)
11038 "Move point backward COUNT words considering Thai word boundaries.
11039 If COUNT is negative, move point forward (- COUNT) words."
11041 (thai-forward-word (- count)))
11044 (defun thai-kill-word (arg)
11045 "Like kill-word but pay attention to Thai word boundaries.
11046 With argument, do this that many times."
11048 (kill-region (point) (progn (thai-forward-word arg) (point))))
11051 (defun thai-backward-kill-word (arg)
11052 "Like backward-kill-word but pay attention to Thai word boundaries."
11054 (thai-kill-word (- arg)))
11057 (defun thai-transpose-words (arg)
11058 "Like transpose-words but pay attention to Thai word boundaries."
11060 (transpose-subr 'thai-forward-word arg))
11062 (defun thai-fill-find-break-point (linebeg)
11063 "Go to a line breaking position near point considering Thai word boundaries."
11064 (let ((pos (point)))
11065 (thai-forward-word -1)
11066 (when (<= (point) linebeg)
11068 (thai-forward-word 1))
11069 (kinsoku linebeg)))
11071 (provide 'thai-word)
11074 ;; Local Variables:
11078 ;; end of thai-word.el