]> code.delx.au - gnu-emacs/blob - lisp/nxml/xmltok.el
Merge from origin/emacs-25
[gnu-emacs] / lisp / nxml / xmltok.el
1 ;;; xmltok.el --- XML tokenization -*- lexical-binding:t -*-
2
3 ;; Copyright (C) 2003, 2007-2016 Free Software Foundation, Inc.
4
5 ;; Author: James Clark
6 ;; Keywords: wp, hypermedia, languages, XML
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24
25 ;; This implements an XML 1.0 parser. It also implements the XML
26 ;; Namespaces Recommendation. It is designed to be conforming, but it
27 ;; works a bit differently from a normal XML parser. An XML document
28 ;; consists of the prolog and an instance. The prolog is parsed as a
29 ;; single unit using `xmltok-forward-prolog'. The instance is
30 ;; considered as a sequence of tokens, where a token is something like
31 ;; a start-tag, a comment, a chunk of data or a CDATA section. The
32 ;; tokenization of the instance is stateless: the tokenization of one
33 ;; part of the instance does not depend on tokenization of the
34 ;; preceding part of the instance. This allows the instance to be
35 ;; parsed incrementally. The main entry point is `xmltok-forward':
36 ;; this can be called at any point in the instance provided it is
37 ;; between tokens.
38 ;;
39 ;; This is a non-validating XML 1.0 processor. It does not resolve
40 ;; parameter entities (including the external DTD subset) and it does
41 ;; not resolve external general entities.
42 ;;
43 ;; It is non-conformant by design in the following respects.
44 ;;
45 ;; 1. It expects the client to detect aspects of well-formedness that
46 ;; are not internal to a single token, specifically checking that
47 ;; end-tags match start-tags and that the instance contains exactly
48 ;; one element.
49 ;;
50 ;; 2. It expects the client to detect duplicate attributes. Detection
51 ;; of duplicate attributes after expansion of namespace prefixes
52 ;; requires the namespace processing state. Detection of duplicate
53 ;; attributes before expansion of namespace prefixes does not, but is
54 ;; redundant given that the client will do detection of duplicate
55 ;; attributes after expansion of namespace prefixes.
56 ;;
57 ;; 3. It allows the client to recover from well-formedness errors.
58 ;; This is essential for use in applications where the document is
59 ;; being parsed during the editing process.
60 ;;
61 ;; 4. It does not support documents that do not conform to the lexical
62 ;; requirements of the XML Namespaces Recommendation (e.g. a document
63 ;; with a colon in an entity name).
64 ;;
65 ;; There are also a number of things that have not yet been
66 ;; implemented that make it non-conformant.
67 ;;
68 ;; 1. It does not implement default attributes. ATTLIST declarations
69 ;; are parsed, but no checking is done on the content of attribute
70 ;; value literals specifying default attribute values, and default
71 ;; attribute values are not reported to the client.
72 ;;
73 ;; 2. It does not implement internal entities containing elements. If
74 ;; an internal entity is referenced and parsing its replacement text
75 ;; yields one or more tags, then it will skip the reference and
76 ;; report this to the client.
77 ;;
78 ;; 3. It does not check the syntax of public identifiers in the DTD.
79 ;;
80 ;; 4. It allows some non-ASCII characters in certain situations where
81 ;; it should not. For example, it only enforces XML 1.0's
82 ;; restrictions on name characters strictly for ASCII characters. The
83 ;; problem here is XML's character model is based squarely on Unicode,
84 ;; whereas Emacs's is not (as of version 21). It is not clear what
85 ;; the right thing to do is.
86
87 ;;; Code:
88
89 (defvar xmltok-type nil)
90 (defvar xmltok-start nil)
91 (defvar xmltok-name-colon nil)
92 (defvar xmltok-name-end nil)
93 (defvar xmltok-replacement nil
94 "String containing replacement for a character or entity reference.")
95
96 (defvar xmltok-attributes nil
97 "List containing attributes of last scanned element.
98 Each member of the list is a vector representing an attribute, which
99 can be accessed using the functions `xmltok-attribute-name-start',
100 `xmltok-attribute-name-colon', `xmltok-attribute-name-end',
101 `xmltok-attribute-value-start', `xmltok-attribute-value-end',
102 `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.")
103
104 (defvar xmltok-namespace-attributes nil
105 "List containing namespace declarations of last scanned element.
106 List has same format as `xmltok-attributes'.")
107
108 (defvar xmltok-dtd nil
109 "Information about the DTD used by `xmltok-forward'.
110 `xmltok-forward-prolog' sets this up.
111
112 It consists of an alist of general entity names vs definitions. The
113 first member of the alist is t if references to entities not in the
114 alist are well-formed \(e.g. because there's an external subset that
115 wasn't parsed).
116
117 Each general entity name is a string. The definition is either nil,
118 a symbol, a string, a cons cell. If the definition is nil, then it
119 means that it's an internal entity but the result of parsing it is
120 unknown. If it is a symbol, then the symbol is either `unparsed',
121 meaning the entity is an unparsed entity, `external', meaning the
122 entity is or references an external entity, `element', meaning the
123 entity includes one or more elements, or `not-well-formed', meaning
124 the replacement text is not well-formed. If the definition is a
125 string, then the replacement text of the entity is that string; this
126 happens only during the parsing of the prolog. If the definition is
127 a cons cell \(ER . AR), then ER specifies the string that results
128 from referencing the entity in element content and AR is either nil,
129 meaning the replacement text included a <, or a string which is the
130 normalized attribute value.")
131
132
133 (defvar xmltok-errors nil
134 "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'.
135 When `xmltok-forward' and `xmltok-forward-prolog' detect a
136 well-formedness error, they will add an entry to the beginning of this
137 list. Each entry is a vector [MESSAGE START END], where MESSAGE is a
138 string giving the error message and START and END are integers
139 indicating the position of the error.")
140
141 (defmacro xmltok-save (&rest body)
142 (declare (indent 0) (debug t))
143 `(let (xmltok-type
144 xmltok-start
145 xmltok-name-colon
146 xmltok-name-end
147 xmltok-replacement
148 xmltok-attributes
149 xmltok-namespace-attributes
150 xmltok-errors)
151 ,@body))
152
153 (defsubst xmltok-attribute-name-start (att)
154 (aref att 0))
155
156 (defsubst xmltok-attribute-name-colon (att)
157 (aref att 1))
158
159 (defsubst xmltok-attribute-name-end (att)
160 (aref att 2))
161
162 (defsubst xmltok-attribute-value-start (att)
163 (aref att 3))
164
165 (defsubst xmltok-attribute-value-end (att)
166 (aref att 4))
167
168 (defsubst xmltok-attribute-raw-normalized-value (att)
169 "Return an object representing the normalized value of ATT.
170 This can be t indicating that the normalized value is the same as
171 the buffer substring from the start to the end of the value, or nil
172 indicating that the value is not well-formed or a string."
173 (aref att 5))
174
175 (defsubst xmltok-attribute-refs (att)
176 "Return a list of the entity and character references in ATT.
177 Each member is a vector [TYPE START END] where TYPE is either char-ref
178 or entity-ref and START and END are integers giving the start and end of
179 the reference. Nested entity references are not included in the list."
180 (aref att 6))
181
182 (defun xmltok-attribute-prefix (att)
183 (let ((colon (xmltok-attribute-name-colon att)))
184 (and colon
185 (buffer-substring-no-properties (xmltok-attribute-name-start att)
186 colon))))
187
188 (defun xmltok-attribute-local-name (att)
189 (let ((colon (xmltok-attribute-name-colon att)))
190 (buffer-substring-no-properties (if colon
191 (1+ colon)
192 (xmltok-attribute-name-start att))
193 (xmltok-attribute-name-end att))))
194
195 (defun xmltok-attribute-value (att)
196 (let ((rnv (xmltok-attribute-raw-normalized-value att)))
197 (and rnv
198 (if (stringp rnv)
199 rnv
200 (buffer-substring-no-properties (xmltok-attribute-value-start att)
201 (xmltok-attribute-value-end att))))))
202
203 (defun xmltok-start-tag-prefix ()
204 (and xmltok-name-colon
205 (buffer-substring-no-properties (1+ xmltok-start)
206 xmltok-name-colon)))
207
208 (defun xmltok-start-tag-local-name ()
209 (buffer-substring-no-properties (1+ (or xmltok-name-colon
210 xmltok-start))
211 xmltok-name-end))
212
213 (defun xmltok-end-tag-prefix ()
214 (and xmltok-name-colon
215 (buffer-substring-no-properties (+ 2 xmltok-start)
216 xmltok-name-colon)))
217
218 (defun xmltok-end-tag-local-name ()
219 (buffer-substring-no-properties (if xmltok-name-colon
220 (1+ xmltok-name-colon)
221 (+ 2 xmltok-start))
222 xmltok-name-end))
223
224 (defun xmltok-start-tag-qname ()
225 (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end))
226
227 (defun xmltok-end-tag-qname ()
228 (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end))
229
230 (defsubst xmltok-make-attribute (name-begin
231 name-colon
232 name-end
233 &optional
234 value-begin
235 value-end
236 raw-normalized-value)
237 "Make an attribute.
238 RAW-NORMALIZED-VALUE is nil if the value is not well-formed,
239 t if the normalized value is the string between VALUE-BEGIN
240 and VALUE-END, otherwise a STRING giving the value."
241 (vector name-begin
242 name-colon
243 name-end
244 value-begin
245 value-end
246 raw-normalized-value
247 nil))
248
249 (defsubst xmltok-error-message (err)
250 (aref err 0))
251
252 (defsubst xmltok-error-start (err)
253 (aref err 1))
254
255 (defsubst xmltok-error-end (err)
256 (aref err 2))
257
258 (defsubst xmltok-make-error (message start end)
259 (vector message start end))
260
261 (defun xmltok-add-error (message &optional start end)
262 (push (xmltok-make-error message
263 (or start xmltok-start)
264 (or end (point)))
265 xmltok-errors))
266
267 (defun xmltok-forward ()
268 (setq xmltok-start (point))
269 (let* ((case-fold-search nil)
270 (space-count (skip-chars-forward " \t\r\n"))
271 (ch (char-after)))
272 (cond ((eq ch ?\<)
273 (cond ((> space-count 0)
274 (setq xmltok-type 'space))
275 (t
276 (forward-char 1)
277 (xmltok-scan-after-lt))))
278 ((eq ch ?\&)
279 (cond ((> space-count 0)
280 (setq xmltok-type 'space))
281 (t
282 (forward-char 1)
283 (xmltok-scan-after-amp 'xmltok-handle-entity))))
284 ((re-search-forward "[<&]\\|\\(]]>\\)" nil t)
285 (cond ((not (match-beginning 1))
286 (goto-char (match-beginning 0))
287 ;; must have got a non-space char
288 (setq xmltok-type 'data))
289 ((= (match-beginning 1) xmltok-start)
290 (xmltok-add-error "Found `]]>' not closing a CDATA section")
291 (setq xmltok-type 'not-well-formed))
292 (t
293 (goto-char (match-beginning 0))
294 (setq xmltok-type
295 (if (= (point) (+ xmltok-start space-count))
296 'space
297 'data)))))
298 ((eq ch nil)
299 (setq xmltok-type
300 (if (> space-count 0)
301 'space
302 nil)))
303 (t
304 (goto-char (point-max))
305 (setq xmltok-type 'data)))))
306
307 (eval-when-compile
308
309 ;; A symbolic regexp is represented by a list whose CAR is the string
310 ;; containing the regexp and whose cdr is a list of symbolic names
311 ;; for the groups in the string.
312
313 ;; Construct a symbolic regexp from a regexp.
314 (defun xmltok-r (str)
315 (cons str nil))
316
317 ;; Concatenate zero of more regexps and symbolic regexps.
318 (defun xmltok+ (&rest args)
319 (let (strs names)
320 (while args
321 (let ((arg (car args)))
322 (if (stringp arg)
323 (setq strs (cons arg strs))
324 (setq strs (cons (car arg) strs))
325 (setq names (cons (cdr arg) names)))
326 (setq args (cdr args))))
327 (cons (apply 'concat (nreverse strs))
328 (apply 'append (nreverse names))))))
329
330 (eval-when-compile
331 ;; Make a symbolic group named NAME from the regexp R.
332 ;; R may be a symbolic regexp or an ordinary regexp.
333 (defmacro xmltok-g (name &rest r)
334 (let ((sym (make-symbol "r")))
335 `(let ((,sym (xmltok+ ,@r)))
336 (if (stringp ,sym)
337 (cons (concat "\\(" ,sym "\\)") (cons ',name nil))
338 (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym)))))))
339
340 (defun xmltok-p (&rest r) (xmltok+ "\\(?:"
341 (apply 'xmltok+ r)
342 "\\)"))
343
344 ;; Get the group index of ELEM in a LIST of symbols.
345 (defun xmltok-get-index (elem list)
346 (or elem
347 (error "Missing group name"))
348 (let ((found nil)
349 (i 1))
350 (while list
351 (cond ((eq elem (car list))
352 (setq found i)
353 (setq list nil))
354 (t
355 (setq i (1+ i))
356 (setq list (cdr list)))))
357 (or found
358 (error "Bad group name %s" elem))))
359
360 ;; Define a macro SYM using a symbolic regexp R.
361 ;; SYM can be called in three ways:
362 ;; (SYM regexp)
363 ;; expands to the regexp in R
364 ;; (SYM start G)
365 ;; expands to
366 ;; (match-beginning N)
367 ;; where N is the group index of G in R.
368 ;; (SYM end G)
369 ;; expands to
370 ;; (match-end N)
371 ;; where N is the group index of G in R.
372 (defmacro xmltok-defregexp (sym r)
373 `(defalias ',sym
374 (let ((r ,r))
375 `(macro lambda (action &optional group-name)
376 (cond ((eq action 'regexp)
377 ,(car r))
378 ((or (eq action 'start) (eq action 'beginning))
379 (list 'match-beginning (xmltok-get-index group-name
380 ',(cdr r))))
381 ((eq action 'end)
382 (list 'match-end (xmltok-get-index group-name
383 ',(cdr r))))
384 ((eq action 'string)
385 (list 'match-string
386 (xmltok-get-index group-name ',(cdr r))))
387 ((eq action 'string-no-properties)
388 (list 'match-string-no-properties
389 (xmltok-get-index group-name ',(cdr r))))
390 (t (error "Invalid action: %s" action))))))))
391
392
393 (eval-when-compile
394 (let* ((or "\\|")
395 (open "\\(?:")
396 (close "\\)")
397 (name-start-char "[_[:alpha:]]")
398 (name-continue-not-start-char "[-.[:digit:]]")
399 (name-continue-char "[-._[:alnum:]]")
400 (* "*")
401 (+ "+")
402 (opt "?")
403 (question "\\?")
404 (s "[ \r\t\n]")
405 (s+ (concat s +))
406 (s* (concat s *))
407 (ncname (concat name-start-char name-continue-char *))
408 (entity-ref
409 (xmltok+ (xmltok-g entity-name ncname)
410 (xmltok-g entity-ref-close ";") opt))
411 (decimal-ref
412 (xmltok+ (xmltok-g decimal "[0-9]" +)
413 (xmltok-g decimal-ref-close ";") opt))
414 (hex-ref
415 (xmltok+ "x" open
416 (xmltok-g hex "[0-9a-fA-F]" +)
417 (xmltok-g hex-ref-close ";") opt
418 close opt))
419 (char-ref
420 (xmltok+ (xmltok-g number-sign "#")
421 open decimal-ref or hex-ref close opt))
422 (start-tag-close
423 (xmltok+ open (xmltok-g start-tag-close s* ">")
424 or open (xmltok-g empty-tag-slash s* "/")
425 (xmltok-g empty-tag-close ">") opt close
426 or (xmltok-g start-tag-s s+)
427 close))
428 (start-tag
429 (xmltok+ (xmltok-g start-tag-name
430 ncname (xmltok-g start-tag-colon ":" ncname) opt)
431 start-tag-close opt))
432 (end-tag
433 (xmltok+ (xmltok-g end-tag-slash "/")
434 open (xmltok-g end-tag-name
435 ncname
436 (xmltok-g end-tag-colon ":" ncname) opt)
437 (xmltok-g end-tag-close s* ">") opt
438 close opt))
439 (comment
440 (xmltok+ (xmltok-g markup-declaration "!")
441 (xmltok-g comment-first-dash "-"
442 (xmltok-g comment-open "-") opt) opt))
443 (cdata-section
444 (xmltok+ "!"
445 (xmltok-g marked-section-open "\\[")
446 open "C"
447 open "D"
448 open "A"
449 open "T"
450 open "A"
451 (xmltok-g cdata-section-open "\\[" ) opt
452 close opt ; A
453 close opt ; T
454 close opt ; A
455 close opt ; D
456 close opt)) ; C
457 (processing-instruction
458 (xmltok-g processing-instruction-question question)))
459
460 (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close))
461
462 (xmltok-defregexp xmltok-after-amp
463 (xmltok+ entity-ref or char-ref))
464 (xmltok-defregexp xmltok-after-lt
465 (xmltok+ start-tag
466 or end-tag
467 ;; cdata-section must come before comment
468 ;; because we treat <! as a comment
469 ;; and Emacs doesn't do fully greedy matching
470 ;; by default
471 or cdata-section
472 or comment
473 or processing-instruction))
474 (xmltok-defregexp
475 xmltok-attribute
476 (let* ((lit1
477 (xmltok+ "'"
478 "[^<'&\r\n\t]*"
479 (xmltok-g complex1 "[&\r\n\t][^<']*") opt
480 "'"))
481 (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1))
482 '(complex2)))
483 (literal (xmltok-g literal lit1 or lit2))
484 (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close
485 (xmltok-g colon ":" ncname) opt)))
486 (xmltok+ (xmltok-g name name)
487 s* "="
488 ;; If the literal isn't followed by what it should be,
489 ;; then the closing delimiter is probably really the
490 ;; opening delimiter of another literal, so don't
491 ;; absorb the literal in this case.
492 open s* literal start-tag-close close opt)))
493 (xmltok-defregexp
494 xmltok-xml-declaration
495 (let* ((literal-content "[-._:a-zA-Z0-9]+")
496 (literal
497 (concat open "\"" literal-content "\""
498 or "'" literal-content "'" close))
499 (version-att
500 (xmltok+ open
501 s+ (xmltok-g version-name "version")
502 s* "="
503 s* (xmltok-g version-value literal)
504 close opt))
505 (encoding-att
506 (xmltok+ open
507 s+ (xmltok-g encoding-name "encoding")
508 s* "="
509 s* (xmltok-g encoding-value literal)
510 close opt))
511 (yes-no
512 (concat open "yes" or "no" close))
513 (standalone-att
514 (xmltok+ open
515 s+ (xmltok-g standalone-name "standalone")
516 s* "="
517 s* (xmltok-g standalone-value
518 "\"" yes-no "\"" or "'" yes-no "'")
519 close opt)))
520 (xmltok+ "<" question "xml"
521 version-att
522 encoding-att
523 standalone-att
524 s* question ">")))
525 (xmltok-defregexp
526 xmltok-prolog
527 (let* ((single-char (xmltok-g single-char "[[|,(\"'>]"))
528 (internal-subset-close (xmltok-g internal-subset-close
529 "][ \t\r\n]*>"))
530 (starts-with-close-paren
531 (xmltok-g close-paren
532 ")"
533 (xmltok-p
534 (xmltok-g close-paren-occur "[+?]")
535 or
536 (xmltok-g close-paren-star "\\*"))
537 opt))
538 (starts-with-percent
539 (xmltok-g percent
540 "%" (xmltok-g param-entity-ref
541 ncname
542 (xmltok-g param-entity-ref-close
543 ";") opt) opt))
544 (starts-with-nmtoken-not-name
545 (xmltok-g nmtoken
546 (xmltok-p name-continue-not-start-char or ":")
547 (xmltok-p name-continue-char or ":") *))
548 (nmtoken-after-colon
549 (xmltok+
550 (xmltok-p name-continue-not-start-char or ":")
551 (xmltok-p name-continue-char or ":") *
552 or
553 name-start-char
554 name-continue-char *
555 ":"
556 (xmltok-p name-continue-char or ":") *))
557 (after-ncname
558 (xmltok+ (xmltok-g ncname-nmtoken
559 ":" (xmltok-p nmtoken-after-colon))
560 or (xmltok-p (xmltok-g colon ":" ncname)
561 (xmltok-g colon-name-occur "[?+*]") opt)
562 or (xmltok-g ncname-occur "[?+*]")
563 or (xmltok-g ncname-colon ":")))
564 (starts-with-name
565 (xmltok-g name ncname (xmltok-p after-ncname) opt))
566 (starts-with-hash
567 (xmltok-g pound
568 "#" (xmltok-g hash-name ncname)))
569 (markup-declaration
570 (xmltok-g markup-declaration
571 "!" (xmltok-p (xmltok-g comment-first-dash "-"
572 (xmltok-g comment-open "-") opt)
573 or (xmltok-g named-markup-declaration
574 ncname)) opt))
575 (after-lt
576 (xmltok+ markup-declaration
577 or (xmltok-g processing-instruction-question
578 question)
579 or (xmltok-g instance-start
580 ncname)))
581 (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt)))
582 (xmltok+ starts-with-lt
583 or single-char
584 or starts-with-close-paren
585 or starts-with-percent
586 or starts-with-name
587 or starts-with-nmtoken-not-name
588 or starts-with-hash
589 or internal-subset-close)))))
590
591 (defconst xmltok-ncname-regexp (xmltok-ncname regexp))
592
593 (defun xmltok-scan-after-lt ()
594 (cond ((not (looking-at (xmltok-after-lt regexp)))
595 (xmltok-add-error "`<' that is not markup must be entered as `&lt;'")
596 (setq xmltok-type 'not-well-formed))
597 (t
598 (goto-char (match-end 0))
599 (cond ((xmltok-after-lt start start-tag-close)
600 (setq xmltok-name-end
601 (xmltok-after-lt end start-tag-name))
602 (setq xmltok-name-colon
603 (xmltok-after-lt start start-tag-colon))
604 (setq xmltok-attributes nil)
605 (setq xmltok-namespace-attributes nil)
606 (setq xmltok-type 'start-tag))
607 ((xmltok-after-lt start end-tag-close)
608 (setq xmltok-name-end
609 (xmltok-after-lt end end-tag-name))
610 (setq xmltok-name-colon
611 (xmltok-after-lt start end-tag-colon))
612 (setq xmltok-type 'end-tag))
613 ((xmltok-after-lt start start-tag-s)
614 (setq xmltok-name-end
615 (xmltok-after-lt end start-tag-name))
616 (setq xmltok-name-colon
617 (xmltok-after-lt start start-tag-colon))
618 (setq xmltok-namespace-attributes nil)
619 (setq xmltok-attributes nil)
620 (xmltok-scan-attributes)
621 xmltok-type)
622 ((xmltok-after-lt start empty-tag-close)
623 (setq xmltok-name-end
624 (xmltok-after-lt end start-tag-name))
625 (setq xmltok-name-colon
626 (xmltok-after-lt start start-tag-colon))
627 (setq xmltok-attributes nil)
628 (setq xmltok-namespace-attributes nil)
629 (setq xmltok-type 'empty-element))
630 ((xmltok-after-lt start cdata-section-open)
631 (setq xmltok-type
632 (progn (search-forward "]]>" nil 'move)
633 'cdata-section)))
634 ((xmltok-after-lt start processing-instruction-question)
635 (xmltok-scan-after-processing-instruction-open))
636 ((xmltok-after-lt start comment-open)
637 (xmltok-scan-after-comment-open))
638 ((xmltok-after-lt start empty-tag-slash)
639 (setq xmltok-name-end
640 (xmltok-after-lt end start-tag-name))
641 (setq xmltok-name-colon
642 (xmltok-after-lt start start-tag-colon))
643 (setq xmltok-attributes nil)
644 (setq xmltok-namespace-attributes nil)
645 (xmltok-add-error "Expected `/>'" (1- (point)))
646 (setq xmltok-type 'partial-empty-element))
647 ((xmltok-after-lt start start-tag-name)
648 (xmltok-add-error "Missing `>'"
649 nil
650 (1+ xmltok-start))
651 (setq xmltok-name-end
652 (xmltok-after-lt end start-tag-name))
653 (setq xmltok-name-colon
654 (xmltok-after-lt start start-tag-colon))
655 (setq xmltok-namespace-attributes nil)
656 (setq xmltok-attributes nil)
657 (setq xmltok-type 'partial-start-tag))
658 ((xmltok-after-lt start end-tag-name)
659 (setq xmltok-name-end (xmltok-after-lt end end-tag-name))
660 (setq xmltok-name-colon
661 (xmltok-after-lt start end-tag-colon))
662 (cond ((and (not xmltok-name-colon)
663 (eq (char-after) ?:))
664 (goto-char (1+ (point)))
665 (xmltok-add-error "Expected name following `:'"
666 (1- (point))))
667 (t
668 (xmltok-add-error "Missing `>'"
669 nil
670 (1+ xmltok-start))))
671 (setq xmltok-type 'partial-end-tag))
672 ((xmltok-after-lt start end-tag-slash)
673 (xmltok-add-error "Expected name following `</'")
674 (setq xmltok-name-end nil)
675 (setq xmltok-name-colon nil)
676 (setq xmltok-type 'partial-end-tag))
677 ((xmltok-after-lt start marked-section-open)
678 (xmltok-add-error "Expected `CDATA[' after `<!['"
679 xmltok-start
680 (+ 3 xmltok-start))
681 (setq xmltok-type 'not-well-formed))
682 ((xmltok-after-lt start comment-first-dash)
683 (xmltok-add-error "Expected `-' after `<!-'"
684 xmltok-start
685 (+ 3 xmltok-start))
686 (setq xmltok-type 'not-well-formed))
687 ((xmltok-after-lt start markup-declaration)
688 (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'"
689 xmltok-start
690 (+ 2 xmltok-start))
691 (setq xmltok-type 'not-well-formed))
692 (t
693 (xmltok-add-error "Not well-formed")
694 (setq xmltok-type 'not-well-formed))))))
695
696 ;; XXX This should be unified with
697 ;; xmltok-scan-prolog-after-processing-instruction-open
698 ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI
699 (defun xmltok-scan-after-processing-instruction-open ()
700 (search-forward "?>" nil 'move)
701 (cond ((not (save-excursion
702 (goto-char (+ 2 xmltok-start))
703 (and (looking-at (xmltok-ncname regexp))
704 (setq xmltok-name-end (match-end 0)))))
705 (setq xmltok-name-end (+ xmltok-start 2))
706 (xmltok-add-error "<? not followed by name"
707 (+ xmltok-start 2)
708 (+ xmltok-start 3)))
709 ((not (or (memq (char-after xmltok-name-end)
710 '(?\n ?\t ?\r ? ))
711 (= xmltok-name-end (- (point) 2))))
712 (xmltok-add-error "Target not followed by whitespace"
713 xmltok-name-end
714 (1+ xmltok-name-end)))
715 ((and (= xmltok-name-end (+ xmltok-start 5))
716 (save-excursion
717 (goto-char (+ xmltok-start 2))
718 (let ((case-fold-search t))
719 (looking-at "xml"))))
720 (xmltok-add-error "Processing instruction target is xml"
721 (+ xmltok-start 2)
722 (+ xmltok-start 5))))
723 (setq xmltok-type 'processing-instruction))
724
725 (defun xmltok-scan-after-comment-open ()
726 (while (and (re-search-forward "--\\(>\\)?" nil 'move)
727 (not (match-end 1)))
728 (xmltok-add-error "`--' not followed by `>'" (match-beginning 0)))
729 (setq xmltok-type 'comment))
730
731 (defun xmltok-scan-attributes ()
732 (let ((recovering nil)
733 (atts-needing-normalization nil))
734 (while (cond ((or (looking-at (xmltok-attribute regexp))
735 ;; use non-greedy group
736 (when (looking-at (concat "[^<>\n]+?"
737 (xmltok-attribute regexp)))
738 (unless recovering
739 (xmltok-add-error "Malformed attribute"
740 (point)
741 (save-excursion
742 (goto-char (xmltok-attribute start
743 name))
744 (skip-chars-backward "\r\n\t ")
745 (point))))
746 t))
747 (setq recovering nil)
748 (goto-char (match-end 0))
749 (let ((att (xmltok-add-attribute)))
750 (when att
751 (setq atts-needing-normalization
752 (cons att atts-needing-normalization))))
753 (cond ((xmltok-attribute start start-tag-s) t)
754 ((xmltok-attribute start start-tag-close)
755 (setq xmltok-type 'start-tag)
756 nil)
757 ((xmltok-attribute start empty-tag-close)
758 (setq xmltok-type 'empty-element)
759 nil)
760 ((xmltok-attribute start empty-tag-slash)
761 (setq xmltok-type 'partial-empty-element)
762 (xmltok-add-error "Expected `/>'"
763 (1- (point)))
764 nil)
765 ((looking-at "[ \t\r\n]*[\"']")
766 (goto-char (match-end 0))
767 (xmltok-add-error "Missing closing delimiter"
768 (1- (point)))
769 (setq recovering t)
770 t)
771 ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]")
772 (goto-char (match-end 1))
773 (xmltok-add-error "Attribute value not quoted"
774 (match-beginning 1))
775 (setq recovering t)
776 t)
777 (t
778 (xmltok-add-error "Missing attribute value"
779 (1- (point)))
780 (setq recovering t)
781 t)))
782 ((looking-at "[^<>\n]*/>")
783 (let ((start (point)))
784 (goto-char (match-end 0))
785 (unless recovering
786 (xmltok-add-error "Malformed empty-element"
787 start
788 (- (point) 2))))
789 (setq xmltok-type 'empty-element)
790 nil)
791 ((looking-at "[^<>\n]*>")
792 (let ((start (point)))
793 (goto-char (match-end 0))
794 (unless recovering
795 (xmltok-add-error "Malformed start-tag"
796 start
797 (1- (point)))))
798 (setq xmltok-type 'start-tag)
799 nil)
800 (t
801 (when recovering
802 (skip-chars-forward "^<>\n"))
803 (xmltok-add-error "Missing `>'"
804 xmltok-start
805 (1+ xmltok-start))
806 (setq xmltok-type 'partial-start-tag)
807 nil)))
808 (while atts-needing-normalization
809 (xmltok-normalize-attribute (car atts-needing-normalization))
810 (setq atts-needing-normalization (cdr atts-needing-normalization))))
811 (setq xmltok-attributes
812 (nreverse xmltok-attributes))
813 (setq xmltok-namespace-attributes
814 (nreverse xmltok-namespace-attributes)))
815
816 (defun xmltok-add-attribute ()
817 "Return the attribute if it needs normalizing, otherwise nil."
818 (let* ((needs-normalizing nil)
819 (att
820 (if (xmltok-attribute start literal)
821 (progn
822 (setq needs-normalizing
823 (or (xmltok-attribute start complex1)
824 (xmltok-attribute start complex2)))
825 (xmltok-make-attribute (xmltok-attribute start name)
826 (xmltok-attribute start colon)
827 (xmltok-attribute end name)
828 (1+ (xmltok-attribute start literal))
829 (1- (xmltok-attribute end literal))
830 (not needs-normalizing)))
831 (xmltok-make-attribute (xmltok-attribute start name)
832 (xmltok-attribute start colon)
833 (xmltok-attribute end name)))))
834 (if (xmltok-attribute start xmlns)
835 (setq xmltok-namespace-attributes
836 (cons att xmltok-namespace-attributes))
837 (setq xmltok-attributes
838 (cons att xmltok-attributes)))
839 (and needs-normalizing
840 att)))
841
842 (defun xmltok-normalize-attribute (att)
843 (let ((end (xmltok-attribute-value-end att))
844 (well-formed t)
845 (value-parts nil)
846 (refs nil))
847 (save-excursion
848 (goto-char (xmltok-attribute-value-start att))
849 (while (progn
850 (let ((n (skip-chars-forward "^\r\t\n&" end)))
851 (when (> n 0)
852 (setq value-parts
853 (cons (buffer-substring-no-properties (- (point) n)
854 (point))
855 value-parts))))
856 (when (< (point) end)
857 (goto-char (1+ (point)))
858 (cond ((eq (char-before) ?\&)
859 (let ((xmltok-start (1- (point)))
860 xmltok-type xmltok-replacement)
861 (xmltok-scan-after-amp
862 (lambda (start end)
863 (xmltok-handle-entity start end t)))
864 (cond ((or (eq xmltok-type 'char-ref)
865 (eq xmltok-type 'entity-ref))
866 (setq refs
867 (cons (vector xmltok-type
868 xmltok-start
869 (point))
870 refs))
871 (if xmltok-replacement
872 (setq value-parts
873 (cons xmltok-replacement
874 value-parts))
875 (setq well-formed nil)))
876 (t (setq well-formed nil)))))
877 (t (setq value-parts
878 (cons " " value-parts)))))
879 (< (point) end))))
880 (when well-formed
881 (aset att 5 (apply 'concat (nreverse value-parts))))
882 (aset att 6 (nreverse refs))))
883
884 (defun xmltok-scan-after-amp (entity-handler)
885 (cond ((not (looking-at (xmltok-after-amp regexp)))
886 (xmltok-add-error "`&' that is not markup must be entered as `&amp;'")
887 (setq xmltok-type 'not-well-formed))
888 (t
889 (goto-char (match-end 0))
890 (cond ((xmltok-after-amp start entity-ref-close)
891 (funcall entity-handler
892 (xmltok-after-amp start entity-name)
893 (xmltok-after-amp end entity-name))
894 (setq xmltok-type 'entity-ref))
895 ((xmltok-after-amp start decimal-ref-close)
896 (xmltok-scan-char-ref (xmltok-after-amp start decimal)
897 (xmltok-after-amp end decimal)
898 10))
899 ((xmltok-after-amp start hex-ref-close)
900 (xmltok-scan-char-ref (xmltok-after-amp start hex)
901 (xmltok-after-amp end hex)
902 16))
903 ((xmltok-after-amp start number-sign)
904 (xmltok-add-error "Missing character number")
905 (setq xmltok-type 'not-well-formed))
906 (t
907 (xmltok-add-error "Missing closing `;'")
908 (setq xmltok-type 'not-well-formed))))))
909
910 (defconst xmltok-entity-error-messages
911 '((unparsed . "Referenced entity is unparsed")
912 (not-well-formed . "Referenced entity is not well-formed")
913 (external nil . "Referenced entity is external")
914 (element nil . "Referenced entity contains <")))
915
916 (defun xmltok-handle-entity (start end &optional attributep)
917 (let* ((name (buffer-substring-no-properties start end))
918 (name-def (assoc name xmltok-dtd))
919 (def (cdr name-def)))
920 (cond ((setq xmltok-replacement (and (consp def)
921 (if attributep
922 (cdr def)
923 (car def)))))
924 ((null name-def)
925 (unless (eq (car xmltok-dtd) t)
926 (xmltok-add-error "Referenced entity has not been defined"
927 start
928 end)))
929 ((and attributep (consp def))
930 (xmltok-add-error "Referenced entity contains <"
931 start
932 end))
933 (t
934 (let ((err (cdr (assq def xmltok-entity-error-messages))))
935 (when (consp err)
936 (setq err (if attributep (cdr err) (car err))))
937 (when err
938 (xmltok-add-error err start end)))))))
939
940 (defun xmltok-scan-char-ref (start end base)
941 (setq xmltok-replacement
942 (let ((n (string-to-number (buffer-substring-no-properties start end)
943 base)))
944 (cond ((and (integerp n) (xmltok-valid-char-p n))
945 (setq n (xmltok-unicode-to-char n))
946 (and n (string n)))
947 (t
948 (xmltok-add-error "Invalid character code" start end)
949 nil))))
950 (setq xmltok-type 'char-ref))
951
952 (defun xmltok-char-number (start end)
953 (let* ((base (if (eq (char-after (+ start 2)) ?x)
954 16
955 10))
956 (n (string-to-number
957 (buffer-substring-no-properties (+ start (if (= base 16) 3 2))
958 (1- end))
959 base)))
960 (and (integerp n)
961 (xmltok-valid-char-p n)
962 n)))
963
964 (defun xmltok-valid-char-p (n)
965 "Return non-nil if N is the Unicode code of a valid XML character."
966 (cond ((< n #x20) (memq n '(#xA #xD #x9)))
967 ((< n #xD800) t)
968 ((< n #xE000) nil)
969 ((< n #xFFFE) t)
970 (t (and (> n #xFFFF)
971 (< n #x110000)))))
972
973 (defun xmltok-unicode-to-char (n)
974 "Return the character corresponding to Unicode scalar value N.
975 Return nil if unsupported in Emacs."
976 (decode-char 'ucs n))
977
978 ;;; Prolog parsing
979
980 (defvar xmltok-contains-doctype nil)
981 (defvar xmltok-doctype-external-subset-flag nil)
982 (defvar xmltok-internal-subset-start nil)
983 (defvar xmltok-had-param-entity-ref nil)
984 (defvar xmltok-prolog-regions nil)
985 (defvar xmltok-standalone nil
986 "Non-nil if there was an XML declaration specifying standalone=\"yes\".")
987 (defvar xmltok-markup-declaration-doctype-flag nil)
988
989 (defconst xmltok-predefined-entity-alist
990 '(("lt" "<" . "<")
991 ("gt" ">" . ">")
992 ("amp" "&" . "&")
993 ("apos" "'" . "'")
994 ("quot" "\"" . "\"")))
995
996 (defun xmltok-forward-prolog ()
997 "Move forward to the end of the XML prolog.
998
999 Returns a list of vectors [TYPE START END] where TYPE is a symbol and
1000 START and END are integers giving the start and end of the region of
1001 that type. TYPE can be one of xml-declaration,
1002 xml-declaration-attribute-name, xml-declaration-attribute-value,
1003 comment, processing-instruction-left, processing-instruction-right,
1004 markup-declaration-open, markup-declaration-close,
1005 internal-subset-open, internal-subset-close, hash-name, keyword,
1006 literal, encoding-name.
1007 Adds to `xmltok-errors' as appropriate."
1008 (let ((case-fold-search nil)
1009 xmltok-start
1010 xmltok-type
1011 xmltok-prolog-regions
1012 xmltok-contains-doctype
1013 xmltok-internal-subset-start
1014 xmltok-had-param-entity-ref
1015 xmltok-standalone
1016 xmltok-doctype-external-subset-flag
1017 xmltok-markup-declaration-doctype-flag)
1018 (setq xmltok-dtd xmltok-predefined-entity-alist)
1019 (xmltok-scan-xml-declaration)
1020 (xmltok-next-prolog-token)
1021 (while (condition-case nil
1022 (when (xmltok-parse-prolog-item)
1023 (xmltok-next-prolog-token))
1024 (xmltok-markup-declaration-parse-error
1025 (xmltok-skip-markup-declaration))))
1026 (when xmltok-internal-subset-start
1027 (xmltok-add-error "No closing ]"
1028 (1- xmltok-internal-subset-start)
1029 xmltok-internal-subset-start))
1030 (xmltok-parse-entities)
1031 (nreverse xmltok-prolog-regions)))
1032
1033 (defconst xmltok-bad-xml-decl-regexp
1034 "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)")
1035
1036 ;;;###autoload
1037 (defun xmltok-get-declared-encoding-position (&optional limit)
1038 "Return the position of the encoding in the XML declaration at point.
1039 If there is a well-formed XML declaration starting at point and it
1040 contains an encoding declaration, then return (START . END)
1041 where START and END are the positions of the start and the end
1042 of the encoding name; if there is no encoding declaration return
1043 the position where and encoding declaration could be inserted.
1044 If there is XML that is not well-formed that looks like an XML
1045 declaration, return nil. Otherwise, return t.
1046 If LIMIT is non-nil, then do not consider characters beyond LIMIT."
1047 (cond ((let ((case-fold-search nil))
1048 (and (looking-at (xmltok-xml-declaration regexp))
1049 (or (not limit) (<= (match-end 0) limit))))
1050 (let ((end (xmltok-xml-declaration end encoding-value)))
1051 (if end
1052 (cons (1+ (xmltok-xml-declaration start encoding-value))
1053 (1- end))
1054 (or (xmltok-xml-declaration end version-value)
1055 (+ (point) 5)))))
1056 ((not (let ((case-fold-search t))
1057 (looking-at xmltok-bad-xml-decl-regexp))))))
1058
1059 (defun xmltok-scan-xml-declaration ()
1060 (when (looking-at (xmltok-xml-declaration regexp))
1061 (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0))
1062 (goto-char (match-end 0))
1063 (when (xmltok-xml-declaration start version-name)
1064 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1065 (xmltok-xml-declaration start version-name)
1066 (xmltok-xml-declaration end version-name))
1067 (let ((start (xmltok-xml-declaration start version-value))
1068 (end (xmltok-xml-declaration end version-value)))
1069 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1070 start
1071 end)))
1072 ;; XXX need to check encoding name
1073 ;; Should start with letter, not contain colon
1074 (when (xmltok-xml-declaration start encoding-name)
1075 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1076 (xmltok-xml-declaration start encoding-name)
1077 (xmltok-xml-declaration end encoding-name))
1078 (let ((start (xmltok-xml-declaration start encoding-value))
1079 (end (xmltok-xml-declaration end encoding-value)))
1080 (xmltok-add-prolog-region 'encoding-name
1081 (1+ start)
1082 (1- end))
1083 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1084 start
1085 end)))
1086 (when (xmltok-xml-declaration start standalone-name)
1087 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1088 (xmltok-xml-declaration start standalone-name)
1089 (xmltok-xml-declaration end standalone-name))
1090 (let ((start (xmltok-xml-declaration start standalone-value))
1091 (end (xmltok-xml-declaration end standalone-value)))
1092 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1093 start
1094 end)
1095 (setq xmltok-standalone
1096 (string= (buffer-substring-no-properties (1+ start) (1- end))
1097 "yes"))))
1098 t))
1099
1100 (defconst xmltok-markup-declaration-alist
1101 '(("ELEMENT" . xmltok-parse-element-declaration)
1102 ("ATTLIST" . xmltok-parse-attlist-declaration)
1103 ("ENTITY" . xmltok-parse-entity-declaration)
1104 ("NOTATION" . xmltok-parse-notation-declaration)))
1105
1106 (defun xmltok-parse-prolog-item ()
1107 (cond ((eq xmltok-type 'comment)
1108 (xmltok-add-prolog-region 'comment
1109 xmltok-start
1110 (point))
1111 t)
1112 ((eq xmltok-type 'processing-instruction))
1113 ((eq xmltok-type 'named-markup-declaration)
1114 (setq xmltok-markup-declaration-doctype-flag nil)
1115 (xmltok-add-prolog-region 'markup-declaration-open
1116 xmltok-start
1117 (point))
1118 (let* ((name (buffer-substring-no-properties
1119 (+ xmltok-start 2)
1120 (point)))
1121 (fun (cdr (assoc name xmltok-markup-declaration-alist))))
1122 (cond (fun
1123 (unless xmltok-internal-subset-start
1124 (xmltok-add-error
1125 "Declaration allowed only in internal subset"))
1126 (funcall fun))
1127 ((string= name "DOCTYPE")
1128 (xmltok-parse-doctype))
1129 (t
1130 (xmltok-add-error "Unknown markup declaration"
1131 (+ xmltok-start 2))
1132 (xmltok-next-prolog-token)
1133 (xmltok-markup-declaration-parse-error))))
1134 t)
1135 ((or (eq xmltok-type 'end-prolog)
1136 (not xmltok-type))
1137 nil)
1138 ((eq xmltok-type 'internal-subset-close)
1139 (xmltok-add-prolog-region 'internal-subset-close
1140 xmltok-start
1141 (1+ xmltok-start))
1142 (xmltok-add-prolog-region 'markup-declaration-close
1143 (1- (point))
1144 (point))
1145 (if xmltok-internal-subset-start
1146 (setq xmltok-internal-subset-start nil)
1147 (xmltok-add-error "]> outside internal subset"))
1148 t)
1149 ((eq xmltok-type 'param-entity-ref)
1150 (if xmltok-internal-subset-start
1151 (setq xmltok-had-param-entity-ref t)
1152 (xmltok-add-error "Parameter entity reference outside document type declaration"))
1153 t)
1154 ;; If we don't do this, we can get thousands of errors when
1155 ;; a plain text file is parsed.
1156 ((not xmltok-internal-subset-start)
1157 (when (let ((err (car xmltok-errors)))
1158 (or (not err)
1159 (<= (xmltok-error-end err) xmltok-start)))
1160 (goto-char xmltok-start))
1161 nil)
1162 ((eq xmltok-type 'not-well-formed) t)
1163 (t
1164 (xmltok-add-error "Token allowed only inside markup declaration")
1165 t)))
1166
1167 (defun xmltok-parse-doctype ()
1168 (setq xmltok-markup-declaration-doctype-flag t)
1169 (xmltok-next-prolog-token)
1170 (when xmltok-internal-subset-start
1171 (xmltok-add-error "DOCTYPE declaration not allowed in internal subset")
1172 (xmltok-markup-declaration-parse-error))
1173 (when xmltok-contains-doctype
1174 (xmltok-add-error "Duplicate DOCTYPE declaration")
1175 (xmltok-markup-declaration-parse-error))
1176 (setq xmltok-contains-doctype t)
1177 (xmltok-require-token 'name 'prefixed-name)
1178 (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>)
1179 (cond ((eq xmltok-type ?\[)
1180 (setq xmltok-internal-subset-start (point)))
1181 ((eq xmltok-type ?>))
1182 (t
1183 (setq xmltok-doctype-external-subset-flag t)
1184 (xmltok-parse-external-id)
1185 (xmltok-require-token ?\[ ?>)
1186 (when (eq xmltok-type ?\[)
1187 (setq xmltok-internal-subset-start (point))))))
1188
1189 (defun xmltok-parse-attlist-declaration ()
1190 (xmltok-require-next-token 'prefixed-name 'name)
1191 (while (progn
1192 (xmltok-require-next-token ?> 'name 'prefixed-name)
1193 (if (eq xmltok-type ?>)
1194 nil
1195 (xmltok-require-next-token ?\(
1196 "CDATA"
1197 "ID"
1198 "IDREF"
1199 "IDREFS"
1200 "ENTITY"
1201 "ENTITIES"
1202 "NMTOKEN"
1203 "NMTOKENS"
1204 "NOTATION")
1205 (cond ((eq xmltok-type ?\()
1206 (xmltok-parse-nmtoken-group))
1207 ((string= (xmltok-current-token-string)
1208 "NOTATION")
1209 (xmltok-require-next-token ?\()
1210 (xmltok-parse-nmtoken-group)))
1211 (xmltok-require-next-token "#IMPLIED"
1212 "#REQUIRED"
1213 "#FIXED"
1214 'literal)
1215 (when (string= (xmltok-current-token-string) "#FIXED")
1216 (xmltok-require-next-token 'literal))
1217 t))))
1218
1219 (defun xmltok-parse-nmtoken-group ()
1220 (while (progn
1221 (xmltok-require-next-token 'nmtoken 'prefixed-name 'name)
1222 (xmltok-require-next-token ?| ?\))
1223 (eq xmltok-type ?|))))
1224
1225 (defun xmltok-parse-element-declaration ()
1226 (xmltok-require-next-token 'name 'prefixed-name)
1227 (xmltok-require-next-token "EMPTY" "ANY" ?\()
1228 (when (eq xmltok-type ?\()
1229 (xmltok-require-next-token "#PCDATA"
1230 'name
1231 'prefixed-name
1232 'name-occur
1233 ?\()
1234 (cond ((eq xmltok-type 'hash-name)
1235 (xmltok-require-next-token ?| ?\) 'close-paren-star)
1236 (while (eq xmltok-type ?|)
1237 (xmltok-require-next-token 'name 'prefixed-name)
1238 (xmltok-require-next-token 'close-paren-star ?|)))
1239 (t (xmltok-parse-model-group))))
1240 (xmltok-require-next-token ?>))
1241
1242 (defun xmltok-parse-model-group ()
1243 (xmltok-parse-model-group-member)
1244 (xmltok-require-next-token ?|
1245 ?,
1246 ?\)
1247 'close-paren-star
1248 'close-paren-occur)
1249 (when (memq xmltok-type '(?, ?|))
1250 (let ((connector xmltok-type))
1251 (while (progn
1252 (xmltok-next-prolog-token)
1253 (xmltok-parse-model-group-member)
1254 (xmltok-require-next-token connector
1255 ?\)
1256 'close-paren-star
1257 'close-paren-occur)
1258 (eq xmltok-type connector))))))
1259
1260 (defun xmltok-parse-model-group-member ()
1261 (xmltok-require-token 'name
1262 'prefixed-name
1263 'name-occur
1264 ?\()
1265 (when (eq xmltok-type ?\()
1266 (xmltok-next-prolog-token)
1267 (xmltok-parse-model-group)))
1268
1269 (defun xmltok-parse-entity-declaration ()
1270 (let (paramp name)
1271 (xmltok-require-next-token 'name ?%)
1272 (when (eq xmltok-type ?%)
1273 (setq paramp t)
1274 (xmltok-require-next-token 'name))
1275 (setq name (xmltok-current-token-string))
1276 (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC")
1277 (cond ((eq xmltok-type 'literal)
1278 (let ((replacement (xmltok-parse-entity-value)))
1279 (unless paramp
1280 (xmltok-define-entity name replacement)))
1281 (xmltok-require-next-token ?>))
1282 (t
1283 (xmltok-parse-external-id)
1284 (if paramp
1285 (xmltok-require-token ?>)
1286 (xmltok-require-token ?> "NDATA")
1287 (if (eq xmltok-type ?>)
1288 (xmltok-define-entity name 'external)
1289 (xmltok-require-next-token 'name)
1290 (xmltok-require-next-token ?>)
1291 (xmltok-define-entity name 'unparsed)))))))
1292
1293 (defun xmltok-define-entity (name value)
1294 (when (and (or (not xmltok-had-param-entity-ref)
1295 xmltok-standalone)
1296 (not (assoc name xmltok-dtd)))
1297 (setq xmltok-dtd
1298 (cons (cons name value) xmltok-dtd))))
1299
1300 (defun xmltok-parse-entity-value ()
1301 (let ((lim (1- (point)))
1302 (well-formed t)
1303 value-parts
1304 start)
1305 (save-excursion
1306 (goto-char (1+ xmltok-start))
1307 (setq start (point))
1308 (while (progn
1309 (skip-chars-forward "^%&" lim)
1310 (when (< (point) lim)
1311 (goto-char (1+ (point)))
1312 (cond ((eq (char-before) ?%)
1313 (xmltok-add-error "Parameter entity references are not allowed in the internal subset"
1314 (1- (point))
1315 (point))
1316 (setq well-formed nil))
1317 (t
1318 (let ((xmltok-start (1- (point)))
1319 xmltok-type xmltok-replacement)
1320 (xmltok-scan-after-amp (lambda (_start _end)))
1321 (cond ((eq xmltok-type 'char-ref)
1322 (setq value-parts
1323 (cons (buffer-substring-no-properties
1324 start
1325 xmltok-start)
1326 value-parts))
1327 (setq value-parts
1328 (cons xmltok-replacement
1329 value-parts))
1330 (setq start (point)))
1331 ((eq xmltok-type 'not-well-formed)
1332 (setq well-formed nil))))))
1333 t))))
1334 (if (not well-formed)
1335 nil
1336 (apply 'concat
1337 (nreverse (cons (buffer-substring-no-properties start lim)
1338 value-parts))))))
1339
1340 (defun xmltok-parse-notation-declaration ()
1341 (xmltok-require-next-token 'name)
1342 (xmltok-require-next-token "SYSTEM" "PUBLIC")
1343 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1344 (xmltok-require-next-token 'literal)
1345 (cond (publicp
1346 (xmltok-require-next-token 'literal ?>)
1347 (unless (eq xmltok-type ?>)
1348 (xmltok-require-next-token ?>)))
1349 (t (xmltok-require-next-token ?>)))))
1350
1351 (defun xmltok-parse-external-id ()
1352 (xmltok-require-token "SYSTEM" "PUBLIC")
1353 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1354 (xmltok-require-next-token 'literal)
1355 (when publicp
1356 (xmltok-require-next-token 'literal)))
1357 (xmltok-next-prolog-token))
1358
1359 (defun xmltok-require-next-token (&rest types)
1360 (xmltok-next-prolog-token)
1361 (apply 'xmltok-require-token types))
1362
1363 (defun xmltok-require-token (&rest types)
1364 ;; XXX Generate a more helpful error message
1365 (while (and (not (let ((type (car types)))
1366 (if (stringp (car types))
1367 (string= (xmltok-current-token-string) type)
1368 (eq type xmltok-type))))
1369 (setq types (cdr types))))
1370 (unless types
1371 (when (and xmltok-type
1372 (not (eq xmltok-type 'not-well-formed)))
1373 (xmltok-add-error "Unexpected token"))
1374 (xmltok-markup-declaration-parse-error))
1375 (let ((region-type (xmltok-prolog-region-type (car types))))
1376 (when region-type
1377 (xmltok-add-prolog-region region-type
1378 xmltok-start
1379 (point)))))
1380
1381 (defun xmltok-current-token-string ()
1382 (buffer-substring-no-properties xmltok-start (point)))
1383
1384 (define-error 'xmltok-markup-declaration-parse-error
1385 "Syntax error in markup declaration")
1386
1387 (defun xmltok-markup-declaration-parse-error ()
1388 (signal 'xmltok-markup-declaration-parse-error nil))
1389
1390 (defun xmltok-skip-markup-declaration ()
1391 (while (cond ((eq xmltok-type ?>)
1392 (xmltok-next-prolog-token)
1393 nil)
1394 ((and xmltok-markup-declaration-doctype-flag
1395 (eq xmltok-type ?\[))
1396 (setq xmltok-internal-subset-start (point))
1397 (xmltok-next-prolog-token)
1398 nil)
1399 ((memq xmltok-type '(nil
1400 end-prolog
1401 named-markup-declaration
1402 comment
1403 processing-instruction))
1404 nil)
1405 ((and xmltok-internal-subset-start
1406 (eq xmltok-type 'internal-subset-close))
1407 nil)
1408 (t (xmltok-next-prolog-token) t)))
1409 xmltok-type)
1410
1411 (defun xmltok-prolog-region-type (required)
1412 (cond ((cdr (assq xmltok-type
1413 '((literal . literal)
1414 (?> . markup-declaration-close)
1415 (?\[ . internal-subset-open)
1416 (hash-name . hash-name)))))
1417 ((and (stringp required) (eq xmltok-type 'name))
1418 'keyword)))
1419
1420 ;; Return new token type.
1421
1422 (defun xmltok-next-prolog-token ()
1423 (skip-chars-forward " \t\r\n")
1424 (setq xmltok-start (point))
1425 (cond ((not (and (looking-at (xmltok-prolog regexp))
1426 (goto-char (match-end 0))))
1427 (let ((ch (char-after)))
1428 (cond (ch
1429 (goto-char (1+ (point)))
1430 (xmltok-add-error "Illegal char in prolog")
1431 (setq xmltok-type 'not-well-formed))
1432 (t (setq xmltok-type nil)))))
1433 ((or (xmltok-prolog start ncname-occur)
1434 (xmltok-prolog start colon-name-occur))
1435 (setq xmltok-name-end (1- (point)))
1436 (setq xmltok-name-colon (xmltok-prolog start colon))
1437 (setq xmltok-type 'name-occur))
1438 ((xmltok-prolog start colon)
1439 (setq xmltok-name-end (point))
1440 (setq xmltok-name-colon (xmltok-prolog start colon))
1441 (unless (looking-at "[ \t\r\n>),|[%]")
1442 (xmltok-add-error "Missing space after name"))
1443 (setq xmltok-type 'prefixed-name))
1444 ((or (xmltok-prolog start ncname-nmtoken)
1445 (xmltok-prolog start ncname-colon))
1446 (unless (looking-at "[ \t\r\n>),|[%]")
1447 (xmltok-add-error "Missing space after name token"))
1448 (setq xmltok-type 'nmtoken))
1449 ((xmltok-prolog start name)
1450 (setq xmltok-name-end (point))
1451 (setq xmltok-name-colon nil)
1452 (unless (looking-at "[ \t\r\n>),|[%]")
1453 (xmltok-add-error "Missing space after name"))
1454 (setq xmltok-type 'name))
1455 ((xmltok-prolog start hash-name)
1456 (setq xmltok-name-end (point))
1457 (unless (looking-at "[ \t\r\n>)|%]")
1458 (xmltok-add-error "Missing space after name"))
1459 (setq xmltok-type 'hash-name))
1460 ((xmltok-prolog start processing-instruction-question)
1461 (xmltok-scan-prolog-after-processing-instruction-open))
1462 ((xmltok-prolog start comment-open)
1463 ;; XXX if not-well-formed, ignore some stuff
1464 (xmltok-scan-after-comment-open))
1465 ((xmltok-prolog start named-markup-declaration)
1466 (setq xmltok-type 'named-markup-declaration))
1467 ((xmltok-prolog start instance-start)
1468 (goto-char xmltok-start)
1469 (setq xmltok-type 'end-prolog))
1470 ((xmltok-prolog start close-paren-star)
1471 (setq xmltok-type 'close-paren-star))
1472 ((xmltok-prolog start close-paren-occur)
1473 (setq xmltok-type 'close-paren-occur))
1474 ((xmltok-prolog start close-paren)
1475 (unless (looking-at "[ \t\r\n>,|)]")
1476 (xmltok-add-error "Missing space after )"))
1477 (setq xmltok-type ?\)))
1478 ((xmltok-prolog start single-char)
1479 (let ((ch (char-before)))
1480 (cond ((memq ch '(?\" ?\'))
1481 (xmltok-scan-prolog-literal))
1482 (t (setq xmltok-type ch)))))
1483 ((xmltok-prolog start percent)
1484 (cond ((xmltok-prolog start param-entity-ref-close)
1485 (setq xmltok-name-end (1- (point)))
1486 (setq xmltok-type 'param-entity-ref))
1487 ((xmltok-prolog start param-entity-ref)
1488 (xmltok-add-error "Missing ;")
1489 (setq xmltok-name-end (point))
1490 (setq xmltok-type 'param-entity-ref))
1491 ((looking-at "[ \t\r\n%]")
1492 (setq xmltok-type ?%))
1493 (t
1494 (xmltok-add-error "Expected name after %")
1495 (setq xmltok-type 'not-well-formed))))
1496 ((xmltok-prolog start nmtoken)
1497 (unless (looking-at "[ \t\r\n>),|[%]")
1498 (xmltok-add-error "Missing space after name token"))
1499 (setq xmltok-type 'nmtoken))
1500 ((xmltok-prolog start internal-subset-close)
1501 (setq xmltok-type 'internal-subset-close))
1502 ((xmltok-prolog start pound)
1503 (xmltok-add-error "Expected name after #")
1504 (setq xmltok-type 'not-well-formed))
1505 ((xmltok-prolog start markup-declaration)
1506 (xmltok-add-error "Expected name or -- after <!")
1507 (setq xmltok-type 'not-well-formed))
1508 ((xmltok-prolog start comment-first-dash)
1509 (xmltok-add-error "Expected <!--")
1510 (setq xmltok-type 'not-well-formed))
1511 ((xmltok-prolog start less-than)
1512 (xmltok-add-error "Incomplete markup")
1513 (setq xmltok-type 'not-well-formed))
1514 (t (error "Unhandled token in prolog %s"
1515 (match-string-no-properties 0)))))
1516
1517 (defun xmltok-scan-prolog-literal ()
1518 (let* ((delim (string (char-before)))
1519 (safe-end (save-excursion
1520 (skip-chars-forward (concat "^<>[]" delim))
1521 (point)))
1522 (end (save-excursion
1523 (goto-char safe-end)
1524 (search-forward delim nil t))))
1525 (cond ((or (not end)
1526 (save-excursion
1527 (goto-char end)
1528 (looking-at "[ \t\r\n>%[]")))
1529 (goto-char end))
1530 ((eq (1+ safe-end) end)
1531 (goto-char end)
1532 (xmltok-add-error (format "Missing space after %s" delim)
1533 safe-end)))
1534 (setq xmltok-type 'literal)))
1535
1536 (defun xmltok-scan-prolog-after-processing-instruction-open ()
1537 (search-forward "?>" nil 'move)
1538 (let* ((end (point))
1539 (target
1540 (save-excursion
1541 (goto-char (+ xmltok-start 2))
1542 (and (looking-at (xmltok-ncname regexp))
1543 (or (memq (char-after (match-end 0))
1544 '(?\n ?\t ?\r ? ))
1545 (= (match-end 0) (- end 2)))
1546 (match-string-no-properties 0)))))
1547 (cond ((not target)
1548 (xmltok-add-error "\
1549 Processing instruction does not start with a name"
1550 (+ xmltok-start 2)
1551 (+ xmltok-start 3)))
1552 ((not (and (= (length target) 3)
1553 (let ((case-fold-search t))
1554 (string-match "xml" target)))))
1555 ((= xmltok-start 1)
1556 (xmltok-add-error "Invalid XML declaration"
1557 xmltok-start
1558 (point)))
1559 ((save-excursion
1560 (goto-char xmltok-start)
1561 (looking-at (xmltok-xml-declaration regexp)))
1562 (xmltok-add-error "XML declaration not at beginning of file"
1563 xmltok-start
1564 (point)))
1565 (t
1566 (xmltok-add-error "Processing instruction has target of xml"
1567 (+ xmltok-start 2)
1568 (+ xmltok-start 5))))
1569 (xmltok-add-prolog-region 'processing-instruction-left
1570 xmltok-start
1571 (+ xmltok-start
1572 2
1573 (if target
1574 (length target)
1575 0)))
1576 (xmltok-add-prolog-region 'processing-instruction-right
1577 (if target
1578 (save-excursion
1579 (goto-char (+ xmltok-start
1580 (length target)
1581 2))
1582 (skip-chars-forward " \t\r\n")
1583 (point))
1584 (+ xmltok-start 2))
1585 (point)))
1586 (setq xmltok-type 'processing-instruction))
1587
1588 (defun xmltok-parse-entities ()
1589 (let ((todo xmltok-dtd))
1590 (when (and (or xmltok-had-param-entity-ref
1591 xmltok-doctype-external-subset-flag)
1592 (not xmltok-standalone))
1593 (setq xmltok-dtd (cons t xmltok-dtd)))
1594 (while todo
1595 (xmltok-parse-entity (car todo))
1596 (setq todo (cdr todo)))))
1597
1598 (defun xmltok-parse-entity (name-def)
1599 (let ((def (cdr name-def))
1600 ;; in case its value is buffer local
1601 (xmltok-dtd xmltok-dtd)
1602 buf)
1603 (when (stringp def)
1604 (if (string-match "\\`[^&<\t\r\n]*\\'" def)
1605 (setcdr name-def (cons def def))
1606 (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops
1607 (setq buf (get-buffer-create
1608 (format " *Entity %s*" (car name-def))))
1609 (with-current-buffer buf
1610 (erase-buffer)
1611 (insert def)
1612 (goto-char (point-min))
1613 (setcdr name-def
1614 (xmltok-parse-entity-replacement)))
1615 (kill-buffer buf)))))
1616
1617 (defun xmltok-parse-entity-replacement ()
1618 (let ((def (cons "" "")))
1619 (while (let* ((start (point))
1620 (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t))
1621 (ch (and found (char-before)))
1622 (str (buffer-substring-no-properties
1623 start
1624 (if found
1625 (match-beginning 0)
1626 (point-max)))))
1627 (setq def
1628 (xmltok-append-entity-def def
1629 (cons str str)))
1630 (cond ((not found) nil)
1631 ((eq ch ?>)
1632 (setq def 'not-well-formed)
1633 nil)
1634 ((eq ch ?<)
1635 (xmltok-save
1636 (setq xmltok-start (1- (point)))
1637 (xmltok-scan-after-lt)
1638 (setq def
1639 (xmltok-append-entity-def
1640 def
1641 (cond ((memq xmltok-type
1642 '(start-tag
1643 end-tag
1644 empty-element))
1645 'element)
1646 ((memq xmltok-type
1647 '(comment
1648 processing-instruction))
1649 (cons "" nil))
1650 ((eq xmltok-type
1651 'cdata-section)
1652 (cons (buffer-substring-no-properties
1653 (+ xmltok-start 9)
1654 (- (point) 3))
1655 nil))
1656 (t 'not-well-formed)))))
1657 t)
1658 ((eq ch ?&)
1659 (let ((xmltok-start (1- (point)))
1660 xmltok-type
1661 xmltok-replacement
1662 xmltok-errors)
1663 (xmltok-scan-after-amp 'xmltok-handle-nested-entity)
1664 (cond ((eq xmltok-type 'entity-ref)
1665 (setq def
1666 (xmltok-append-entity-def
1667 def
1668 xmltok-replacement)))
1669 ((eq xmltok-type 'char-ref)
1670 (setq def
1671 (xmltok-append-entity-def
1672 def
1673 (if xmltok-replacement
1674 (cons xmltok-replacement
1675 xmltok-replacement)
1676 (and xmltok-errors 'not-well-formed)))))
1677 (t
1678 (setq def 'not-well-formed))))
1679 t)
1680 (t
1681 (setq def
1682 (xmltok-append-entity-def
1683 def
1684 (cons (match-string-no-properties 0)
1685 " ")))
1686 t))))
1687 def))
1688
1689 (defun xmltok-handle-nested-entity (start end)
1690 (let* ((name-def (assoc (buffer-substring-no-properties start end)
1691 xmltok-dtd))
1692 (def (cdr name-def)))
1693 (when (stringp def)
1694 (xmltok-parse-entity name-def)
1695 (setq def (cdr name-def)))
1696 (setq xmltok-replacement
1697 (cond ((null name-def)
1698 (if (eq (car xmltok-dtd) t)
1699 nil
1700 'not-well-formed))
1701 ((eq def 'unparsed) 'not-well-formed)
1702 (t def)))))
1703
1704 (defun xmltok-append-entity-def (d1 d2)
1705 (cond ((consp d1)
1706 (if (consp d2)
1707 (cons (concat (car d1) (car d2))
1708 (and (cdr d1)
1709 (cdr d2)
1710 (concat (cdr d1) (cdr d2))))
1711 d2))
1712 ((consp d2) d1)
1713 (t
1714 (let ((defs '(not-well-formed external element)))
1715 (while (not (or (eq (car defs) d1)
1716 (eq (car defs) d2)))
1717 (setq defs (cdr defs)))
1718 (car defs)))))
1719
1720 (defun xmltok-add-prolog-region (type start end)
1721 (setq xmltok-prolog-regions
1722 (cons (vector type start end)
1723 xmltok-prolog-regions)))
1724
1725 (defun xmltok-merge-attributes ()
1726 "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'.
1727 The members of the merged list are in order of occurrence in the
1728 document. The list may share list structure with `xmltok-attributes'
1729 and `xmltok-namespace-attributes'."
1730 (cond ((not xmltok-namespace-attributes)
1731 xmltok-attributes)
1732 ((not xmltok-attributes)
1733 xmltok-namespace-attributes)
1734 (t
1735 (let ((atts1 xmltok-attributes)
1736 (atts2 xmltok-namespace-attributes)
1737 merged)
1738 (while (and atts1 atts2)
1739 (cond ((< (xmltok-attribute-name-start (car atts1))
1740 (xmltok-attribute-name-start (car atts2)))
1741 (setq merged (cons (car atts1) merged))
1742 (setq atts1 (cdr atts1)))
1743 (t
1744 (setq merged (cons (car atts2) merged))
1745 (setq atts2 (cdr atts2)))))
1746 (setq merged (nreverse merged))
1747 (cond (atts1 (setq merged (nconc merged atts1)))
1748 (atts2 (setq merged (nconc merged atts2))))
1749 merged))))
1750
1751 ;;; Testing
1752
1753 (defun xmltok-forward-test ()
1754 (interactive)
1755 (if (xmltok-forward)
1756 (message "Scanned %s" xmltok-type)
1757 (message "Scanned nothing")))
1758
1759 (defun xmltok-next-prolog-token-test ()
1760 (interactive)
1761 (if (xmltok-next-prolog-token)
1762 (message "Scanned %s"
1763 (if (integerp xmltok-type)
1764 (string xmltok-type)
1765 xmltok-type))
1766 (message "Scanned end of file")))
1767
1768 (provide 'xmltok)
1769
1770 ;;; xmltok.el ends here