]> code.delx.au - pymsnt/blob - src/tlib/domish.py
Fixed bug with Twisted 1.3
[pymsnt] / src / tlib / domish.py
1 # -*- test-case-name: twisted.test.test_domish -*-
2 #
3 # Twisted, the Framework of Your Internet
4 # Copyright (C) 2001 Matthew W. Lefkowitz
5 #
6 # This library is free software; you can redistribute it and/or
7 # modify it under the terms of version 2.1 of the GNU Lesser General Public
8 # License as published by the Free Software Foundation.
9 #
10 # This library is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
14 #
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with this library; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
19 from __future__ import generators
20
21 import types
22
23 try:
24 import cStringIO as StringIO
25 except ImportError:
26 import StringIO
27
28 def _splitPrefix(name):
29 """Internal method for splitting a prefixed Element name into its respective parts """
30 ntok = name.split(":", 1)
31 if len(ntok) == 2:
32 return ntok
33 else:
34 return (None, ntok[0])
35
class _Serializer:
    """ Internal class which writes an Element tree, UTF-8 encoded, into a
    StringIO buffer.
    """
    def __init__(self, prefixes = None):
        """
        @param prefixes: Optional dictionary mapping namespace URIs to the
                         prefixes to use for them; a missing or empty
                         dictionary is replaced by a fresh one
        """
        self.prefixCounter = 0
        self.prefixes = prefixes or {}
        self.cio = StringIO.StringIO()

    def getValue(self):
        """Return everything written to the buffer so far"""
        return self.cio.getvalue()

    def getPrefix(self, uri):
        """Return the prefix bound to uri, allocating the next "xnN" prefix
        the first time a URI is seen
        """
        known = self.prefixes
        if uri not in known:
            known[uri] = "xn%d" % (self.prefixCounter)
            self.prefixCounter = self.prefixCounter + 1
        return known[uri]

    def serialize(self, elem, closeElement = 1):
        """Write elem (an Element, a string of character data, or a
        SerializedXML chunk) to the buffer.

        @param elem: Node to serialize
        @param closeElement: When 0, only the start tag of an Element is
                             written (left open with ">") and its children
                             are skipped
        """
        out = self.cio.write

        # Pre-serialized XML goes out verbatim
        if isinstance(elem, SerializedXML):
            out(elem.encode("utf-8"))
            return

        # Plain strings are character data; they only need escaping
        if isinstance(elem, types.StringTypes):
            out(escapeToXml(elem).encode("utf-8"))
            return

        tagName = elem.name
        tagUri = elem.uri
        nsUri = elem.defaultUri
        owner = elem.parent

        # True when the element's own name lives in its default namespace,
        # i.e. no prefix is needed on the tag
        unprefixed = (nsUri == tagUri)
        # True when the default namespace need not be (re)declared here
        inheritsDefault = (owner is None or nsUri == owner.defaultUri)

        # Start tag, including whatever namespace declarations are needed
        if unprefixed:
            if inheritsDefault:
                out("<%s" % (tagName))
            else:
                out("<%s xmlns='%s' " % (tagName, nsUri))
        else:
            pfx = self.getPrefix(tagUri)
            if inheritsDefault:
                out("<%s:%s xmlns:%s='%s'" % (pfx, tagName, pfx, tagUri))
            else:
                out("<%s:%s xmlns:%s='%s' xmlns='%s'" % (pfx, tagName, pfx, tagUri, nsUri))

        # Attributes; a tuple key is a (uri, name) qualified attribute
        for key, value in elem.attributes.items():
            if isinstance(key, tuple):
                chunk = " %s:%s='%s'" % (self.getPrefix(key[0]), key[1], escapeToXml(value, 1))
            else:
                chunk = " %s='%s'" % (key, escapeToXml(value, 1))
            out(chunk.encode("utf-8"))

        # Caller only wants the opening tag
        if closeElement == 0:
            out(">")
            return

        # Childless elements use the self-closing form
        if len(elem.children) == 0:
            out("/>")
            return

        out(">")
        for child in elem.children:
            self.serialize(child)

        # End tag, using the same prefix as the start tag
        if unprefixed:
            out("</%s>" % (tagName))
        else:
            out("</%s:%s>" % (self.getPrefix(tagUri), tagName))
112
class _ListSerializer:
    """ Internal class which collects the serialized pieces of an Element
    tree in a list and joins and UTF-8 encodes them on request.
    """
    def __init__(self, prefixes = None):
        """
        @param prefixes: Optional dictionary mapping namespace URIs to the
                         prefixes to use for them; a missing or empty
                         dictionary is replaced by a fresh one
        """
        self.prefixCounter = 0
        self.prefixes = prefixes or {}
        self.writelist = []

    def getValue(self):
        """Join all collected fragments and return them UTF-8 encoded"""
        return "".join(self.writelist).encode("utf-8")

    def getPrefix(self, uri):
        """Return the prefix bound to uri, allocating the next "xnN" prefix
        the first time a URI is seen
        """
        known = self.prefixes
        if uri not in known:
            known[uri] = "xn%d" % (self.prefixCounter)
            self.prefixCounter = self.prefixCounter + 1
        return known[uri]

    def serialize(self, elem, closeElement = 1):
        """Append the serialized form of elem (an Element, a string of
        character data, or a SerializedXML chunk) to the fragment list.

        @param elem: Node to serialize
        @param closeElement: When 0, only the start tag of an Element is
                             emitted (left open with ">") and its children
                             are skipped
        """
        emit = self.writelist.append

        # Pre-serialized XML is stored untouched
        if isinstance(elem, SerializedXML):
            emit(elem)
            return

        # Plain strings are character data; they only need escaping
        if isinstance(elem, types.StringTypes):
            emit(escapeToXml(elem))
            return

        tagName = elem.name
        tagUri = elem.uri
        nsUri = elem.defaultUri
        owner = elem.parent

        # True when the element's own name lives in its default namespace,
        # i.e. no prefix is needed on the tag
        unprefixed = (nsUri == tagUri)
        # True when the default namespace need not be (re)declared here
        inheritsDefault = (owner is None or nsUri == owner.defaultUri)

        # Start tag, including whatever namespace declarations are needed
        if unprefixed:
            if inheritsDefault:
                emit("<%s" % (tagName))
            else:
                emit("<%s xmlns='%s' " % (tagName, nsUri))
        else:
            pfx = self.getPrefix(tagUri)
            if inheritsDefault:
                emit("<%s:%s xmlns:%s='%s'" % (pfx, tagName, pfx, tagUri))
            else:
                emit("<%s:%s xmlns:%s='%s' xmlns='%s'" % (pfx, tagName, pfx, tagUri, nsUri))

        # Attributes; a tuple key is a (uri, name) qualified attribute
        for key, value in elem.attributes.items():
            if isinstance(key, tuple):
                emit(" %s:%s='%s'" % (self.getPrefix(key[0]), key[1], escapeToXml(value, 1)))
            else:
                emit(" %s='%s'" % (key, escapeToXml(value, 1)))

        # Caller only wants the opening tag
        if closeElement == 0:
            emit(">")
            return

        # Childless elements use the self-closing form
        if len(elem.children) == 0:
            emit("/>")
            return

        emit(">")
        for child in elem.children:
            self.serialize(child)

        # End tag, using the same prefix as the start tag
        if unprefixed:
            emit("</%s>" % (tagName))
        else:
            emit("</%s:%s>" % (self.getPrefix(tagUri), tagName))
189
190
# Serializer implementation instantiated by Element.toXml(). _ListSerializer
# exposes the same constructor/serialize/getValue interface.
SerializerClass = _Serializer
192
def escapeToXml(text, isattrib = 0):
    """Escape text to proper XML form, per section 2.3 in the XML specification.

    @type text: L{str}
    @param text: Text to escape

    @type isattrib: L{Boolean}
    @param isattrib: When equal to 1, quote characters are escaped as well so
                     that the result can be used as an attribute value

    @return: The escaped text
    """
    # The ampersand has to come first; otherwise the entities produced by the
    # later replacements would themselves be escaped.
    substitutions = [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")]
    if isattrib == 1:
        substitutions.append(("'", "&apos;"))
        substitutions.append(("\"", "&quot;"))

    for raw, entity in substitutions:
        text = text.replace(raw, entity)
    return text
209
def unescapeFromXml(text):
    """Replace the five predefined XML entities in text with the characters
    they stand for.

    @type text: L{str}
    @param text: Text to unescape

    @return: The unescaped text
    """
    # "&amp;" is handled last so that text such as "&amp;lt;" comes out as
    # "&lt;" rather than being unescaped twice.
    substitutions = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&quot;", "\""),
        ("&amp;", "&"),
    )
    for entity, raw in substitutions:
        text = text.replace(entity, raw)
    return text
217
def generateOnlyKlass(list, klass):
    """ Filters items in a list by class

    Yields, in order, every item whose class is exactly klass; instances of
    subclasses are not yielded.
    """
    for candidate in list:
        if not (candidate.__class__ == klass):
            continue
        yield candidate
224
def generateElementsQNamed(list, name, uri):
    """ Filters Element items in a list with matching name and URI

    Yields, in order, every item whose class is exactly Element and whose
    name and uri attributes equal the given values.
    """
    for candidate in list:
        if candidate.__class__ == Element:
            if candidate.name == name and candidate.uri == uri:
                yield candidate
231
def generateElementsNamed(list, name):
    """ Filters Element items in a list with matching name, regardless of URI

    Yields, in order, every item whose class is exactly Element and whose
    name attribute equals the given name.
    """
    for candidate in list:
        if candidate.__class__ == Element:
            if candidate.name == name:
                yield candidate
238
239
class SerializedXML(str):
    """ Marker type for a string that already holds serialized XML.

    It adds nothing to str; the serializers in this module test for it with
    isinstance() and write such children out without escaping them.
    """
243
244
class Namespace:
    """ Convenience object for tracking namespace declarations

    Attribute access and item access both build a qualified name tuple in
    this namespace: Namespace("urn:x").foo == ("urn:x", "foo") and
    Namespace("urn:x")["foo"] == ("urn:x", "foo").
    """
    def __init__(self, uri):
        """
        @param uri: URI of the namespace the generated names belong to
        """
        self._uri = uri

    def __getattr__(self, n):
        """Return the qualified name (uri, n).

        Only called when normal attribute lookup fails. Special names
        ("__xxx__") and the internal "_uri" slot raise AttributeError instead
        of being turned into a tuple: callers such as copy.deepcopy() look up
        hooks like __deepcopy__ on the instance and would otherwise try to
        call the returned tuple, and a missing "_uri" would recurse forever.
        """
        if n == "_uri" or (n.startswith("__") and n.endswith("__")):
            raise AttributeError(n)
        return (self._uri, n)

    def __getitem__(self, n):
        """Return the qualified name (uri, n); use this form for names that
        are not valid Python identifiers.
        """
        return (self._uri, n)
254
255
class Element(object):
    """Object representing a container (a.k.a. tag or element) in an HTML or XML document.

    An Element contains a series of attributes (name/value pairs),
    content (character data), and other child Element objects. When building a document
    with markup (such as HTML or XML), use this object as the starting point.

    Child Elements can be reached as attributes (C{elem.body} is the first
    child Element named "body", or None) and XML attributes through item
    access (C{elem["id"]}).

    @type uri: C{str}
    @ivar uri: URI of this Element's name

    @type defaultUri: C{str}
    @ivar defaultUri: URI this Element exists within

    @type name: C{str}
    @ivar name: Name of this Element

    @type children: C{list}
    @ivar children: List of child Elements and content

    @type parent: C{Element}
    @ivar parent: Reference to the parent Element, if any.

    @type attributes: C{dict}
    @ivar attributes: Dictionary of attributes associated with this Element.

    """
    # Source of the ids handed out by addUniqueId(); shared by all Elements
    _idCounter = 0

    def __init__(self, qname, defaultUri = None, attribs = None):
        """
        @param qname: Tuple of (uri, name)
        @param defaultUri: The default URI of the element; defaults to the URI specified in L{qname}
        @param attribs: Dictionary of attributes
        """
        self.uri, self.name = qname
        self.defaultUri = defaultUri or self.uri
        self.attributes = attribs or {}
        self.children = []
        self.parent = None

    def __getattr__(self, key):
        """Return the first child Element named key, or None if there is none.

        Only called when normal attribute lookup fails. Special names
        ("__xxx__") that match no child raise AttributeError, so that hook
        probes made by copy and pickle (__deepcopy__, __getstate__,
        __setstate__, ...) see "not defined" instead of None.
        """
        # Read the child list from the instance dictionary: going through
        # self.children would recurse endlessly on an instance whose
        # attributes have not been set yet (e.g. while being copied).
        for n in self.__dict__.get("children", ()):
            if n.__class__ == Element and n.name == key:
                return n

        if key.startswith("__") and key.endswith("__"):
            raise AttributeError(key)

        # Tweak the behaviour so that it's more friendly about not
        # finding elements -- we need to document this somewhere :)
        return None

    def __getitem__(self, key):
        """Return the value of attribute key; raises KeyError if absent"""
        return self.attributes[self._dqa(key)]

    def __delitem__(self, key):
        """Remove attribute key; raises KeyError if absent"""
        del self.attributes[self._dqa(key)]

    def __setitem__(self, key, value):
        """Set attribute key to value"""
        self.attributes[self._dqa(key)] = value

    def __str__(self):
        """ Retrieve the first CData (content) node, or "" if there is none
        """
        for n in self.children:
            if isinstance(n, types.StringTypes):
                return n
        return ""

    def _dqa(self, attr):
        """Dequalify an attribute key as needed

        A (uri, name) tuple whose uri is this Element's own uri is reduced to
        the bare name; anything else is returned unchanged.
        """
        if isinstance(attr, tuple) and attr[0] == self.uri:
            return attr[1]
        else:
            return attr

    def getAttribute(self, attribname, default = None):
        """Retrieve the value of attribname, if it exists, else default

        The name is dequalified first, like the other attribute accessors do;
        the key exactly as given is still tried as a fallback.
        """
        key = self._dqa(attribname)
        if key in self.attributes:
            return self.attributes[key]
        return self.attributes.get(attribname, default)

    def hasAttribute(self, attrib):
        """Determine if the specified attribute exists """
        return self._dqa(attrib) in self.attributes

    def compareAttribute(self, attrib, value):
        """Safely compare the value of an attribute against a provided value; None-safe. """
        return self.attributes.get(self._dqa(attrib), None) == value

    def swapAttributeValues(self, left, right):
        """Swap the values of two attributes; both must exist (KeyError otherwise)"""
        d = self.attributes
        d[left], d[right] = d[right], d[left]

    def addChild(self, node):
        """Add a child to this Element and return it

        Only nodes whose class is exactly Element get their parent set;
        other nodes (such as strings) are appended as they are.
        """
        if node.__class__ == Element:
            node.parent = self
        self.children.append(node)
        return node

    def addContent(self, text):
        """Add some text data to this element

        Text is merged into the last child if that is a string too, so
        consecutive calls build a single content node. Returns that node.
        """
        c = self.children
        if len(c) > 0 and isinstance(c[-1], types.StringTypes):
            c[-1] = c[-1] + text
        else:
            c.append(text)
        return c[-1]

    def addElement(self, name, defaultUri = None, content = None):
        """Add a new child Element to this Element; preferred method

        @param name: Either a (uri, name) tuple or a bare name, which is
                     placed in this Element's uri
        @param defaultUri: Default URI of the child; defaults to the uri of a
                           tuple name, or to this Element's defaultUri for a
                           bare name
        @param content: Optional initial content of the child; ignored when
                        false (e.g. None or "")
        @return: The new child Element
        """
        if isinstance(name, tuple):
            defaultUri = defaultUri or name[0]
            result = Element(name, defaultUri)
        else:
            defaultUri = defaultUri or self.defaultUri
            result = Element((self.uri, name), defaultUri)

        self.children.append(result)
        result.parent = self

        if content:
            result.children.append(content)

        return result

    def addRawXml(self, rawxmlstring):
        """Add a pre-serialized chunk o' XML as a child of this Element.
        """
        self.children.append(SerializedXML(rawxmlstring))

    def addUniqueId(self):
        """Add a unique (across a given Python session) id attribute to this Element"""
        self.attributes["id"] = "H_%d" % Element._idCounter
        Element._idCounter = Element._idCounter + 1

    def elements(self):
        """Iterate across all children of this Element that are Elements"""
        return generateOnlyKlass(self.children, Element)

    def toXml(self, prefixes = None, closeElement = 1):
        """Serialize this Element and all children to a string

        @param prefixes: Optional dictionary of uri -> prefix handed to the serializer
        @param closeElement: When 0, only the opening tag is produced
        """
        s = SerializerClass(prefixes)
        s.serialize(self, closeElement)
        return s.getValue()

    def firstChildElement(self):
        """Return the first child that is an Element, or None"""
        for c in self.children:
            if c.__class__ == Element:
                return c
        return None

    def getElement(self, tagName):
        """Return the first child Element named tagName, or None"""
        for child in self.elements():
            if child.name == tagName:
                return child
        return None
412
413
class ParserError(Exception):
    """ Exception raised by the element streams when the incoming XML is
    malformed (for example a mismatched closing tag)
    """
417
def elementStream():
    """ Preferred method to construct an ElementStream

    Returns an ExpatElementStream; if constructing it raises ImportError
    (the stream imports pyexpat in its constructor), a SuxElementStream is
    returned instead.
    """
    try:
        return ExpatElementStream()
    except ImportError:
        return SuxElementStream()
429
430 from twisted.protocols import sux
class SuxElementStream(sux.XMLParser):
    """ Element stream built on the sux XML parser.

    Feed data through parse(). Three callbacks, which the user must assign
    before parsing, report progress:

      - DocumentStartEvent(rootElement) when the first start tag is seen
      - ElementEvent(element) when an element that is not nested in another
        collected element (i.e. a direct child of the root) is closed
      - DocumentEndEvent() when the root element is closed
    """
    # The five predefined XML entities, keyed by reference name
    entities = { "amp" : "&",
                 "lt"  : "<",
                 "gt"  : ">",
                 "apos": "'",
                 "quot": "\"" }

    def __init__(self):
        self.connectionMade()
        self.DocumentStartEvent = None
        self.ElementEvent = None
        self.DocumentEndEvent = None
        # Element currently being filled in, or None between top-level elements
        self.currElem = None
        # Root element of the document; reset to None once it has been closed
        self.rootElem = None
        self.documentStarted = False
        # One default namespace URI / one {prefix: uri} dict per open element
        self.defaultNsStack = []
        self.prefixStack = []
        self.parse = self.dataReceived

    def findUri(self, prefix):
        """Return the URI bound to prefix by the innermost open element that
        declares it, or None if no open element does
        """
        stack = self.prefixStack
        for i in range(len(stack) - 1, -1, -1):
            if prefix in stack[i]:
                return stack[i][prefix]
        return None

    def gotTagStart(self, name, attributes):
        """Handle a start tag: resolve namespaces, build the Element and
        either attach it to the current element or start a new one
        """
        defaultUri = None
        localPrefixes = {}
        attribs = {}
        uri = None

        # Pass 1 - Identify namespace decls. Only "xmlns" itself and names
        # starting with "xmlns:" are declarations; an ordinary attribute
        # whose name merely begins with "xmlns" is left alone. Iterate over a
        # copy since declarations are removed from the dictionary as we go.
        for k, v in list(attributes.items()):
            if k == "xmlns":
                defaultUri = v
            elif k.startswith("xmlns:"):
                localPrefixes[k[len("xmlns:"):]] = v
            else:
                continue
            del attributes[k]

        # Push namespace decls onto prefix stack
        self.prefixStack.append(localPrefixes)

        # Determine default namespace for this element; if there
        # is one
        if defaultUri is None and len(self.defaultNsStack) > 0:
            defaultUri = self.defaultNsStack[-1]

        # Fix up name
        prefix, name = _splitPrefix(name)
        if prefix is None:   # This element is in the default namespace
            uri = defaultUri
        else:
            # Find the URI for the prefix
            uri = self.findUri(prefix)

        # Pass 2 - Fix up and escape attributes
        # NOTE(review): only prefixed attribute values are unescaped here;
        # unprefixed values are stored exactly as received -- confirm this
        # asymmetry is intended.
        for k, v in attributes.items():
            p, n = _splitPrefix(k)
            if p is None:
                attribs[n] = v
            else:
                attribs[(self.findUri(p)), n] = unescapeFromXml(v)

        # Construct the actual Element object
        e = Element((uri, name), defaultUri, attribs)

        # Save current default namespace
        self.defaultNsStack.append(defaultUri)

        # Document already started
        if self.documentStarted:
            # Starting a new packet
            if self.currElem is None:
                self.currElem = e
            # Adding to existing element
            else:
                self.currElem = self.currElem.addChild(e)
        # New document
        else:
            self.rootElem = e
            self.documentStarted = True
            self.DocumentStartEvent(e)

    def gotText(self, data):
        """Add character data to the current element, if there is one"""
        if self.currElem is not None:
            self.currElem.addContent(data)

    def gotCData(self, data):
        """Add the content of a CDATA section to the current element, if there is one"""
        if self.currElem is not None:
            self.currElem.addContent(data)

    def gotComment(self, data):
        # Ignore comments for the moment
        pass

    def gotEntityReference(self, entityRef):
        """Add the character for a predefined entity to the current element

        Unknown entities are dropped, and so are references that arrive
        while no element is being collected (as with text and CDATA).
        """
        if self.currElem is None:
            return
        if entityRef in SuxElementStream.entities:
            self.currElem.addContent(SuxElementStream.entities[entityRef])

    def gotTagEnd(self, name):
        """Handle an end tag: check that it matches what is open and fire
        DocumentEndEvent or ElementEvent as appropriate

        Raises ParserError if the document has already ended (or never
        started) or if the tag does not match the element being closed.
        """
        # Ensure the document hasn't already ended
        if self.rootElem is None:
            # XXX: Write more legible explanation
            raise ParserError("Element closed after end of document.")

        # Fix up name
        prefix, name = _splitPrefix(name)
        if prefix is None:
            uri = self.defaultNsStack[-1]
        else:
            uri = self.findUri(prefix)

        # End of document
        if self.currElem is None:
            # Ensure element name and uri matches
            if self.rootElem.name != name or self.rootElem.uri != uri:
                raise ParserError("Mismatched root elements")
            self.DocumentEndEvent()
            self.rootElem = None

        # Other elements
        else:
            # Ensure the tag being closed matches the name of the current
            # element
            if self.currElem.name != name or self.currElem.uri != uri:
                # XXX: Write more legible explanation
                raise ParserError("Malformed element close")

            # Pop prefix and default NS stack
            self.prefixStack.pop()
            self.defaultNsStack.pop()

            # Check for parent null parent of current elem;
            # that's the top of the stack
            if self.currElem.parent is None:
                self.ElementEvent(self.currElem)
                self.currElem = None

            # Anything else is just some element wrapping up
            else:
                self.currElem = self.currElem.parent
579
580
class ExpatElementStream:
    """ Element stream built on the pyexpat parser.

    Feed data through parse(). Three callbacks, which the user must assign
    before parsing, report progress:

      - DocumentStartEvent(rootElement) when the first start tag is seen
      - ElementEvent(element) when an element that has no parent Element
        (i.e. a direct child of the root) is closed
      - DocumentEndEvent() when an end tag arrives while no element is
        being collected

    pyexpat is imported in the constructor, so constructing an instance
    raises ImportError when it is unavailable.
    """
    def __init__(self):
        import pyexpat
        self.DocumentStartEvent = None
        self.ElementEvent = None
        self.DocumentEndEvent = None
        # The second argument turns on namespace processing with " " as the
        # separator placed between a URI and a local name
        self.parser = pyexpat.ParserCreate("UTF-8", " ")
        self.parser.StartElementHandler = self._onStartElement
        self.parser.EndElementHandler = self._onEndElement
        self.parser.CharacterDataHandler = self._onCdata
        self.parser.StartNamespaceDeclHandler = self._onStartNamespace
        self.parser.EndNamespaceDeclHandler = self._onEndNamespace
        # Element currently being filled in, or None between top-level elements
        self.currElem = None
        # Default namespace URIs currently in scope, innermost last
        self.defaultNsStack = []
        self.documentStarted = 0

    def parse(self, buffer):
        """Feed a chunk of XML to the parser"""
        self.parser.Parse(buffer)

    def _splitQName(self, name):
        """Split a "uri localname" name from the parser into a (uri, localname)
        tuple; a name without a namespace yields (None, name)
        """
        # Split at the last space: a local name never contains one
        idx = name.rfind(" ")
        if idx == -1:
            return (None, name)
        return (name[:idx], name[idx + 1:])

    def _onStartElement(self, name, attrs):
        # Generate a qname tuple from the provided name; this also copes
        # with elements that are in no namespace at all
        qname = self._splitQName(name)

        # Process attributes: namespaced ones are re-keyed by a (uri, name)
        # tuple. Iterate over a copy since the dictionary is modified.
        for k, v in list(attrs.items()):
            if k.find(" ") != -1:
                attrs[self._splitQName(k)] = v
                del attrs[k]

        # No default namespace may have been declared yet
        if self.defaultNsStack:
            defaultUri = self.defaultNsStack[-1]
        else:
            defaultUri = None

        # Construct the new element
        e = Element(qname, defaultUri, attrs)

        # Document already started
        if self.documentStarted == 1:
            if self.currElem is not None:
                self.currElem.children.append(e)
                e.parent = self.currElem
            self.currElem = e

        # New document
        else:
            self.documentStarted = 1
            self.DocumentStartEvent(e)

    def _onEndElement(self, _):
        # Check for null current elem; end of doc
        if self.currElem is None:
            self.DocumentEndEvent()

        # Check for parent that is None; that's
        # the top of the stack
        elif self.currElem.parent is None:
            self.ElementEvent(self.currElem)
            self.currElem = None

        # Anything else is just some element in the current
        # packet wrapping up
        else:
            self.currElem = self.currElem.parent

    def _onCdata(self, data):
        # Character data outside of a collected element is dropped
        if self.currElem is not None:
            self.currElem.addContent(data)

    def _onStartNamespace(self, prefix, uri):
        # If this is the default namespace, put
        # it on the stack
        if prefix is None:
            self.defaultNsStack.append(uri)

    def _onEndNamespace(self, prefix):
        # Remove last element on the stack
        if prefix is None:
            self.defaultNsStack.pop()
657
658
659
def parseText(text):
    """Parse the XML document held in the string text with a TextParser and
    return the parser's root Element
    """
    parser = TextParser()
    parser.parseString(text)
    return parser.root
664
def parseFile(filename):
    """Parse the XML document stored in the file named filename with a
    TextParser and return the parser's root Element
    """
    parser = TextParser()
    parser.parseFile(filename)
    return parser.root
669
class TextParser:
    """ Taken from http://xoomer.virgilio.it/dialtone/rsschannel.py

    Parses a complete XML document into a single Element tree using a
    SuxElementStream; the tree is available as the root attribute.
    """

    def __init__(self):
        # Root Element of the last parsed document; None until one is seen
        self.root = None

    def parseFile(self, filename):
        """Read the whole file named filename, parse it and return the root Element"""
        f = open(filename)
        # Close the file even if reading fails
        try:
            data = f.read()
        finally:
            f.close()
        return self.parseString(data)

    def parseString(self, data):
        """Parse the XML document in data and return the root Element"""
        es = SuxElementStream()
        es.DocumentStartEvent = self.docStart
        es.DocumentEndEvent = self.docEnd
        es.ElementEvent = self.element
        es.parse(data)
        return self.root

    def docStart(self, e):
        """Stream callback: remember the document's root element"""
        self.root = e

    def docEnd(self):
        """Stream callback: nothing to do at the end of the document"""
        pass

    def element(self, e):
        """Stream callback: attach a completed top-level element to the root"""
        self.root.addChild(e)
695
696
697 ## class FileParser(ElementStream):
698 ## def __init__(self):
699 ## ElementStream.__init__(self)
700 ## self.DocumentStartEvent = self.docStart
701 ## self.ElementEvent = self.elem
702 ## self.DocumentEndEvent = self.docEnd
703 ## self.done = 0
704
705 ## def docStart(self, elem):
706 ## self.document = elem
707
708 ## def elem(self, elem):
709 ## self.document.addChild(elem)
710
711 ## def docEnd(self):
712 ## self.done = 1
713
714 ## def parse(self, filename):
715 ## for l in open(filename).readlines():
716 ## self.parser.Parse(l)
717 ## assert self.done == 1
718 ## return self.document
719
720 ## def parseFile(filename):
721 ## return FileParser().parse(filename)
722
723