1 # -*- test-case-name: twisted.test.test_domish -*-
3 # Twisted, the Framework of Your Internet
4 # Copyright (C) 2001 Matthew W. Lefkowitz
6 # This library is free software; you can redistribute it and/or
7 # modify it under the terms of version 2.1 of the GNU Lesser General Public
8 # License as published by the Free Software Foundation.
10 # This library is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with this library; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 from __future__
import generators
24 import cStringIO
as StringIO
def _splitPrefix(name):
    """Internal method for splitting a prefixed Element name into its respective parts

    Only the first colon is treated as the prefix separator.

    @type name: C{str}
    @param name: Possibly prefixed name, e.g. C{"stream:features"} or C{"message"}
    @return: 2-tuple C{(prefix, localname)}; C{prefix} is C{None} when
             C{name} contains no colon.
    """
    ntok = name.split(":", 1)
    if len(ntok) == 2:
        # Always hand back a tuple so both outcomes have the same type
        return tuple(ntok)
    return (None, ntok[0])
37 """ Internal class which serializes an Element tree into a buffer """
def __init__(self, prefixes = None):
    """Set up an empty output buffer and the uri -> prefix table.

    @type prefixes: C{dict}
    @param prefixes: Optional mapping of namespace URI to prefix; a false
                     value (C{None} or an empty dict) is replaced by a
                     fresh empty dict.
    """
    self.cio = StringIO.StringIO()
    self.prefixes = prefixes or {}
    # Suffix of the next generated "xn%d" prefix
    self.prefixCounter = 0
44 return self
.cio
.getvalue()
def getPrefix(self, uri):
    """Return the prefix bound to C{uri}, allocating one on first use.

    @param uri: Namespace URI to look up
    @return: The prefix already stored in C{self.prefixes} for C{uri}, or
             a newly generated C{"xn<counter>"} prefix which is stored
             before being returned.
    """
    prefixes = self.prefixes
    # "in" replaces dict.has_key, which no longer exists in Python 3
    if uri not in prefixes:
        prefixes[uri] = "xn%d" % (self.prefixCounter,)
        self.prefixCounter += 1
    return prefixes[uri]
52 def serialize(self
, elem
, closeElement
= 1):
53 # Optimization shortcuts
54 write
= self
.cio
.write
56 # Shortcut, check to see if elem is actually a chunk o' serialized XML
57 if isinstance(elem
, SerializedXML
):
58 write(elem
.encode("utf-8"))
61 # Shortcut, check to see if elem is actually a string (aka Cdata)
62 if isinstance(elem
, types
.StringTypes
):
63 write(escapeToXml(elem
).encode("utf-8"))
66 # Further optimizations
70 defaultUri
= elem
.defaultUri
73 # Serialize element name
75 if parent
== None or defaultUri
== parent
.defaultUri
:
78 write("<%s xmlns='%s' " % (name
, defaultUri
))
80 prefix
= self
.getPrefix(uri
)
81 if parent
== None or elem
.defaultUri
== parent
.defaultUri
:
82 write("<%s:%s xmlns:%s='%s'" % (prefix
, name
, prefix
, uri
))
84 write("<%s:%s xmlns:%s='%s' xmlns='%s'" % (prefix
, name
, prefix
, uri
, defaultUri
))
86 # Serialize attributes
87 for k
,v
in elem
.attributes
.items():
88 # If the attribute name is a list, it's a qualified attribute
89 if isinstance(k
, types
.TupleType
):
90 write(" %s:%s='%s'" % (self
.getPrefix(k
[0]), k
[1], escapeToXml(v
, 1)).encode("utf-8"))
92 write((" %s='%s'" % ( k
, escapeToXml(v
, 1))).encode("utf-8"))
94 # Shortcut out if this is only going to return
95 # the element (i.e. no children)
101 if len(elem
.children
) > 0:
103 for c
in elem
.children
:
106 if defaultUri
== uri
:
107 write("</%s>" % (name
))
109 write("</%s:%s>" % (self
.getPrefix(uri
), name
))
113 class _ListSerializer
:
114 """ Internal class which serializes an Element tree into a buffer """
115 def __init__(self
, prefixes
= None):
117 self
.prefixes
= prefixes
or {}
118 self
.prefixCounter
= 0
121 d
= "".join(self
.writelist
)
122 return d
.encode("utf-8")
def getPrefix(self, uri):
    """Return the prefix bound to C{uri}, allocating one on first use.

    @param uri: Namespace URI to look up
    @return: The prefix already stored in C{self.prefixes} for C{uri}, or
             a newly generated C{"xn<counter>"} prefix which is stored
             before being returned.
    """
    known = self.prefixes
    # Membership test instead of dict.has_key (removed in Python 3)
    if uri not in known:
        known[uri] = "xn%d" % (self.prefixCounter,)
        self.prefixCounter += 1
    return known[uri]
130 def serialize(self
, elem
, closeElement
= 1):
131 # Optimization shortcuts
132 write
= self
.writelist
.append
134 # Shortcut, check to see if elem is actually a chunk o' serialized XML
135 if isinstance(elem
, SerializedXML
):
139 # Shortcut, check to see if elem is actually a string (aka Cdata)
140 if isinstance(elem
, types
.StringTypes
):
141 write(escapeToXml(elem
))
144 # Further optimizations
148 defaultUri
= elem
.defaultUri
150 # Serialize element name
151 if defaultUri
== uri
:
152 if parent
== None or defaultUri
== parent
.defaultUri
:
153 write("<%s" % (name
))
155 write("<%s xmlns='%s' " % (name
, defaultUri
))
157 prefix
= self
.getPrefix(uri
)
158 if parent
== None or elem
.defaultUri
== parent
.defaultUri
:
159 write("<%s:%s xmlns:%s='%s'" % (prefix
, name
, prefix
, uri
))
161 write("<%s:%s xmlns:%s='%s' xmlns='%s'" % (prefix
, name
, prefix
, uri
, defaultUri
))
163 # Serialize attributes
164 for k
,v
in elem
.attributes
.items():
165 # If the attribute name is a list, it's a qualified attribute
166 if isinstance(k
, types
.TupleType
):
167 write(" %s:%s='%s'" % (self
.getPrefix(k
[0]), k
[1], escapeToXml(v
, 1)))
169 write((" %s='%s'" % ( k
, escapeToXml(v
, 1))))
171 # Shortcut out if this is only going to return
172 # the element (i.e. no children)
173 if closeElement
== 0:
178 if len(elem
.children
) > 0:
180 for c
in elem
.children
:
183 if defaultUri
== uri
:
184 write("</%s>" % (name
))
186 write("</%s:%s>" % (self
.getPrefix(uri
), name
))
191 SerializerClass
= _Serializer
def escapeToXml(text, isattrib = 0):
    """Escape text to proper XML form, per section 2.3 in the XML specification.

    @type text: C{str}
    @param text: Text to escape

    @type isattrib: L{Boolean}
    @param isattrib: Triggers escaping of characters necessary for use as attribute values

    @return: C{text} with the markup characters replaced by entity references
    """
    # "&" has to go first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped again.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    if isattrib:
        # Quote characters only need escaping inside attribute values
        text = text.replace("'", "&apos;")
        text = text.replace("\"", "&quot;")
    return text
def unescapeFromXml(text):
    """Replace the five predefined XML entity references with their characters.

    @type text: C{str}
    @param text: Text containing entity references
    @return: C{text} with C{&lt;}, C{&gt;}, C{&apos;}, C{&quot;} and
             C{&amp;} decoded
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&apos;", "'")
    text = text.replace("&quot;", "\"")
    # "&amp;" is decoded last so that e.g. "&amp;lt;" yields the literal
    # text "&lt;" rather than "<".
    text = text.replace("&amp;", "&")
    return text
def generateOnlyKlass(list, klass):
    """ Filters items in a list by class

    Generator yielding, in order, every item whose class is exactly
    C{klass}; instances of subclasses are not yielded.

    @param list: Iterable of items to filter
    @param klass: Class an item must be a direct instance of
    """
    for n in list:
        if n.__class__ == klass:
            yield n
def generateElementsQNamed(list, name, uri):
    """ Filters Element items in a list with matching name and URI

    Generator yielding, in order, every item that is exactly an
    C{Element} and whose C{name} and C{uri} both match.

    @param list: Iterable of child nodes
    @param name: Element name to match
    @param uri: Element URI to match
    """
    for n in list:
        if n.__class__ == Element and n.name == name and n.uri == uri:
            yield n
def generateElementsNamed(list, name):
    """ Filters Element items in a list with matching name, regardless of URI

    Generator yielding, in order, every item that is exactly an
    C{Element} and whose C{name} matches.

    @param list: Iterable of child nodes
    @param name: Element name to match
    """
    for n in list:
        if n.__class__ == Element and n.name == name:
            yield n
class SerializedXML(str):
    """ Marker class for pre-serialized XML in the DOM

    Adds nothing to C{str}; the separate type only lets code recognise
    such nodes with C{isinstance}.
    """
246 """ Convenience object for tracking namespace declarations
248 def __init__(self
, uri
):
250 def __getattr__(self
, n
):
251 return (self
._uri
, n
)
252 def __getitem__(self
, n
):
253 return (self
._uri
, n
)
256 class Element(object):
257 """Object representing a container (a.k.a. tag or element) in an HTML or XML document.
259 An Element contains a series of attributes (name/value pairs),
260 content (character data), and other child Element objects. When building a document
261 with markup (such as HTML or XML), use this object as the starting point.
264 @ivar uri: URI of this Element's name
266 @type defaultUri: C{str}
267 @ivar defaultUri: URI this Element exists within
270 @ivar name: Name of this Element
272 @type children: C{list}
273 @ivar children: List of child Elements and content
275 @type parent: C{Element}
276 @ivar parent: Reference to the parent Element, if any.
278 @type attributes: C{dict}
279 @ivar attributes: Dictionary of attributes associated with this Element.
283 def __init__(self
, qname
, defaultUri
= None, attribs
= None):
285 @param qname: Tuple of (uri, name)
286 @param defaultUri: The default URI of the element; defaults to the URI specified in L{qname}
287 @param attribs: Dictionary of attributes
289 self
.uri
, self
.name
= qname
290 self
.defaultUri
= defaultUri
or self
.uri
291 self
.attributes
= attribs
or {}
295 def __getattr__(self
, key
):
296 # Check child list for first Element with a name matching the key
297 for n
in self
.children
:
298 if n
.__class
__ == Element
and n
.name
== key
:
301 # Tweak the behaviour so that it's more friendly about not
302 # finding elements -- we need to document this somewhere :)
def __getitem__(self, key):
    """Return the value of attribute C{key}.

    @param key: Attribute key; passed through C{self._dqa} before lookup
    @raise KeyError: If no such attribute is set
    """
    return self.attributes[self._dqa(key)]
def __delitem__(self, key):
    """Remove attribute C{key}.

    @param key: Attribute key; passed through C{self._dqa} before lookup
    @raise KeyError: If no such attribute is set
    """
    del self.attributes[self._dqa(key)]
def __setitem__(self, key, value):
    """Set attribute C{key} to C{value}.

    @param key: Attribute key; passed through C{self._dqa} before storing
    @param value: New attribute value
    """
    self.attributes[self._dqa(key)] = value
315 """ Retrieve the first CData (content) node
317 for n
in self
.children
:
318 if isinstance(n
, types
.StringTypes
): return n
def _dqa(self, attr):
    """Dequalify an attribute key as needed

    @param attr: Attribute key, either a plain name or a C{(uri, name)} tuple
    @return: The bare name when C{attr} is a tuple whose URI equals
             C{self.uri}; otherwise C{attr} unchanged.
    """
    # NOTE(review): the two return statements were not visible in the
    # reviewed text and are reconstructed from the docstring - confirm.
    if isinstance(attr, tuple) and attr[0] == self.uri:
        return attr[1]
    return attr
def getAttribute(self, attribname, default = None):
    """Retrieve the value of attribname, if it exists

    The key is looked up as given first; if that fails, its dequalified
    form (see C{self._dqa}) is tried.

    @param attribname: Attribute key, plain name or C{(uri, name)} tuple
    @param default: Value returned when the attribute is not set
    """
    attributes = self.attributes
    if attribname in attributes:
        return attributes[attribname]
    return attributes.get(self._dqa(attribname), default)
def hasAttribute(self, attrib):
    """Determine if the specified attribute exists

    @param attrib: Attribute key; passed through C{self._dqa} before lookup
    @return: True when the attribute is set
    """
    # Membership test instead of dict.has_key (removed in Python 3)
    return self._dqa(attrib) in self.attributes
def compareAttribute(self, attrib, value):
    """Safely compare the value of an attribute against a provided value; None-safe.

    @param attrib: Attribute key; passed through C{self._dqa} before lookup
    @param value: Value to compare against
    @return: Result of C{==} between the stored value and C{value}; a
             missing attribute compares as C{None}.
    """
    current = self.attributes.get(self._dqa(attrib))
    return current == value
340 def swapAttributeValues(self
, left
, right
):
341 """Swap the values of two attribute"""
def addChild(self, node):
    """Add a child to this Element

    @param node: Child node to append to C{self.children}
    @return: The node just appended
    """
    if node.__class__ == Element:
        # NOTE(review): the statement under this check was not visible in
        # the reviewed text; linking the child to its parent is assumed.
        node.parent = self
    self.children.append(node)
    return self.children[-1]
354 def addContent(self
, text
):
355 """Add some text data to this element"""
357 if len(c
) > 0 and isinstance(c
[-1], types
.StringTypes
):
363 def addElement(self
, name
, defaultUri
= None, content
= None):
364 """Add a new child Element to this Element; preferred method
367 if isinstance(name
, type(())):
368 defaultUri
= defaultUri
or name
[0]
369 self
.children
.append(Element(name
, defaultUri
))
371 defaultUri
= defaultUri
or self
.defaultUri
372 self
.children
.append(Element((self
.uri
, name
), defaultUri
))
374 result
= self
.children
[-1]
378 result
.children
.append(content
)
def addRawXml(self, rawxmlstring):
    """Add a pre-serialized chunk o' XML as a child of this Element.

    @param rawxmlstring: XML text; wrapped in C{SerializedXML} and
                         appended to C{self.children}
    """
    self.children.append(SerializedXML(rawxmlstring))
def addUniqueId(self):
    """Add a unique (across a given Python session) id attribute to this Element"""
    # The counter lives on the Element class, so ids are shared by all
    # instances: "H_<n>", with n increasing by one per call.
    self.attributes["id"] = "H_%d" % (Element._idCounter,)
    Element._idCounter += 1
393 """Iterate across all children of this Element that are Elements"""
394 return generateOnlyKlass(self
.children
, Element
)
396 def toXml(self
, prefixes
= None, closeElement
= 1):
397 """Serialize this Element and all children to a string """
398 s
= SerializerClass(prefixes
)
399 s
.serialize(self
, closeElement
)
def firstChildElement(self):
    """Return the first child whose class is exactly C{Element}.

    @return: That child, or C{None} when there is none
    """
    # NOTE(review): the return statements were not visible in the
    # reviewed text; behaviour inferred from the method name - confirm.
    for c in self.children:
        if c.__class__ == Element:
            return c
    return None
def getElement(self, tagName):
    """Return the first item from C{self.elements()} named C{tagName}.

    @param tagName: Name to compare against each item's C{name}
    @return: The first match, or C{None} when nothing matches
    """
    # NOTE(review): the return statements were not visible in the
    # reviewed text; behaviour inferred from the method name - confirm.
    for child in self.elements():
        if child.name == tagName:
            return child
    return None
class ParserError(Exception):
    """ Exception thrown when a parsing error occurs """
419 """ Preferred method to construct an ElementStream
421 Uses Expat-based stream if available, and falls back to Sux if necessary.
424 es
= ExpatElementStream()
427 es
= SuxElementStream()
430 from twisted
.protocols
import sux
431 class SuxElementStream(sux
.XMLParser
):
433 self
.connectionMade()
434 self
.DocumentStartEvent
= None
435 self
.ElementEvent
= None
436 self
.DocumentEndEvent
= None
439 self
.documentStarted
= False
440 self
.defaultNsStack
= []
441 self
.prefixStack
= []
442 self
.parse
= self
.dataReceived
def findUri(self, prefix):
    """Resolve a namespace prefix against the stack of declarations.

    @param prefix: Namespace prefix to resolve
    @return: The URI from the most recently pushed entry of
             C{self.prefixStack} that contains C{prefix}, or C{None}
             when no entry does.
    """
    stack = self.prefixStack
    # Walk from the newest entry to the oldest so that later pushes
    # shadow earlier ones.
    for i in range(len(stack) - 1, -1, -1):
        if prefix in stack[i]:
            return stack[i][prefix]
    return None
453 def gotTagStart(self
, name
, attributes
):
459 # Pass 1 - Identify namespace decls
460 for k
, v
in attributes
.items():
461 if k
.startswith("xmlns"):
462 x
, p
= _splitPrefix(k
)
463 if (x
== None): # I.e. default declaration
469 # Push namespace decls onto prefix stack
470 self
.prefixStack
.append(localPrefixes
)
472 # Determine default namespace for this element; if there
474 if defaultUri
== None and len(self
.defaultNsStack
) > 0:
475 defaultUri
= self
.defaultNsStack
[-1]
478 prefix
, name
= _splitPrefix(name
)
479 if prefix
== None: # This element is in the default namespace
482 # Find the URI for the prefix
483 uri
= self
.findUri(prefix
)
485 # Pass 2 - Fix up and escape attributes
486 for k
, v
in attributes
.items():
487 p
, n
= _splitPrefix(k
)
491 attribs
[(self
.findUri(p
)), n
] = unescapeFromXml(v
)
493 # Construct the actual Element object
494 e
= Element((uri
, name
), defaultUri
, attribs
)
496 # Save current default namespace
497 self
.defaultNsStack
.append(defaultUri
)
499 # Document already started
500 if self
.documentStarted
:
501 # Starting a new packet
502 if self
.currElem
== None:
504 # Adding to existing element
506 self
.currElem
= self
.currElem
.addChild(e
)
510 self
.documentStarted
= True
511 self
.DocumentStartEvent(e
)
def gotText(self, data):
    """Append text to the element currently being built.

    @param data: Text passed in by the parser; dropped when
                 C{self.currElem} is C{None}
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def gotCData(self, data):
    """Append CDATA content to the element currently being built.

    @param data: Content passed in by the parser; dropped when
                 C{self.currElem} is C{None}
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def gotComment(self, data):
    """Parser callback for comments.

    @param data: Comment text (unused)
    """
    # Ignore comments for the moment
    pass
525 entities
= { "amp" : "&",
def gotEntityReference(self, entityRef):
    """Add the replacement text of a known entity to the current element.

    Entities missing from C{SuxElementStream.entities} are ignored, and
    so is any reference seen while C{self.currElem} is C{None}.

    @param entityRef: Entity name as passed in by the parser
    """
    # Same guard as the text callbacks: without a current element there
    # is nowhere to put the content.
    if self.currElem is None:
        return
    # If this is an entity we know about, add it as content
    # to the current element
    if entityRef in SuxElementStream.entities:
        self.currElem.addContent(SuxElementStream.entities[entityRef])
537 def gotTagEnd(self
, name
):
538 # Ensure the document hasn't already ended
539 if self
.rootElem
== None:
540 # XXX: Write more legible explanation
541 raise ParserError
, "Element closed after end of document."
544 prefix
, name
= _splitPrefix(name
)
546 uri
= self
.defaultNsStack
[-1]
548 uri
= self
.findUri(prefix
)
551 if self
.currElem
== None:
552 # Ensure element name and uri matches
553 if self
.rootElem
.name
!= name
or self
.rootElem
.uri
!= uri
:
554 raise ParserError
, "Mismatched root elements"
555 self
.DocumentEndEvent()
560 # Ensure the tag being closed matches the name of the current
562 if self
.currElem
.name
!= name
or self
.currElem
.uri
!= uri
:
563 # XXX: Write more legible explanation
564 raise ParserError
, "Malformed element close"
566 # Pop prefix and default NS stack
567 self
.prefixStack
.pop()
568 self
.defaultNsStack
.pop()
570 # Check for a null parent of the current elem;
571 # that's the top of the stack
572 if self
.currElem
.parent
== None:
573 self
.ElementEvent(self
.currElem
)
576 # Anything else is just some element wrapping up
578 self
.currElem
= self
.currElem
.parent
581 class ExpatElementStream
:
584 self
.DocumentStartEvent
= None
585 self
.ElementEvent
= None
586 self
.DocumentEndEvent
= None
587 self
.parser
= pyexpat
.ParserCreate("UTF-8", " ")
588 self
.parser
.StartElementHandler
= self
._onStartElement
589 self
.parser
.EndElementHandler
= self
._onEndElement
590 self
.parser
.CharacterDataHandler
= self
._onCdata
591 self
.parser
.StartNamespaceDeclHandler
= self
._onStartNamespace
592 self
.parser
.EndNamespaceDeclHandler
= self
._onEndNamespace
594 self
.defaultNsStack
= []
595 self
.documentStarted
= 0
def parse(self, buffer):
    """Feed a chunk of data to the underlying parser.

    @param buffer: Data handed unchanged to C{self.parser.Parse}
    """
    self.parser.Parse(buffer)
600 def _onStartElement(self
, name
, attrs
):
601 # Generate a qname tuple from the provided name
602 qname
= name
.split(" ")
605 for k
, v
in attrs
.items():
606 if k
.find(" ") != -1:
607 # attrs[k.split(" ")] = v
608 aqname
= k
.split(" ")
609 attrs
[(aqname
[0], aqname
[1])] = v
612 # Construct the new element
613 e
= Element(qname
, self
.defaultNsStack
[-1], attrs
)
615 # Document already started
616 if self
.documentStarted
== 1:
617 if self
.currElem
!= None:
618 self
.currElem
.children
.append(e
)
619 e
.parent
= self
.currElem
624 self
.documentStarted
= 1
625 self
.DocumentStartEvent(e
)
627 def _onEndElement(self
, _
):
628 # Check for null current elem; end of doc
629 if self
.currElem
== None:
630 self
.DocumentEndEvent()
632 # Check for parent that is None; that's
633 # the top of the stack
634 elif self
.currElem
.parent
== None:
635 self
.ElementEvent(self
.currElem
)
638 # Anything else is just some element in the current
641 self
.currElem
= self
.currElem
.parent
def _onCdata(self, data):
    """Character-data handler: append text to the current element.

    @param data: Text passed in by the parser; dropped when
                 C{self.currElem} is C{None}
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def _onStartNamespace(self, prefix, uri):
    """Namespace-declaration start handler.

    @param prefix: Declared prefix
    @param uri: Namespace URI being declared
    """
    # If this is the default namespace, put it on the stack.
    # NOTE(review): the condition line was not visible in the reviewed
    # text; "no prefix means default namespace" is assumed - confirm.
    if prefix is None:
        self.defaultNsStack.append(uri)
def _onEndNamespace(self, prefix):
    """Namespace-declaration end handler.

    @param prefix: Prefix whose declaration goes out of scope
    """
    # Remove last element on the stack.
    # NOTE(review): the condition line was not visible in the reviewed
    # text; popping only for default-namespace declarations (no prefix)
    # is assumed - confirm.
    if prefix is None:
        self.defaultNsStack.pop()
665 def parseFile(filename
):
667 t
.parseFile(filename
)
671 """ Taken from http://xoomer.virgilio.it/dialtone/rsschannel.py """
def parseFile(self, filename):
    """Read a whole file and parse its contents.

    @param filename: Path of the file to read
    @return: Whatever C{self.parseString} returns for the file's contents
    """
    # open() instead of the Python-2-only file() builtin, and an explicit
    # close so the handle is released even if the read fails.
    f = open(filename)
    try:
        data = f.read()
    finally:
        f.close()
    return self.parseString(data)
679 def parseString(self
, data
):
680 es
= SuxElementStream()
681 es
.DocumentStartEvent
= self
.docStart
682 es
.DocumentEndEvent
= self
.docEnd
683 es
.ElementEvent
= self
.element
687 def docStart(self
, e
):
def element(self, e):
    """Element callback: attach a completed element to the document root.

    @param e: Element to add via C{self.root.addChild}
    """
    self.root.addChild(e)
697 ## class FileParser(ElementStream):
698 ## def __init__(self):
699 ## ElementStream.__init__(self)
700 ## self.DocumentStartEvent = self.docStart
701 ## self.ElementEvent = self.elem
702 ## self.DocumentEndEvent = self.docEnd
705 ## def docStart(self, elem):
706 ## self.document = elem
708 ## def elem(self, elem):
709 ## self.document.addChild(elem)
714 ## def parse(self, filename):
715 ## for l in open(filename).readlines():
716 ## self.parser.Parse(l)
717 ## assert self.done == 1
718 ## return self.document
720 ## def parseFile(filename):
721 ## return FileParser().parse(filename)