lxmletree.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. from lxml import etree
  4. from ..treebuilders.etree import tag_regexp
  5. from gettext import gettext
  6. _ = gettext
  7. from . import _base
  8. from .. import ihatexml
  9. def ensure_str(s):
  10. if s is None:
  11. return None
  12. elif isinstance(s, text_type):
  13. return s
  14. else:
  15. return s.decode("utf-8", "strict")
  16. class Root(object):
  17. def __init__(self, et):
  18. self.elementtree = et
  19. self.children = []
  20. if et.docinfo.internalDTD:
  21. self.children.append(Doctype(self,
  22. ensure_str(et.docinfo.root_name),
  23. ensure_str(et.docinfo.public_id),
  24. ensure_str(et.docinfo.system_url)))
  25. root = et.getroot()
  26. node = root
  27. while node.getprevious() is not None:
  28. node = node.getprevious()
  29. while node is not None:
  30. self.children.append(node)
  31. node = node.getnext()
  32. self.text = None
  33. self.tail = None
  34. def __getitem__(self, key):
  35. return self.children[key]
  36. def getnext(self):
  37. return None
  38. def __len__(self):
  39. return 1
  40. class Doctype(object):
  41. def __init__(self, root_node, name, public_id, system_id):
  42. self.root_node = root_node
  43. self.name = name
  44. self.public_id = public_id
  45. self.system_id = system_id
  46. self.text = None
  47. self.tail = None
  48. def getnext(self):
  49. return self.root_node.children[1]
  50. class FragmentRoot(Root):
  51. def __init__(self, children):
  52. self.children = [FragmentWrapper(self, child) for child in children]
  53. self.text = self.tail = None
  54. def getnext(self):
  55. return None
  56. class FragmentWrapper(object):
  57. def __init__(self, fragment_root, obj):
  58. self.root_node = fragment_root
  59. self.obj = obj
  60. if hasattr(self.obj, 'text'):
  61. self.text = ensure_str(self.obj.text)
  62. else:
  63. self.text = None
  64. if hasattr(self.obj, 'tail'):
  65. self.tail = ensure_str(self.obj.tail)
  66. else:
  67. self.tail = None
  68. def __getattr__(self, name):
  69. return getattr(self.obj, name)
  70. def getnext(self):
  71. siblings = self.root_node.children
  72. idx = siblings.index(self)
  73. if idx < len(siblings) - 1:
  74. return siblings[idx + 1]
  75. else:
  76. return None
  77. def __getitem__(self, key):
  78. return self.obj[key]
  79. def __bool__(self):
  80. return bool(self.obj)
  81. def getparent(self):
  82. return None
  83. def __str__(self):
  84. return str(self.obj)
  85. def __unicode__(self):
  86. return str(self.obj)
  87. def __len__(self):
  88. return len(self.obj)
  89. class TreeWalker(_base.NonRecursiveTreeWalker):
  90. def __init__(self, tree):
  91. if hasattr(tree, "getroot"):
  92. tree = Root(tree)
  93. elif isinstance(tree, list):
  94. tree = FragmentRoot(tree)
  95. _base.NonRecursiveTreeWalker.__init__(self, tree)
  96. self.filter = ihatexml.InfosetFilter()
  97. def getNodeDetails(self, node):
  98. if isinstance(node, tuple): # Text node
  99. node, key = node
  100. assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
  101. return _base.TEXT, ensure_str(getattr(node, key))
  102. elif isinstance(node, Root):
  103. return (_base.DOCUMENT,)
  104. elif isinstance(node, Doctype):
  105. return _base.DOCTYPE, node.name, node.public_id, node.system_id
  106. elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
  107. return _base.TEXT, node.obj
  108. elif node.tag == etree.Comment:
  109. return _base.COMMENT, ensure_str(node.text)
  110. elif node.tag == etree.Entity:
  111. return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
  112. else:
  113. # This is assumed to be an ordinary element
  114. match = tag_regexp.match(ensure_str(node.tag))
  115. if match:
  116. namespace, tag = match.groups()
  117. else:
  118. namespace = None
  119. tag = ensure_str(node.tag)
  120. attrs = {}
  121. for name, value in list(node.attrib.items()):
  122. name = ensure_str(name)
  123. value = ensure_str(value)
  124. match = tag_regexp.match(name)
  125. if match:
  126. attrs[(match.group(1), match.group(2))] = value
  127. else:
  128. attrs[(None, name)] = value
  129. return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
  130. attrs, len(node) > 0 or node.text)
  131. def getFirstChild(self, node):
  132. assert not isinstance(node, tuple), _("Text nodes have no children")
  133. assert len(node) or node.text, "Node has no children"
  134. if node.text:
  135. return (node, "text")
  136. else:
  137. return node[0]
  138. def getNextSibling(self, node):
  139. if isinstance(node, tuple): # Text node
  140. node, key = node
  141. assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
  142. if key == "text":
  143. # XXX: we cannot use a "bool(node) and node[0] or None" construct here
  144. # because node[0] might evaluate to False if it has no child element
  145. if len(node):
  146. return node[0]
  147. else:
  148. return None
  149. else: # tail
  150. return node.getnext()
  151. return (node, "tail") if node.tail else node.getnext()
  152. def getParentNode(self, node):
  153. if isinstance(node, tuple): # Text node
  154. node, key = node
  155. assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
  156. if key == "text":
  157. return node
  158. # else: fallback to "normal" processing
  159. return node.getparent()