htmlserializer.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. import gettext
  4. _ = gettext.gettext
  5. try:
  6. from functools import reduce
  7. except ImportError:
  8. pass
  9. from ..constants import voidElements, booleanAttributes, spaceCharacters
  10. from ..constants import rcdataElements, entities, xmlEntities
  11. from .. import utils
  12. from xml.sax.saxutils import escape
  13. spaceCharacters = "".join(spaceCharacters)
  14. try:
  15. from codecs import register_error, xmlcharrefreplace_errors
  16. except ImportError:
  17. unicode_encode_errors = "strict"
  18. else:
  19. unicode_encode_errors = "htmlentityreplace"
  20. encode_entity_map = {}
  21. is_ucs4 = len("\U0010FFFF") == 1
  22. for k, v in list(entities.items()):
  23. # skip multi-character entities
  24. if ((is_ucs4 and len(v) > 1) or
  25. (not is_ucs4 and len(v) > 2)):
  26. continue
  27. if v != "&":
  28. if len(v) == 2:
  29. v = utils.surrogatePairToCodepoint(v)
  30. else:
  31. v = ord(v)
  32. if not v in encode_entity_map or k.islower():
  33. # prefer < over < and similarly for &, >, etc.
  34. encode_entity_map[v] = k
  35. def htmlentityreplace_errors(exc):
  36. if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
  37. res = []
  38. codepoints = []
  39. skip = False
  40. for i, c in enumerate(exc.object[exc.start:exc.end]):
  41. if skip:
  42. skip = False
  43. continue
  44. index = i + exc.start
  45. if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
  46. codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
  47. skip = True
  48. else:
  49. codepoint = ord(c)
  50. codepoints.append(codepoint)
  51. for cp in codepoints:
  52. e = encode_entity_map.get(cp)
  53. if e:
  54. res.append("&")
  55. res.append(e)
  56. if not e.endswith(";"):
  57. res.append(";")
  58. else:
  59. res.append("&#x%s;" % (hex(cp)[2:]))
  60. return ("".join(res), exc.end)
  61. else:
  62. return xmlcharrefreplace_errors(exc)
  63. register_error(unicode_encode_errors, htmlentityreplace_errors)
  64. del register_error
  65. class HTMLSerializer(object):
  66. # attribute quoting options
  67. quote_attr_values = False
  68. quote_char = '"'
  69. use_best_quote_char = True
  70. # tag syntax options
  71. omit_optional_tags = True
  72. minimize_boolean_attributes = True
  73. use_trailing_solidus = False
  74. space_before_trailing_solidus = True
  75. # escaping options
  76. escape_lt_in_attrs = False
  77. escape_rcdata = False
  78. resolve_entities = True
  79. # miscellaneous options
  80. alphabetical_attributes = False
  81. inject_meta_charset = True
  82. strip_whitespace = False
  83. sanitize = False
  84. options = ("quote_attr_values", "quote_char", "use_best_quote_char",
  85. "omit_optional_tags", "minimize_boolean_attributes",
  86. "use_trailing_solidus", "space_before_trailing_solidus",
  87. "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
  88. "alphabetical_attributes", "inject_meta_charset",
  89. "strip_whitespace", "sanitize")
  90. def __init__(self, **kwargs):
  91. """Initialize HTMLSerializer.
  92. Keyword options (default given first unless specified) include:
  93. inject_meta_charset=True|False
  94. Whether it insert a meta element to define the character set of the
  95. document.
  96. quote_attr_values=True|False
  97. Whether to quote attribute values that don't require quoting
  98. per HTML5 parsing rules.
  99. quote_char=u'"'|u"'"
  100. Use given quote character for attribute quoting. Default is to
  101. use double quote unless attribute value contains a double quote,
  102. in which case single quotes are used instead.
  103. escape_lt_in_attrs=False|True
  104. Whether to escape < in attribute values.
  105. escape_rcdata=False|True
  106. Whether to escape characters that need to be escaped within normal
  107. elements within rcdata elements such as style.
  108. resolve_entities=True|False
  109. Whether to resolve named character entities that appear in the
  110. source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
  111. are unaffected by this setting.
  112. strip_whitespace=False|True
  113. Whether to remove semantically meaningless whitespace. (This
  114. compresses all whitespace to a single space except within pre.)
  115. minimize_boolean_attributes=True|False
  116. Shortens boolean attributes to give just the attribute value,
  117. for example <input disabled="disabled"> becomes <input disabled>.
  118. use_trailing_solidus=False|True
  119. Includes a close-tag slash at the end of the start tag of void
  120. elements (empty elements whose end tag is forbidden). E.g. <hr/>.
  121. space_before_trailing_solidus=True|False
  122. Places a space immediately before the closing slash in a tag
  123. using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
  124. sanitize=False|True
  125. Strip all unsafe or unknown constructs from output.
  126. See `html5lib user documentation`_
  127. omit_optional_tags=True|False
  128. Omit start/end tags that are optional.
  129. alphabetical_attributes=False|True
  130. Reorder attributes to be in alphabetical order.
  131. .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
  132. """
  133. if 'quote_char' in kwargs:
  134. self.use_best_quote_char = False
  135. for attr in self.options:
  136. setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
  137. self.errors = []
  138. self.strict = False
  139. def encode(self, string):
  140. assert(isinstance(string, text_type))
  141. if self.encoding:
  142. return string.encode(self.encoding, unicode_encode_errors)
  143. else:
  144. return string
  145. def encodeStrict(self, string):
  146. assert(isinstance(string, text_type))
  147. if self.encoding:
  148. return string.encode(self.encoding, "strict")
  149. else:
  150. return string
  151. def serialize(self, treewalker, encoding=None):
  152. self.encoding = encoding
  153. in_cdata = False
  154. self.errors = []
  155. if encoding and self.inject_meta_charset:
  156. from ..filters.inject_meta_charset import Filter
  157. treewalker = Filter(treewalker, encoding)
  158. # WhitespaceFilter should be used before OptionalTagFilter
  159. # for maximum efficiently of this latter filter
  160. if self.strip_whitespace:
  161. from ..filters.whitespace import Filter
  162. treewalker = Filter(treewalker)
  163. if self.sanitize:
  164. from ..filters.sanitizer import Filter
  165. treewalker = Filter(treewalker)
  166. if self.omit_optional_tags:
  167. from ..filters.optionaltags import Filter
  168. treewalker = Filter(treewalker)
  169. # Alphabetical attributes must be last, as other filters
  170. # could add attributes and alter the order
  171. if self.alphabetical_attributes:
  172. from ..filters.alphabeticalattributes import Filter
  173. treewalker = Filter(treewalker)
  174. for token in treewalker:
  175. type = token["type"]
  176. if type == "Doctype":
  177. doctype = "<!DOCTYPE %s" % token["name"]
  178. if token["publicId"]:
  179. doctype += ' PUBLIC "%s"' % token["publicId"]
  180. elif token["systemId"]:
  181. doctype += " SYSTEM"
  182. if token["systemId"]:
  183. if token["systemId"].find('"') >= 0:
  184. if token["systemId"].find("'") >= 0:
  185. self.serializeError(_("System identifer contains both single and double quote characters"))
  186. quote_char = "'"
  187. else:
  188. quote_char = '"'
  189. doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
  190. doctype += ">"
  191. yield self.encodeStrict(doctype)
  192. elif type in ("Characters", "SpaceCharacters"):
  193. if type == "SpaceCharacters" or in_cdata:
  194. if in_cdata and token["data"].find("</") >= 0:
  195. self.serializeError(_("Unexpected </ in CDATA"))
  196. yield self.encode(token["data"])
  197. else:
  198. yield self.encode(escape(token["data"]))
  199. elif type in ("StartTag", "EmptyTag"):
  200. name = token["name"]
  201. yield self.encodeStrict("<%s" % name)
  202. if name in rcdataElements and not self.escape_rcdata:
  203. in_cdata = True
  204. elif in_cdata:
  205. self.serializeError(_("Unexpected child element of a CDATA element"))
  206. for (attr_namespace, attr_name), attr_value in token["data"].items():
  207. # TODO: Add namespace support here
  208. k = attr_name
  209. v = attr_value
  210. yield self.encodeStrict(' ')
  211. yield self.encodeStrict(k)
  212. if not self.minimize_boolean_attributes or \
  213. (k not in booleanAttributes.get(name, tuple())
  214. and k not in booleanAttributes.get("", tuple())):
  215. yield self.encodeStrict("=")
  216. if self.quote_attr_values or not v:
  217. quote_attr = True
  218. else:
  219. quote_attr = reduce(lambda x, y: x or (y in v),
  220. spaceCharacters + ">\"'=", False)
  221. v = v.replace("&", "&amp;")
  222. if self.escape_lt_in_attrs:
  223. v = v.replace("<", "&lt;")
  224. if quote_attr:
  225. quote_char = self.quote_char
  226. if self.use_best_quote_char:
  227. if "'" in v and '"' not in v:
  228. quote_char = '"'
  229. elif '"' in v and "'" not in v:
  230. quote_char = "'"
  231. if quote_char == "'":
  232. v = v.replace("'", "&#39;")
  233. else:
  234. v = v.replace('"', "&quot;")
  235. yield self.encodeStrict(quote_char)
  236. yield self.encode(v)
  237. yield self.encodeStrict(quote_char)
  238. else:
  239. yield self.encode(v)
  240. if name in voidElements and self.use_trailing_solidus:
  241. if self.space_before_trailing_solidus:
  242. yield self.encodeStrict(" /")
  243. else:
  244. yield self.encodeStrict("/")
  245. yield self.encode(">")
  246. elif type == "EndTag":
  247. name = token["name"]
  248. if name in rcdataElements:
  249. in_cdata = False
  250. elif in_cdata:
  251. self.serializeError(_("Unexpected child element of a CDATA element"))
  252. yield self.encodeStrict("</%s>" % name)
  253. elif type == "Comment":
  254. data = token["data"]
  255. if data.find("--") >= 0:
  256. self.serializeError(_("Comment contains --"))
  257. yield self.encodeStrict("<!--%s-->" % token["data"])
  258. elif type == "Entity":
  259. name = token["name"]
  260. key = name + ";"
  261. if not key in entities:
  262. self.serializeError(_("Entity %s not recognized" % name))
  263. if self.resolve_entities and key not in xmlEntities:
  264. data = entities[key]
  265. else:
  266. data = "&%s;" % name
  267. yield self.encodeStrict(data)
  268. else:
  269. self.serializeError(token["data"])
  270. def render(self, treewalker, encoding=None):
  271. if encoding:
  272. return b"".join(list(self.serialize(treewalker, encoding)))
  273. else:
  274. return "".join(list(self.serialize(treewalker)))
  275. def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
  276. # XXX The idea is to make data mandatory.
  277. self.errors.append(data)
  278. if self.strict:
  279. raise SerializeError
  280. def SerializeError(Exception):
  281. """Error in serialized tree"""
  282. pass