html5parser.py 117 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import with_metaclass
  3. import types
  4. from . import inputstream
  5. from . import tokenizer
  6. from . import treebuilders
  7. from .treebuilders._base import Marker
  8. from . import utils
  9. from . import constants
  10. from .constants import spaceCharacters, asciiUpper2Lower
  11. from .constants import specialElements
  12. from .constants import headingElements
  13. from .constants import cdataElements, rcdataElements
  14. from .constants import tokenTypes, ReparseException, namespaces
  15. from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
  16. from .constants import adjustForeignAttributes as adjustForeignAttributesMap
  17. def parse(doc, treebuilder="etree", encoding=None,
  18. namespaceHTMLElements=True):
  19. """Parse a string or file-like object into a tree"""
  20. tb = treebuilders.getTreeBuilder(treebuilder)
  21. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  22. return p.parse(doc, encoding=encoding)
  23. def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
  24. namespaceHTMLElements=True):
  25. tb = treebuilders.getTreeBuilder(treebuilder)
  26. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  27. return p.parseFragment(doc, container=container, encoding=encoding)
  28. def method_decorator_metaclass(function):
  29. class Decorated(type):
  30. def __new__(meta, classname, bases, classDict):
  31. for attributeName, attribute in classDict.items():
  32. if isinstance(attribute, types.FunctionType):
  33. attribute = function(attribute)
  34. classDict[attributeName] = attribute
  35. return type.__new__(meta, classname, bases, classDict)
  36. return Decorated
  37. class HTMLParser(object):
  38. """HTML parser. Generates a tree structure from a stream of (possibly
  39. malformed) HTML"""
  40. def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
  41. strict=False, namespaceHTMLElements=True, debug=False):
  42. """
  43. strict - raise an exception when a parse error is encountered
  44. tree - a treebuilder class controlling the type of tree that will be
  45. returned. Built in treebuilders can be accessed through
  46. html5lib.treebuilders.getTreeBuilder(treeType)
  47. tokenizer - a class that provides a stream of tokens to the treebuilder.
  48. This may be replaced for e.g. a sanitizer which converts some tags to
  49. text
  50. """
  51. # Raise an exception on the first error encountered
  52. self.strict = strict
  53. if tree is None:
  54. tree = treebuilders.getTreeBuilder("etree")
  55. self.tree = tree(namespaceHTMLElements)
  56. self.tokenizer_class = tokenizer
  57. self.errors = []
  58. self.phases = dict([(name, cls(self, self.tree)) for name, cls in
  59. getPhases(debug).items()])
  60. def _parse(self, stream, innerHTML=False, container="div",
  61. encoding=None, parseMeta=True, useChardet=True, **kwargs):
  62. self.innerHTMLMode = innerHTML
  63. self.container = container
  64. self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
  65. parseMeta=parseMeta,
  66. useChardet=useChardet,
  67. parser=self, **kwargs)
  68. self.reset()
  69. while True:
  70. try:
  71. self.mainLoop()
  72. break
  73. except ReparseException:
  74. self.reset()
  75. def reset(self):
  76. self.tree.reset()
  77. self.firstStartTag = False
  78. self.errors = []
  79. self.log = [] # only used with debug mode
  80. # "quirks" / "limited quirks" / "no quirks"
  81. self.compatMode = "no quirks"
  82. if self.innerHTMLMode:
  83. self.innerHTML = self.container.lower()
  84. if self.innerHTML in cdataElements:
  85. self.tokenizer.state = self.tokenizer.rcdataState
  86. elif self.innerHTML in rcdataElements:
  87. self.tokenizer.state = self.tokenizer.rawtextState
  88. elif self.innerHTML == 'plaintext':
  89. self.tokenizer.state = self.tokenizer.plaintextState
  90. else:
  91. # state already is data state
  92. # self.tokenizer.state = self.tokenizer.dataState
  93. pass
  94. self.phase = self.phases["beforeHtml"]
  95. self.phase.insertHtmlElement()
  96. self.resetInsertionMode()
  97. else:
  98. self.innerHTML = False
  99. self.phase = self.phases["initial"]
  100. self.lastPhase = None
  101. self.beforeRCDataPhase = None
  102. self.framesetOK = True
  103. def isHTMLIntegrationPoint(self, element):
  104. if (element.name == "annotation-xml" and
  105. element.namespace == namespaces["mathml"]):
  106. return ("encoding" in element.attributes and
  107. element.attributes["encoding"].translate(
  108. asciiUpper2Lower) in
  109. ("text/html", "application/xhtml+xml"))
  110. else:
  111. return (element.namespace, element.name) in htmlIntegrationPointElements
  112. def isMathMLTextIntegrationPoint(self, element):
  113. return (element.namespace, element.name) in mathmlTextIntegrationPointElements
  114. def mainLoop(self):
  115. CharactersToken = tokenTypes["Characters"]
  116. SpaceCharactersToken = tokenTypes["SpaceCharacters"]
  117. StartTagToken = tokenTypes["StartTag"]
  118. EndTagToken = tokenTypes["EndTag"]
  119. CommentToken = tokenTypes["Comment"]
  120. DoctypeToken = tokenTypes["Doctype"]
  121. ParseErrorToken = tokenTypes["ParseError"]
  122. for token in self.normalizedTokens():
  123. new_token = token
  124. while new_token is not None:
  125. currentNode = self.tree.openElements[-1] if self.tree.openElements else None
  126. currentNodeNamespace = currentNode.namespace if currentNode else None
  127. currentNodeName = currentNode.name if currentNode else None
  128. type = new_token["type"]
  129. if type == ParseErrorToken:
  130. self.parseError(new_token["data"], new_token.get("datavars", {}))
  131. new_token = None
  132. else:
  133. if (len(self.tree.openElements) == 0 or
  134. currentNodeNamespace == self.tree.defaultNamespace or
  135. (self.isMathMLTextIntegrationPoint(currentNode) and
  136. ((type == StartTagToken and
  137. token["name"] not in frozenset(["mglyph", "malignmark"])) or
  138. type in (CharactersToken, SpaceCharactersToken))) or
  139. (currentNodeNamespace == namespaces["mathml"] and
  140. currentNodeName == "annotation-xml" and
  141. token["name"] == "svg") or
  142. (self.isHTMLIntegrationPoint(currentNode) and
  143. type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
  144. phase = self.phase
  145. else:
  146. phase = self.phases["inForeignContent"]
  147. if type == CharactersToken:
  148. new_token = phase.processCharacters(new_token)
  149. elif type == SpaceCharactersToken:
  150. new_token = phase.processSpaceCharacters(new_token)
  151. elif type == StartTagToken:
  152. new_token = phase.processStartTag(new_token)
  153. elif type == EndTagToken:
  154. new_token = phase.processEndTag(new_token)
  155. elif type == CommentToken:
  156. new_token = phase.processComment(new_token)
  157. elif type == DoctypeToken:
  158. new_token = phase.processDoctype(new_token)
  159. if (type == StartTagToken and token["selfClosing"]
  160. and not token["selfClosingAcknowledged"]):
  161. self.parseError("non-void-element-with-trailing-solidus",
  162. {"name": token["name"]})
  163. # When the loop finishes it's EOF
  164. reprocess = True
  165. phases = []
  166. while reprocess:
  167. phases.append(self.phase)
  168. reprocess = self.phase.processEOF()
  169. if reprocess:
  170. assert self.phase not in phases
  171. def normalizedTokens(self):
  172. for token in self.tokenizer:
  173. yield self.normalizeToken(token)
  174. def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
  175. """Parse a HTML document into a well-formed tree
  176. stream - a filelike object or string containing the HTML to be parsed
  177. The optional encoding parameter must be a string that indicates
  178. the encoding. If specified, that encoding will be used,
  179. regardless of any BOM or later declaration (such as in a meta
  180. element)
  181. """
  182. self._parse(stream, innerHTML=False, encoding=encoding,
  183. parseMeta=parseMeta, useChardet=useChardet)
  184. return self.tree.getDocument()
  185. def parseFragment(self, stream, container="div", encoding=None,
  186. parseMeta=False, useChardet=True):
  187. """Parse a HTML fragment into a well-formed tree fragment
  188. container - name of the element we're setting the innerHTML property
  189. if set to None, default to 'div'
  190. stream - a filelike object or string containing the HTML to be parsed
  191. The optional encoding parameter must be a string that indicates
  192. the encoding. If specified, that encoding will be used,
  193. regardless of any BOM or later declaration (such as in a meta
  194. element)
  195. """
  196. self._parse(stream, True, container=container, encoding=encoding)
  197. return self.tree.getFragment()
  198. def parseError(self, errorcode="XXX-undefined-error", datavars={}):
  199. # XXX The idea is to make errorcode mandatory.
  200. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
  201. if self.strict:
  202. raise ParseError
  203. def normalizeToken(self, token):
  204. """ HTML5 specific normalizations to the token stream """
  205. if token["type"] == tokenTypes["StartTag"]:
  206. token["data"] = dict(token["data"][::-1])
  207. return token
  208. def adjustMathMLAttributes(self, token):
  209. replacements = {"definitionurl": "definitionURL"}
  210. for k, v in replacements.items():
  211. if k in token["data"]:
  212. token["data"][v] = token["data"][k]
  213. del token["data"][k]
  214. def adjustSVGAttributes(self, token):
  215. replacements = {
  216. "attributename": "attributeName",
  217. "attributetype": "attributeType",
  218. "basefrequency": "baseFrequency",
  219. "baseprofile": "baseProfile",
  220. "calcmode": "calcMode",
  221. "clippathunits": "clipPathUnits",
  222. "contentscripttype": "contentScriptType",
  223. "contentstyletype": "contentStyleType",
  224. "diffuseconstant": "diffuseConstant",
  225. "edgemode": "edgeMode",
  226. "externalresourcesrequired": "externalResourcesRequired",
  227. "filterres": "filterRes",
  228. "filterunits": "filterUnits",
  229. "glyphref": "glyphRef",
  230. "gradienttransform": "gradientTransform",
  231. "gradientunits": "gradientUnits",
  232. "kernelmatrix": "kernelMatrix",
  233. "kernelunitlength": "kernelUnitLength",
  234. "keypoints": "keyPoints",
  235. "keysplines": "keySplines",
  236. "keytimes": "keyTimes",
  237. "lengthadjust": "lengthAdjust",
  238. "limitingconeangle": "limitingConeAngle",
  239. "markerheight": "markerHeight",
  240. "markerunits": "markerUnits",
  241. "markerwidth": "markerWidth",
  242. "maskcontentunits": "maskContentUnits",
  243. "maskunits": "maskUnits",
  244. "numoctaves": "numOctaves",
  245. "pathlength": "pathLength",
  246. "patterncontentunits": "patternContentUnits",
  247. "patterntransform": "patternTransform",
  248. "patternunits": "patternUnits",
  249. "pointsatx": "pointsAtX",
  250. "pointsaty": "pointsAtY",
  251. "pointsatz": "pointsAtZ",
  252. "preservealpha": "preserveAlpha",
  253. "preserveaspectratio": "preserveAspectRatio",
  254. "primitiveunits": "primitiveUnits",
  255. "refx": "refX",
  256. "refy": "refY",
  257. "repeatcount": "repeatCount",
  258. "repeatdur": "repeatDur",
  259. "requiredextensions": "requiredExtensions",
  260. "requiredfeatures": "requiredFeatures",
  261. "specularconstant": "specularConstant",
  262. "specularexponent": "specularExponent",
  263. "spreadmethod": "spreadMethod",
  264. "startoffset": "startOffset",
  265. "stddeviation": "stdDeviation",
  266. "stitchtiles": "stitchTiles",
  267. "surfacescale": "surfaceScale",
  268. "systemlanguage": "systemLanguage",
  269. "tablevalues": "tableValues",
  270. "targetx": "targetX",
  271. "targety": "targetY",
  272. "textlength": "textLength",
  273. "viewbox": "viewBox",
  274. "viewtarget": "viewTarget",
  275. "xchannelselector": "xChannelSelector",
  276. "ychannelselector": "yChannelSelector",
  277. "zoomandpan": "zoomAndPan"
  278. }
  279. for originalName in list(token["data"].keys()):
  280. if originalName in replacements:
  281. svgName = replacements[originalName]
  282. token["data"][svgName] = token["data"][originalName]
  283. del token["data"][originalName]
  284. def adjustForeignAttributes(self, token):
  285. replacements = adjustForeignAttributesMap
  286. for originalName in token["data"].keys():
  287. if originalName in replacements:
  288. foreignName = replacements[originalName]
  289. token["data"][foreignName] = token["data"][originalName]
  290. del token["data"][originalName]
  291. def reparseTokenNormal(self, token):
  292. self.parser.phase()
  293. def resetInsertionMode(self):
  294. # The name of this method is mostly historical. (It's also used in the
  295. # specification.)
  296. last = False
  297. newModes = {
  298. "select": "inSelect",
  299. "td": "inCell",
  300. "th": "inCell",
  301. "tr": "inRow",
  302. "tbody": "inTableBody",
  303. "thead": "inTableBody",
  304. "tfoot": "inTableBody",
  305. "caption": "inCaption",
  306. "colgroup": "inColumnGroup",
  307. "table": "inTable",
  308. "head": "inBody",
  309. "body": "inBody",
  310. "frameset": "inFrameset",
  311. "html": "beforeHead"
  312. }
  313. for node in self.tree.openElements[::-1]:
  314. nodeName = node.name
  315. new_phase = None
  316. if node == self.tree.openElements[0]:
  317. assert self.innerHTML
  318. last = True
  319. nodeName = self.innerHTML
  320. # Check for conditions that should only happen in the innerHTML
  321. # case
  322. if nodeName in ("select", "colgroup", "head", "html"):
  323. assert self.innerHTML
  324. if not last and node.namespace != self.tree.defaultNamespace:
  325. continue
  326. if nodeName in newModes:
  327. new_phase = self.phases[newModes[nodeName]]
  328. break
  329. elif last:
  330. new_phase = self.phases["inBody"]
  331. break
  332. self.phase = new_phase
  333. def parseRCDataRawtext(self, token, contentType):
  334. """Generic RCDATA/RAWTEXT Parsing algorithm
  335. contentType - RCDATA or RAWTEXT
  336. """
  337. assert contentType in ("RAWTEXT", "RCDATA")
  338. self.tree.insertElement(token)
  339. if contentType == "RAWTEXT":
  340. self.tokenizer.state = self.tokenizer.rawtextState
  341. else:
  342. self.tokenizer.state = self.tokenizer.rcdataState
  343. self.originalPhase = self.phase
  344. self.phase = self.phases["text"]
  345. def getPhases(debug):
  346. def log(function):
  347. """Logger that records which phase processes each token"""
  348. type_names = dict((value, key) for key, value in
  349. constants.tokenTypes.items())
  350. def wrapped(self, *args, **kwargs):
  351. if function.__name__.startswith("process") and len(args) > 0:
  352. token = args[0]
  353. try:
  354. info = {"type": type_names[token['type']]}
  355. except:
  356. raise
  357. if token['type'] in constants.tagTokenTypes:
  358. info["name"] = token['name']
  359. self.parser.log.append((self.parser.tokenizer.state.__name__,
  360. self.parser.phase.__class__.__name__,
  361. self.__class__.__name__,
  362. function.__name__,
  363. info))
  364. return function(self, *args, **kwargs)
  365. else:
  366. return function(self, *args, **kwargs)
  367. return wrapped
  368. def getMetaclass(use_metaclass, metaclass_func):
  369. if use_metaclass:
  370. return method_decorator_metaclass(metaclass_func)
  371. else:
  372. return type
  373. class Phase(with_metaclass(getMetaclass(debug, log))):
  374. """Base class for helper object that implements each phase of processing
  375. """
  376. def __init__(self, parser, tree):
  377. self.parser = parser
  378. self.tree = tree
  379. def processEOF(self):
  380. raise NotImplementedError
  381. def processComment(self, token):
  382. # For most phases the following is correct. Where it's not it will be
  383. # overridden.
  384. self.tree.insertComment(token, self.tree.openElements[-1])
  385. def processDoctype(self, token):
  386. self.parser.parseError("unexpected-doctype")
  387. def processCharacters(self, token):
  388. self.tree.insertText(token["data"])
  389. def processSpaceCharacters(self, token):
  390. self.tree.insertText(token["data"])
  391. def processStartTag(self, token):
  392. return self.startTagHandler[token["name"]](token)
  393. def startTagHtml(self, token):
  394. if not self.parser.firstStartTag and token["name"] == "html":
  395. self.parser.parseError("non-html-root")
  396. # XXX Need a check here to see if the first start tag token emitted is
  397. # this token... If it's not, invoke self.parser.parseError().
  398. for attr, value in token["data"].items():
  399. if attr not in self.tree.openElements[0].attributes:
  400. self.tree.openElements[0].attributes[attr] = value
  401. self.parser.firstStartTag = False
  402. def processEndTag(self, token):
  403. return self.endTagHandler[token["name"]](token)
    class InitialPhase(Phase):
        """Phase the parser starts in for whole documents: it handles the
        doctype (or its absence), sets the parser's compatMode and then
        hands over to the "beforeHtml" phase."""

        def processSpaceCharacters(self, token):
            # Whitespace tokens are dropped in this phase.
            pass

        def processComment(self, token):
            # Comments seen here are attached to the document itself.
            self.tree.insertComment(token, self.tree.document)

        def processDoctype(self, token):
            """Insert the doctype, derive compatMode from its name, public
            id and system id, and switch to the "beforeHtml" phase."""
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]
            correct = token["correct"]

            # No error is reported only when the name is "html", there is
            # no public id, and the system id is either absent or
            # "about:legacy-compat" ("and" binds tighter than "or").
            if (name != "html" or publicId is not None or
                    systemId is not None and systemId != "about:legacy-compat"):
                self.parser.parseError("unknown-doctype")

            # Normalise a missing public id so the string tests below work.
            if publicId is None:
                publicId = ""

            self.tree.insertDoctype(token)

            # Public ids are compared ASCII-case-insensitively.
            if publicId != "":
                publicId = publicId.translate(asciiUpper2Lower)

            # Quirks mode. Because "and" binds tighter than "or", the
            # HTML 4.01 frameset/transitional prefixes trigger quirks only
            # when no system id was given, and the final test is guarded so
            # that .lower() is never called on a missing system id.
            if (not correct or token["name"] != "html"
                or publicId.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
                     "-//ietf//dtd html 2.0 level 1//",
                     "-//ietf//dtd html 2.0 level 2//",
                     "-//ietf//dtd html 2.0 strict level 1//",
                     "-//ietf//dtd html 2.0 strict level 2//",
                     "-//ietf//dtd html 2.0 strict//",
                     "-//ietf//dtd html 2.0//",
                     "-//ietf//dtd html 2.1e//",
                     "-//ietf//dtd html 3.0//",
                     "-//ietf//dtd html 3.2 final//",
                     "-//ietf//dtd html 3.2//",
                     "-//ietf//dtd html 3//",
                     "-//ietf//dtd html level 0//",
                     "-//ietf//dtd html level 1//",
                     "-//ietf//dtd html level 2//",
                     "-//ietf//dtd html level 3//",
                     "-//ietf//dtd html strict level 0//",
                     "-//ietf//dtd html strict level 1//",
                     "-//ietf//dtd html strict level 2//",
                     "-//ietf//dtd html strict level 3//",
                     "-//ietf//dtd html strict//",
                     "-//ietf//dtd html//",
                     "-//metrius//dtd metrius presentational//",
                     "-//microsoft//dtd internet explorer 2.0 html strict//",
                     "-//microsoft//dtd internet explorer 2.0 html//",
                     "-//microsoft//dtd internet explorer 2.0 tables//",
                     "-//microsoft//dtd internet explorer 3.0 html strict//",
                     "-//microsoft//dtd internet explorer 3.0 html//",
                     "-//microsoft//dtd internet explorer 3.0 tables//",
                     "-//netscape comm. corp.//dtd html//",
                     "-//netscape comm. corp.//dtd strict html//",
                     "-//o'reilly and associates//dtd html 2.0//",
                     "-//o'reilly and associates//dtd html extended 1.0//",
                     "-//o'reilly and associates//dtd html extended relaxed 1.0//",
                     "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
                     "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
                     "-//spyglass//dtd html 2.0 extended//",
                     "-//sq//dtd html 2.0 hotmetal + extensions//",
                     "-//sun microsystems corp.//dtd hotjava html//",
                     "-//sun microsystems corp.//dtd hotjava strict html//",
                     "-//w3c//dtd html 3 1995-03-24//",
                     "-//w3c//dtd html 3.2 draft//",
                     "-//w3c//dtd html 3.2 final//",
                     "-//w3c//dtd html 3.2//",
                     "-//w3c//dtd html 3.2s draft//",
                     "-//w3c//dtd html 4.0 frameset//",
                     "-//w3c//dtd html 4.0 transitional//",
                     "-//w3c//dtd html experimental 19960712//",
                     "-//w3c//dtd html experimental 970421//",
                     "-//w3c//dtd w3 html//",
                     "-//w3o//dtd w3 html 3.0//",
                     "-//webtechs//dtd mozilla html 2.0//",
                     "-//webtechs//dtd mozilla html//"))
                or publicId in
                    ("-//w3o//dtd w3 html strict 3.0//en//",
                     "-/w3c/dtd html 4.0 transitional/en",
                     "html")
                or publicId.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
                    systemId is None
                    or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
                self.parser.compatMode = "quirks"
            # Limited quirks: XHTML 1.0 frameset/transitional always, and
            # HTML 4.01 frameset/transitional when a system id is present.
            elif (publicId.startswith(
                    ("-//w3c//dtd xhtml 1.0 frameset//",
                     "-//w3c//dtd xhtml 1.0 transitional//"))
                  or publicId.startswith(
                      ("-//w3c//dtd html 4.01 frameset//",
                       "-//w3c//dtd html 4.01 transitional//")) and
                  systemId is not None):
                self.parser.compatMode = "limited quirks"

            self.parser.phase = self.parser.phases["beforeHtml"]

        def anythingElse(self):
            """Fallback for any token other than a doctype: select quirks
            mode and move on to the "beforeHtml" phase."""
            self.parser.compatMode = "quirks"
            self.parser.phase = self.parser.phases["beforeHtml"]

        def processCharacters(self, token):
            self.parser.parseError("expected-doctype-but-got-chars")
            self.anythingElse()
            # Returning the token makes the main loop dispatch it again,
            # now to the newly selected phase.
            return token

        def processStartTag(self, token):
            self.parser.parseError("expected-doctype-but-got-start-tag",
                                   {"name": token["name"]})
            self.anythingElse()
            # Returned so the main loop dispatches it again.
            return token

        def processEndTag(self, token):
            self.parser.parseError("expected-doctype-but-got-end-tag",
                                   {"name": token["name"]})
            self.anythingElse()
            # Returned so the main loop dispatches it again.
            return token

        def processEOF(self):
            self.parser.parseError("expected-doctype-but-got-eof")
            self.anythingElse()
            # True asks the main loop to process EOF again in the new phase.
            return True
  class BeforeHtmlPhase(Phase):
      """Phase that runs until the root ``html`` element has been inserted."""

      # helper methods
      def insertHtmlElement(self):
          """Insert an implied ``html`` root and switch to the "beforeHead" phase."""
          root_token = impliedTagToken("html", "StartTag")
          self.tree.insertRoot(root_token)
          self.parser.phase = self.parser.phases["beforeHead"]

      # other
      def processEOF(self):
          """Imply the root element at end of input; returns True."""
          self.insertHtmlElement()
          return True

      def processComment(self, token):
          """Attach the comment directly to the document node."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Whitespace is dropped in this phase."""
          pass

      def processCharacters(self, token):
          """Imply the root element and return the token unchanged."""
          self.insertHtmlElement()
          return token

      def processStartTag(self, token):
          """Imply the root element and return the token unchanged.

          An explicit ``html`` start tag additionally sets
          ``parser.firstStartTag``.
          """
          is_html = token["name"] == "html"
          if is_html:
              self.parser.firstStartTag = True
          self.insertHtmlElement()
          return token

      def processEndTag(self, token):
          """Imply the root for head/body/html/br end tags; report any other.

          Returns the token only when the root element was implied.
          """
          tag_name = token["name"]
          if tag_name in ("head", "body", "html", "br"):
              self.insertHtmlElement()
              return token
          self.parser.parseError("unexpected-end-tag-before-html",
                                 {"name": tag_name})
  class BeforeHeadPhase(Phase):
      """Phase that waits for, or implies, the ``head`` start tag."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # Start tags: only "html" and "head" get dedicated handlers.
          self.startTagHandler = utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("head", self.startTagHead),
          ])
          self.startTagHandler.default = self.startTagOther

          # End tags: these four imply a head element, all others are errors.
          self.endTagHandler = utils.MethodDispatcher([
              (("head", "body", "html", "br"), self.endTagImplyHead),
          ])
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Imply a ``head`` start tag at end of input; returns True."""
          implied_head = impliedTagToken("head", "StartTag")
          self.startTagHead(implied_head)
          return True

      def processSpaceCharacters(self, token):
          """Whitespace is dropped in this phase."""
          pass

      def processCharacters(self, token):
          """Imply a ``head`` start tag and return the token unchanged."""
          implied_head = impliedTagToken("head", "StartTag")
          self.startTagHead(implied_head)
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagHead(self, token):
          """Insert the head element, remember it, and switch to "inHead"."""
          tree = self.tree
          tree.insertElement(token)
          # The element just inserted is the current node on the open stack.
          tree.headPointer = tree.openElements[-1]
          self.parser.phase = self.parser.phases["inHead"]

      def startTagOther(self, token):
          """Imply a ``head`` start tag and return the token unchanged."""
          implied_head = impliedTagToken("head", "StartTag")
          self.startTagHead(implied_head)
          return token

      def endTagImplyHead(self, token):
          """Imply a ``head`` start tag and return the token unchanged."""
          implied_head = impliedTagToken("head", "StartTag")
          self.startTagHead(implied_head)
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error and drop it."""
          tag_info = {"name": token["name"]}
          self.parser.parseError("end-tag-after-implied-root", tag_info)
  class InHeadPhase(Phase):
      """Phase that dispatches the tags handled while the head is open."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          self.startTagHandler = utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("title", self.startTagTitle),
              (("noscript", "noframes", "style"),
               self.startTagNoScriptNoFramesStyle),
              ("script", self.startTagScript),
              (("base", "basefont", "bgsound", "command", "link"),
               self.startTagBaseLinkCommand),
              ("meta", self.startTagMeta),
              ("head", self.startTagHead),
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = utils.MethodDispatcher([
              ("head", self.endTagHead),
              (("br", "html", "body"), self.endTagHtmlBodyBr),
          ])
          self.endTagHandler.default = self.endTagOther

      # the real thing
      def processEOF(self):
          """Close the head via ``anythingElse``; returns True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Close the head via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagHead(self, token):
          """A second ``head`` start tag is a parse error and is dropped."""
          self.parser.parseError("two-heads-are-not-better-than-one")

      def startTagBaseLinkCommand(self, token):
          """Insert the element, pop it straight away, acknowledge self-closing."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

      def startTagMeta(self, token):
          """Insert a ``meta`` element and, if allowed, act on its encoding.

          The stream encoding is only changed while its confidence (the second
          item of ``charEncoding``) is "tentative".
          """
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

          attributes = token["data"]
          stream = self.parser.tokenizer.stream
          if stream.charEncoding[1] != "tentative":
              return

          if "charset" in attributes:
              stream.changeEncoding(attributes["charset"])
          elif ("content" in attributes and
                "http-equiv" in attributes and
                attributes["http-equiv"].lower() == "content-type"):
              # Encoding it as UTF-8 here is a hack, as really we should pass
              # the abstract Unicode string, and just use the
              # ContentAttrParser on that, but using UTF-8 allows all chars
              # to be encoded and as an ASCII-superset works.
              content_bytes = attributes["content"].encode("utf-8")
              data = inputstream.EncodingBytes(content_bytes)
              codec = inputstream.ContentAttrParser(data).parse()
              stream.changeEncoding(codec)

      def startTagTitle(self, token):
          """Parse the element's content as RCDATA."""
          self.parser.parseRCDataRawtext(token, "RCDATA")

      def startTagNoScriptNoFramesStyle(self, token):
          """Parse the element's content as RAWTEXT."""
          # Need to decide whether to implement the scripting-disabled case
          self.parser.parseRCDataRawtext(token, "RAWTEXT")

      def startTagScript(self, token):
          """Insert ``script``, switch the tokenizer state and enter "text"."""
          parser = self.parser
          self.tree.insertElement(token)
          parser.tokenizer.state = parser.tokenizer.scriptDataState
          # Remember the phase in effect before switching to "text".
          parser.originalPhase = parser.phase
          parser.phase = parser.phases["text"]

      def startTagOther(self, token):
          """Close the head via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def endTagHead(self, token):
          """Pop the current node (asserted to be ``head``); go to "afterHead"."""
          popped = self.parser.tree.openElements.pop()
          assert popped.name == "head", "Expected head got %s" % popped.name
          self.parser.phase = self.parser.phases["afterHead"]

      def endTagHtmlBodyBr(self, token):
          """Close the head via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error and drop it."""
          tag_info = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", tag_info)

      def anythingElse(self):
          """Act as if a ``head`` end tag had been seen."""
          implied_end = impliedTagToken("head")
          self.endTagHead(implied_end)
  659. # XXX If we implement a parser for which scripting is disabled we need to
  660. # implement this phase.
  661. #
  662. # class InHeadNoScriptPhase(Phase):
  class AfterHeadPhase(Phase):
      """Phase between the end of the head and the body or frameset."""

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          self.startTagHandler = utils.MethodDispatcher([
              ("html", self.startTagHtml),
              ("body", self.startTagBody),
              ("frameset", self.startTagFrameset),
              (("base", "basefont", "bgsound", "link", "meta", "noframes",
                "script", "style", "title"),
               self.startTagFromHead),
              ("head", self.startTagHead),
          ])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = utils.MethodDispatcher([
              (("body", "html", "br"), self.endTagHtmlBodyBr),
          ])
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Imply a body via ``anythingElse``; returns True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Imply a body via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagBody(self, token):
          """Insert the explicit body, clear framesetOK, switch to "inBody"."""
          parser = self.parser
          parser.framesetOK = False
          self.tree.insertElement(token)
          parser.phase = parser.phases["inBody"]

      def startTagFrameset(self, token):
          """Insert the frameset element and switch to "inFrameset"."""
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inFrameset"]

      def startTagFromHead(self, token):
          """Process a head-only tag that turned up after the head was closed.

          The saved head element is pushed back onto the open-element stack,
          the token is processed by the "inHead" phase, and afterwards the
          bottom-most ``head`` node is removed from the stack again.
          """
          tag_info = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-out-of-my-head",
                                 tag_info)
          open_elements = self.tree.openElements
          open_elements.append(self.tree.headPointer)
          self.parser.phases["inHead"].processStartTag(token)
          # Iterate over a reversed copy because the stack is mutated below.
          for candidate in self.tree.openElements[::-1]:
              if candidate.name == "head":
                  self.tree.openElements.remove(candidate)
                  break

      def startTagHead(self, token):
          """A ``head`` start tag here is a parse error and is dropped."""
          tag_info = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", tag_info)

      def startTagOther(self, token):
          """Imply a body via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def endTagHtmlBodyBr(self, token):
          """Imply a body via ``anythingElse`` and return the token unchanged."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as a parse error and drop it."""
          tag_info = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", tag_info)

      def anythingElse(self):
          """Insert an implied body, switch to "inBody", set framesetOK True."""
          implied_body = impliedTagToken("body", "StartTag")
          self.tree.insertElement(implied_body)
          self.parser.phase = self.parser.phases["inBody"]
          self.parser.framesetOK = True
  717. class InBodyPhase(Phase):
  718. # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
  719. # the really-really-really-very crazy mode
  def __init__(self, parser, tree):
      """Build the start-tag and end-tag dispatch tables for this phase.

      Tag names not listed fall through to ``startTagOther`` /
      ``endTagOther`` via each dispatcher's ``default``.
      """
      Phase.__init__(self, parser, tree)

      # Keep a ref to this for special handling of whitespace in <pre>
      self.processSpaceCharactersNonPre = self.processSpaceCharacters

      self.startTagHandler = utils.MethodDispatcher([
          ("html", self.startTagHtml),
          (("base", "basefont", "bgsound", "command", "link", "meta",
            "noframes", "script", "style", "title"),
           self.startTagProcessInHead),
          ("body", self.startTagBody),
          ("frameset", self.startTagFrameset),
          # "dialog" replaces a duplicated "details" entry: the matching
          # end-tag list below already contains "dialog", and the HTML
          # "in body" rules list it among the start tags that close an
          # open <p>.
          (("address", "article", "aside", "blockquote", "center", "details",
            "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
            "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
            "section", "summary", "ul"),
           self.startTagCloseP),
          (headingElements, self.startTagHeading),
          (("pre", "listing"), self.startTagPreListing),
          ("form", self.startTagForm),
          (("li", "dd", "dt"), self.startTagListItem),
          ("plaintext", self.startTagPlaintext),
          ("a", self.startTagA),
          (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
            "strong", "tt", "u"), self.startTagFormatting),
          ("nobr", self.startTagNobr),
          ("button", self.startTagButton),
          (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
          ("xmp", self.startTagXmp),
          ("table", self.startTagTable),
          (("area", "br", "embed", "img", "keygen", "wbr"),
           self.startTagVoidFormatting),
          (("param", "source", "track"), self.startTagParamSource),
          ("input", self.startTagInput),
          ("hr", self.startTagHr),
          ("image", self.startTagImage),
          ("isindex", self.startTagIsIndex),
          ("textarea", self.startTagTextarea),
          ("iframe", self.startTagIFrame),
          # "noframes" is also listed in the in-head group above; the entry
          # order is kept exactly as it was so the effective mapping is
          # unchanged.
          (("noembed", "noframes", "noscript"), self.startTagRawtext),
          ("select", self.startTagSelect),
          (("rp", "rt"), self.startTagRpRt),
          (("option", "optgroup"), self.startTagOpt),
          ("math", self.startTagMath),
          ("svg", self.startTagSvg),
          (("caption", "col", "colgroup", "frame", "head",
            "tbody", "td", "tfoot", "th", "thead",
            "tr"), self.startTagMisplaced),
      ])
      self.startTagHandler.default = self.startTagOther

      self.endTagHandler = utils.MethodDispatcher([
          ("body", self.endTagBody),
          ("html", self.endTagHtml),
          (("address", "article", "aside", "blockquote", "button", "center",
            "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption",
            "figure", "footer", "header", "hgroup", "listing", "main", "menu",
            "nav", "ol", "pre", "section", "summary", "ul"),
           self.endTagBlock),
          ("form", self.endTagForm),
          ("p", self.endTagP),
          (("dd", "dt", "li"), self.endTagListItem),
          (headingElements, self.endTagHeading),
          (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
            "strike", "strong", "tt", "u"), self.endTagFormatting),
          (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
          ("br", self.endTagBr),
      ])
      self.endTagHandler.default = self.endTagOther
  def isMatchingFormattingElement(self, node1, node2):
      """Return True when both nodes have equal name, namespace and attributes.

      Attributes are compared as sorted ``(name, value)`` pairs, so their
      iteration order does not matter.
      """
      same_tag = (node1.name == node2.name and
                  node1.namespace == node2.namespace)
      if not same_tag:
          return False
      if len(node1.attributes) != len(node2.attributes):
          return False
      first_pairs = sorted(node1.attributes.items())
      second_pairs = sorted(node2.attributes.items())
      return first_pairs == second_pairs
  798. # helper
  def addFormattingElement(self, token):
      """Insert the element and record it as an active formatting element.

      Entries after the last ``Marker`` that match the new element are
      counted; when three already exist, the earliest of them is removed
      before the new element is appended.
      """
      tree = self.tree
      tree.insertElement(token)
      new_element = tree.openElements[-1]

      # Walk a reversed copy, newest entry first, stopping at the marker.
      duplicates = []
      for candidate in tree.activeFormattingElements[::-1]:
          if candidate is Marker:
              break
          if self.isMatchingFormattingElement(candidate, new_element):
              duplicates.append(candidate)

      assert len(duplicates) <= 3
      if len(duplicates) == 3:
          # The last one collected is the earliest entry in the list.
          tree.activeFormattingElements.remove(duplicates[-1])
      tree.activeFormattingElements.append(new_element)
  812. # the real deal
  def processEOF(self):
      """Report a parse error if an unexpected element is still open at EOF.

      At most one error is recorded, however many offending elements are open.
      """
      may_stay_open = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                 "tfoot", "th", "thead", "tr", "body",
                                 "html"))
      still_open = self.tree.openElements
      if any(node.name not in may_stay_open for node in still_open):
          self.parser.parseError("expected-closing-tag-but-got-eof")
      # Stop parsing
  def processSpaceCharactersDropNewline(self, token):
      """Insert whitespace, dropping one leading newline in an empty block.

      Sometimes (start of <pre>, <listing>, and <textarea> blocks) we want to
      drop leading newlines. This handler is one-shot: it first restores the
      regular whitespace handler.
      """
      self.processSpaceCharacters = self.processSpaceCharactersNonPre

      text = token["data"]
      if text.startswith("\n"):
          current = self.tree.openElements[-1]
          if (current.name in ("pre", "listing", "textarea") and
                  not current.hasContent()):
              text = text[1:]

      if not text:
          return
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertText(text)
  def processCharacters(self, token):
      """Insert character data and clear framesetOK for non-space text.

      A token whose data is exactly the null character is dropped.
      """
      data = token["data"]
      if data == "\u0000":
          # The tokenizer should always emit null on its own
          return
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertText(data)
      # A generator lets any() stop at the first non-space character instead
      # of building a full list for every character token.
      if (self.parser.framesetOK and
              any(char not in spaceCharacters for char in data)):
          self.parser.framesetOK = False
  def processSpaceCharacters(self, token):
      """Insert whitespace after reconstructing the active formatting elements."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertText(token["data"])
  def startTagProcessInHead(self, token):
      """Delegate the start tag to the "inHead" phase and return its result."""
      in_head = self.parser.phases["inHead"]
      return in_head.processStartTag(token)
  def startTagBody(self, token):
      """Merge a stray ``body`` start tag's attributes into the open body.

      Always a parse error. Attributes already present on the existing body
      element (second entry of the open-element stack) are left untouched.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "body"})
      open_elements = self.tree.openElements
      if len(open_elements) == 1 or open_elements[1].name != "body":
          # No body element on the stack: asserted to be the innerHTML case.
          assert self.parser.innerHTML
          return

      self.parser.framesetOK = False
      for attr, value in token["data"].items():
          if attr not in self.tree.openElements[1].attributes:
              self.tree.openElements[1].attributes[attr] = value
  def startTagFrameset(self, token):
      """Replace the body with a frameset when framesetOK still holds.

      Always a parse error. The token is ignored when framesetOK is false.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
      open_elements = self.tree.openElements
      if len(open_elements) == 1 or open_elements[1].name != "body":
          # No body element on the stack: asserted to be the innerHTML case.
          assert self.parser.innerHTML
          return
      if not self.parser.framesetOK:
          return

      # Detach the body from its parent, unwind the stack to the root, then
      # insert the frameset in its place.
      if self.tree.openElements[1].parent:
          self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
      while self.tree.openElements[-1].name != "html":
          self.tree.openElements.pop()
      self.tree.insertElement(token)
      self.parser.phase = self.parser.phases["inFrameset"]
  def startTagCloseP(self, token):
      """Close a ``p`` that is in button scope, then insert the element."""
      p_is_open = self.tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
  def startTagPreListing(self, token):
      """Insert ``pre``/``listing`` and arm the drop-leading-newline handler."""
      p_is_open = self.tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.parser.framesetOK = False
      # The next whitespace token goes through the newline-dropping handler.
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  def startTagForm(self, token):
      """Insert a ``form`` and record it as the form pointer.

      When a form pointer is already set the tag is a parse error and is
      ignored.
      """
      if self.tree.formPointer:
          self.parser.parseError("unexpected-start-tag", {"name": "form"})
          return

      if self.tree.elementInScope("p", variant="button"):
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      # The element just inserted is the current node.
      self.tree.formPointer = self.tree.openElements[-1]
  def startTagListItem(self, token):
      """Insert ``li``/``dd``/``dt``, implying end tags for open siblings.

      The open-element stack is scanned from the current node upwards; the
      scan stops at the first matching list item (which is closed) or at the
      first special element other than address, div or p.
      """
      self.parser.framesetOK = False

      closes = {"li": ["li"],
                "dt": ["dt", "dd"],
                "dd": ["dt", "dd"]}
      stop_names = closes[token["name"]]
      for open_node in reversed(self.tree.openElements):
          if open_node.name in stop_names:
              implied_end = impliedTagToken(open_node.name, "EndTag")
              self.parser.phase.processEndTag(implied_end)
              break
          is_barrier = (open_node.nameTuple in specialElements and
                        open_node.name not in ("address", "div", "p"))
          if is_barrier:
              break

      if self.tree.elementInScope("p", variant="button"):
          self.parser.phase.processEndTag(impliedTagToken("p", "EndTag"))

      self.tree.insertElement(token)
  def startTagPlaintext(self, token):
      """Insert ``plaintext`` and switch the tokenizer to its plaintext state."""
      p_is_open = self.tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      tokenizer = self.parser.tokenizer
      tokenizer.state = tokenizer.plaintextState
  def startTagHeading(self, token):
      """Insert a heading, first popping a heading that is the current node.

      Nested headings are a parse error.
      """
      if self.tree.elementInScope("p", variant="button"):
          self.endTagP(impliedTagToken("p"))
      current_name = self.tree.openElements[-1].name
      if current_name in headingElements:
          self.parser.parseError("unexpected-start-tag",
                                 {"name": token["name"]})
          self.tree.openElements.pop()
      self.tree.insertElement(token)
  def startTagA(self, token):
      """Insert an ``a`` element, first closing any still-active ``a``.

      An ``a`` already in the active formatting elements is a parse error:
      an implied end tag is run through ``endTagFormatting`` and any
      remaining trace of the old element is removed from both lists.
      """
      tree = self.tree
      previous_a = tree.elementInActiveFormattingElements("a")
      if previous_a:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "a", "endName": "a"})
          self.endTagFormatting(impliedTagToken("a"))
          # The implied end tag may not have removed the element everywhere.
          if previous_a in self.tree.openElements:
              self.tree.openElements.remove(previous_a)
          if previous_a in self.tree.activeFormattingElements:
              self.tree.activeFormattingElements.remove(previous_a)
      self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagFormatting(self, token):
      """Reconstruct active formatting, then add this formatting element."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagNobr(self, token):
      """Add a ``nobr`` formatting element, closing one already in scope.

      A ``nobr`` in scope is a parse error and is closed with an implied end
      tag before the new element is added.
      """
      self.tree.reconstructActiveFormattingElements()
      nobr_is_open = self.tree.elementInScope("nobr")
      if nobr_is_open:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "nobr", "endName": "nobr"})
          self.processEndTag(impliedTagToken("nobr"))
          # XXX Need tests that trigger the following
          self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagButton(self, token):
      """Insert a ``button``, or close one in scope and return the token.

      With a button already in scope this is a parse error: an implied end
      tag is processed and the token is returned to the caller. Otherwise
      the element is inserted and framesetOK is cleared.
      """
      if not self.tree.elementInScope("button"):
          self.tree.reconstructActiveFormattingElements()
          self.tree.insertElement(token)
          self.parser.framesetOK = False
          return None

      self.parser.parseError("unexpected-start-tag-implies-end-tag",
                             {"startName": "button", "endName": "button"})
      self.processEndTag(impliedTagToken("button"))
      return token
  def startTagAppletMarqueeObject(self, token):
      """Insert the element and push a ``Marker`` onto the formatting list."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      tree.activeFormattingElements.append(Marker)
      self.parser.framesetOK = False
  def startTagXmp(self, token):
      """Close an open ``p`` and parse the ``xmp`` content as RAWTEXT."""
      p_is_open = self.tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.framesetOK = False
      parser.parseRCDataRawtext(token, "RAWTEXT")
  def startTagTable(self, token):
      """Insert a ``table`` and switch to the "inTable" phase.

      Outside quirks mode, a ``p`` in button scope is closed first.
      """
      not_quirks = self.parser.compatMode != "quirks"
      if not_quirks and self.tree.elementInScope("p", variant="button"):
          self.processEndTag(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.parser.framesetOK = False
      self.parser.phase = self.parser.phases["inTable"]
  def startTagVoidFormatting(self, token):
      """Insert a void element, pop it again, and clear framesetOK."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      # The element has no content, so it leaves the stack immediately.
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagInput(self, token):
      """Insert an ``input``; a hidden input leaves framesetOK as it was."""
      saved_frameset_ok = self.parser.framesetOK
      self.startTagVoidFormatting(token)
      attributes = token["data"]
      is_hidden = ("type" in attributes and
                   attributes["type"].translate(asciiUpper2Lower) == "hidden")
      if is_hidden:
          # input type=hidden doesn't change framesetOK
          self.parser.framesetOK = saved_frameset_ok
  def startTagParamSource(self, token):
      """Insert the element, pop it straight away, acknowledge self-closing."""
      tree = self.tree
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagHr(self, token):
      """Close an open ``p``, insert ``hr`` as a void element, clear framesetOK."""
      p_is_open = self.tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      self.tree.insertElement(token)
      # The element has no content, so it leaves the stack immediately.
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagImage(self, token):
      """Treat ``image`` as ``img``: report the error and reprocess the tag."""
      # No really...
      self.parser.parseError("unexpected-start-tag-treated-as",
                             {"originalName": "image", "newName": "img"})
      img_token = impliedTagToken("img", "StartTag",
                                  attributes=token["data"],
                                  selfClosing=token["selfClosing"])
      self.processStartTag(img_token)
  def startTagIsIndex(self, token):
      """Expand ``isindex`` into form / hr / label / input / hr markup.

      Always a parse error; ignored when a form pointer is already set. The
      ``action`` attribute goes to the form, ``prompt`` becomes the label
      text, and the remaining attributes go to the input, whose ``name`` is
      forced to "isindex".
      """
      self.parser.parseError("deprecated-tag", {"name": "isindex"})
      if self.tree.formPointer:
          return

      source_attrs = token["data"]

      form_attrs = {}
      if "action" in source_attrs:
          form_attrs["action"] = source_attrs["action"]
      self.processStartTag(impliedTagToken("form", "StartTag",
                                           attributes=form_attrs))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processStartTag(impliedTagToken("label", "StartTag"))

      # XXX Localization ...
      if "prompt" in source_attrs:
          prompt = source_attrs["prompt"]
      else:
          prompt = "This is a searchable index. Enter search keywords: "
      self.processCharacters(
          {"type": tokenTypes["Characters"], "data": prompt})

      # The input gets a copy of the attributes minus action and prompt.
      input_attrs = source_attrs.copy()
      for consumed in ("action", "prompt"):
          if consumed in input_attrs:
              del input_attrs[consumed]
      input_attrs["name"] = "isindex"
      self.processStartTag(impliedTagToken("input", "StartTag",
                                           attributes=input_attrs,
                                           selfClosing=token["selfClosing"]))

      self.processEndTag(impliedTagToken("label"))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processEndTag(impliedTagToken("form"))
  def startTagTextarea(self, token):
      """Insert ``textarea``, switch to RCDATA, arm the newline-drop handler."""
      self.tree.insertElement(token)
      tokenizer = self.parser.tokenizer
      tokenizer.state = tokenizer.rcdataState
      # The next whitespace token goes through the newline-dropping handler.
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
      self.parser.framesetOK = False
  def startTagIFrame(self, token):
      """Clear framesetOK, then handle ``iframe`` like other raw-text tags."""
      parser = self.parser
      parser.framesetOK = False
      self.startTagRawtext(token)
  def startTagRawtext(self, token):
      """iframe, noembed noframes, noscript(if scripting enabled)"""
      content_model = "RAWTEXT"
      self.parser.parseRCDataRawtext(token, content_model)
  def startTagOpt(self, token):
      """Insert ``option``/``optgroup``, closing an ``option`` that is current."""
      current = self.tree.openElements[-1]
      if current.name == "option":
          self.parser.phase.processEndTag(impliedTagToken("option"))
      self.tree.reconstructActiveFormattingElements()
      self.parser.tree.insertElement(token)
  def startTagSelect(self, token):
      """Insert ``select`` and pick the matching select phase.

      When the parser's current phase is one of the table-related phases the
      next phase is "inSelectInTable", otherwise "inSelect".
      """
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertElement(token)
      self.parser.framesetOK = False

      phases = self.parser.phases
      table_phases = [phases[name] for name in
                      ("inTable", "inCaption", "inColumnGroup",
                       "inTableBody", "inRow", "inCell")]
      if self.parser.phase in table_phases:
          next_phase = phases["inSelectInTable"]
      else:
          next_phase = phases["inSelect"]
      self.parser.phase = next_phase
  def startTagRpRt(self, token):
      """Insert ``rp``/``rt``, generating implied end tags inside a ``ruby``.

      A parse error is recorded if, after that, the current node is not
      ``ruby``.
      """
      tree = self.tree
      if tree.elementInScope("ruby"):
          tree.generateImpliedEndTags()
          current_name = tree.openElements[-1].name
          if current_name != "ruby":
              self.parser.parseError()
      tree.insertElement(token)
  def startTagMath(self, token):
      """Insert a ``math`` element in the MathML namespace."""
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustMathMLAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["mathml"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      # A self-closing tag leaves the stack right away.
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagSvg(self, token):
      """Insert an ``svg`` element in the SVG namespace."""
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustSVGAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["svg"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      # A self-closing tag leaves the stack right away.
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagMisplaced(self, token):
      """ Elements that should be children of other elements that have a
      different insertion mode; here they are ignored
      "caption", "col", "colgroup", "frame", "frameset", "head",
      "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
      "tr", "noscript"
      """
      tag_info = {"name": token["name"]}
      self.parser.parseError("unexpected-start-tag-ignored", tag_info)
  def startTagOther(self, token):
      """Default start-tag handler: reconstruct formatting, insert the element."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
  def endTagP(self, token):
      """Close the nearest ``p``, creating an empty one if none is in scope.

      With no ``p`` in button scope, an implied ``p`` start tag is processed,
      a parse error is recorded, and the end tag is then handled again.
      """
      if self.tree.elementInScope("p", variant="button"):
          self.tree.generateImpliedEndTags("p")
          if self.tree.openElements[-1].name != "p":
              self.parser.parseError("unexpected-end-tag", {"name": "p"})
          # Pop up to and including the p element.
          popped = self.tree.openElements.pop()
          while popped.name != "p":
              popped = self.tree.openElements.pop()
          return

      self.startTagCloseP(impliedTagToken("p", "StartTag"))
      self.parser.parseError("unexpected-end-tag", {"name": "p"})
      self.endTagP(impliedTagToken("p", "EndTag"))
  def endTagBody(self, token):
      """Handle ``</body>``: switch to the "afterBody" phase when allowed.

      When no body is in scope the tag is a parse error and is ignored. When
      the current node is not the body, the first open element (from the
      third stack entry on) that is not in the allowed set is reported once.
      """
      if not self.tree.elementInScope("body"):
          self.parser.parseError()
          return

      if self.tree.openElements[-1].name != "body":
          # Built once here rather than on every loop iteration.
          may_stay_open = frozenset(("dd", "dt", "li", "optgroup",
                                     "option", "p", "rp", "rt",
                                     "tbody", "td", "tfoot",
                                     "th", "thead", "tr", "body",
                                     "html"))
          for node in self.tree.openElements[2:]:
              if node.name not in may_stay_open:
                  # Not sure this is the correct name for the parse error
                  self.parser.parseError(
                      "expected-one-end-tag-but-got-another",
                      {"expectedName": "body", "gotName": node.name})
                  break
      self.parser.phase = self.parser.phases["afterBody"]
  def endTagHtml(self, token):
      """Handle ``</html>`` as ``</body>`` and return the token unchanged.

      Returns None, without doing anything, when no body is in scope.
      """
      # We repeat the test for the body end tag token being ignored here
      if not self.tree.elementInScope("body"):
          return None
      self.endTagBody(impliedTagToken("body"))
      return token
  def endTagBlock(self, token):
      """Close a block element, popping the stack down to it when in scope.

      A parse error is recorded whenever the current node is not the named
      element, whether or not that element is in scope.
      """
      tag_name = token["name"]
      # Put us back in the right whitespace handling mode
      if tag_name == "pre":
          self.processSpaceCharacters = self.processSpaceCharactersNonPre

      in_scope = self.tree.elementInScope(tag_name)
      if in_scope:
          self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1].name != tag_name:
          self.parser.parseError("end-tag-too-early", {"name": tag_name})
      if not in_scope:
          return
      # Pop up to and including the named element.
      popped = self.tree.openElements.pop()
      while popped.name != tag_name:
          popped = self.tree.openElements.pop()
  def endTagForm(self, token):
      """Close the form referenced by the form pointer, which is always reset.

      The form element is removed from the open-element stack wherever it
      sits; elements opened after it stay on the stack.
      """
      form_node = self.tree.formPointer
      self.tree.formPointer = None

      if form_node is None or not self.tree.elementInScope(form_node):
          self.parser.parseError("unexpected-end-tag", {"name": "form"})
          return

      self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1] != form_node:
          self.parser.parseError("end-tag-too-early-ignored",
                                 {"name": "form"})
      self.tree.openElements.remove(form_node)
  def endTagListItem(self, token):
      """Close ``li``/``dd``/``dt``, popping the stack down to it.

      ``li`` is looked up with the "list" scope variant, the others with the
      default scope. An end tag with no matching element in scope is a parse
      error and is ignored.
      """
      tag_name = token["name"]
      variant = "list" if tag_name == "li" else None

      if not self.tree.elementInScope(tag_name, variant=variant):
          self.parser.parseError("unexpected-end-tag", {"name": tag_name})
          return

      self.tree.generateImpliedEndTags(exclude=tag_name)
      if self.tree.openElements[-1].name != tag_name:
          self.parser.parseError("end-tag-too-early", {"name": tag_name})
      # Pop up to and including the named element.
      popped = self.tree.openElements.pop()
      while popped.name != tag_name:
          popped = self.tree.openElements.pop()
  def endTagHeading(self, token):
      """Close the nearest open heading, whatever its level.

      A parse error is recorded when the current node's name differs from
      the end tag's name.
      """
      tree = self.tree
      if any(tree.elementInScope(name) for name in headingElements):
          tree.generateImpliedEndTags()

      if tree.openElements[-1].name != token["name"]:
          self.parser.parseError("end-tag-too-early",
                                 {"name": token["name"]})

      # Scope is checked again because the stack may have changed above.
      if any(tree.elementInScope(name) for name in headingElements):
          popped = tree.openElements.pop()
          while popped.name not in headingElements:
              popped = tree.openElements.pop()
  1191. def endTagFormatting(self, token):
  1192. """The much-feared adoption agency algorithm"""
  1193. # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
  1194. # XXX Better parseError messages appreciated.
  1195. # Step 1
  1196. outerLoopCounter = 0
  1197. # Step 2
  1198. while outerLoopCounter < 8:
  1199. # Step 3
  1200. outerLoopCounter += 1
  1201. # Step 4:
  1202. # Let the formatting element be the last element in
  1203. # the list of active formatting elements that:
  1204. # - is between the end of the list and the last scope
  1205. # marker in the list, if any, or the start of the list
  1206. # otherwise, and
  1207. # - has the same tag name as the token.
  1208. formattingElement = self.tree.elementInActiveFormattingElements(
  1209. token["name"])
  1210. if (not formattingElement or
  1211. (formattingElement in self.tree.openElements and
  1212. not self.tree.elementInScope(formattingElement.name))):
  1213. # If there is no such node, then abort these steps
  1214. # and instead act as described in the "any other
  1215. # end tag" entry below.
  1216. self.endTagOther(token)
  1217. return
  1218. # Otherwise, if there is such a node, but that node is
  1219. # not in the stack of open elements, then this is a
  1220. # parse error; remove the element from the list, and
  1221. # abort these steps.
  1222. elif formattingElement not in self.tree.openElements:
  1223. self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
  1224. self.tree.activeFormattingElements.remove(formattingElement)
  1225. return
  1226. # Otherwise, if there is such a node, and that node is
  1227. # also in the stack of open elements, but the element
  1228. # is not in scope, then this is a parse error; ignore
  1229. # the token, and abort these steps.
  1230. elif not self.tree.elementInScope(formattingElement.name):
  1231. self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
  1232. return
  1233. # Otherwise, there is a formatting element and that
  1234. # element is in the stack and is in scope. If the
  1235. # element is not the current node, this is a parse
  1236. # error. In any case, proceed with the algorithm as
  1237. # written in the following steps.
  1238. else:
  1239. if formattingElement != self.tree.openElements[-1]:
  1240. self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
  1241. # Step 5:
  1242. # Let the furthest block be the topmost node in the
  1243. # stack of open elements that is lower in the stack
  1244. # than the formatting element, and is an element in
  1245. # the special category. There might not be one.
  1246. afeIndex = self.tree.openElements.index(formattingElement)
  1247. furthestBlock = None
  1248. for element in self.tree.openElements[afeIndex:]:
  1249. if element.nameTuple in specialElements:
  1250. furthestBlock = element
  1251. break
  1252. # Step 6:
  1253. # If there is no furthest block, then the UA must
  1254. # first pop all the nodes from the bottom of the stack
  1255. # of open elements, from the current node up to and
  1256. # including the formatting element, then remove the
  1257. # formatting element from the list of active
  1258. # formatting elements, and finally abort these steps.
  1259. if furthestBlock is None:
  1260. element = self.tree.openElements.pop()
  1261. while element != formattingElement:
  1262. element = self.tree.openElements.pop()
  1263. self.tree.activeFormattingElements.remove(element)
  1264. return
  1265. # Step 7
  1266. commonAncestor = self.tree.openElements[afeIndex - 1]
  1267. # Step 8:
  1268. # The bookmark is supposed to help us identify where to reinsert
  1269. # nodes in step 15. We have to ensure that we reinsert nodes after
  1270. # the node before the active formatting element. Note the bookmark
  1271. # can move in step 9.7
  1272. bookmark = self.tree.activeFormattingElements.index(formattingElement)
  1273. # Step 9
  1274. lastNode = node = furthestBlock
  1275. innerLoopCounter = 0
  1276. index = self.tree.openElements.index(node)
  1277. while innerLoopCounter < 3:
  1278. innerLoopCounter += 1
  1279. # Node is element before node in open elements
  1280. index -= 1
  1281. node = self.tree.openElements[index]
  1282. if node not in self.tree.activeFormattingElements:
  1283. self.tree.openElements.remove(node)
  1284. continue
  1285. # Step 9.6
  1286. if node == formattingElement:
  1287. break
  1288. # Step 9.7
  1289. if lastNode == furthestBlock:
  1290. bookmark = self.tree.activeFormattingElements.index(node) + 1
  1291. # Step 9.8
  1292. clone = node.cloneNode()
  1293. # Replace node with clone
  1294. self.tree.activeFormattingElements[
  1295. self.tree.activeFormattingElements.index(node)] = clone
  1296. self.tree.openElements[
  1297. self.tree.openElements.index(node)] = clone
  1298. node = clone
  1299. # Step 9.9
  1300. # Remove lastNode from its parents, if any
  1301. if lastNode.parent:
  1302. lastNode.parent.removeChild(lastNode)
  1303. node.appendChild(lastNode)
  1304. # Step 9.10
  1305. lastNode = node
  1306. # Step 10
  1307. # Foster parent lastNode if commonAncestor is a
  1308. # table, tbody, tfoot, thead, or tr we need to foster
  1309. # parent the lastNode
  1310. if lastNode.parent:
  1311. lastNode.parent.removeChild(lastNode)
  1312. if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
  1313. parent, insertBefore = self.tree.getTableMisnestedNodePosition()
  1314. parent.insertBefore(lastNode, insertBefore)
  1315. else:
  1316. commonAncestor.appendChild(lastNode)
  1317. # Step 11
  1318. clone = formattingElement.cloneNode()
  1319. # Step 12
  1320. furthestBlock.reparentChildren(clone)
  1321. # Step 13
  1322. furthestBlock.appendChild(clone)
  1323. # Step 14
  1324. self.tree.activeFormattingElements.remove(formattingElement)
  1325. self.tree.activeFormattingElements.insert(bookmark, clone)
  1326. # Step 15
  1327. self.tree.openElements.remove(formattingElement)
  1328. self.tree.openElements.insert(
  1329. self.tree.openElements.index(furthestBlock) + 1, clone)
  1330. def endTagAppletMarqueeObject(self, token):
  1331. if self.tree.elementInScope(token["name"]):
  1332. self.tree.generateImpliedEndTags()
  1333. if self.tree.openElements[-1].name != token["name"]:
  1334. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1335. if self.tree.elementInScope(token["name"]):
  1336. element = self.tree.openElements.pop()
  1337. while element.name != token["name"]:
  1338. element = self.tree.openElements.pop()
  1339. self.tree.clearActiveFormattingElements()
  1340. def endTagBr(self, token):
  1341. self.parser.parseError("unexpected-end-tag-treated-as",
  1342. {"originalName": "br", "newName": "br element"})
  1343. self.tree.reconstructActiveFormattingElements()
  1344. self.tree.insertElement(impliedTagToken("br", "StartTag"))
  1345. self.tree.openElements.pop()
  1346. def endTagOther(self, token):
  1347. for node in self.tree.openElements[::-1]:
  1348. if node.name == token["name"]:
  1349. self.tree.generateImpliedEndTags(exclude=token["name"])
  1350. if self.tree.openElements[-1].name != token["name"]:
  1351. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1352. while self.tree.openElements.pop() != node:
  1353. pass
  1354. break
  1355. else:
  1356. if node.nameTuple in specialElements:
  1357. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1358. break
  1359. class TextPhase(Phase):
  1360. def __init__(self, parser, tree):
  1361. Phase.__init__(self, parser, tree)
  1362. self.startTagHandler = utils.MethodDispatcher([])
  1363. self.startTagHandler.default = self.startTagOther
  1364. self.endTagHandler = utils.MethodDispatcher([
  1365. ("script", self.endTagScript)])
  1366. self.endTagHandler.default = self.endTagOther
  1367. def processCharacters(self, token):
  1368. self.tree.insertText(token["data"])
  1369. def processEOF(self):
  1370. self.parser.parseError("expected-named-closing-tag-but-got-eof",
  1371. {"name": self.tree.openElements[-1].name})
  1372. self.tree.openElements.pop()
  1373. self.parser.phase = self.parser.originalPhase
  1374. return True
  1375. def startTagOther(self, token):
  1376. assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
  1377. def endTagScript(self, token):
  1378. node = self.tree.openElements.pop()
  1379. assert node.name == "script"
  1380. self.parser.phase = self.parser.originalPhase
  1381. # The rest of this method is all stuff that only happens if
  1382. # document.write works
  1383. def endTagOther(self, token):
  1384. self.tree.openElements.pop()
  1385. self.parser.phase = self.parser.originalPhase
  1386. class InTablePhase(Phase):
  1387. # http://www.whatwg.org/specs/web-apps/current-work/#in-table
  1388. def __init__(self, parser, tree):
  1389. Phase.__init__(self, parser, tree)
  1390. self.startTagHandler = utils.MethodDispatcher([
  1391. ("html", self.startTagHtml),
  1392. ("caption", self.startTagCaption),
  1393. ("colgroup", self.startTagColgroup),
  1394. ("col", self.startTagCol),
  1395. (("tbody", "tfoot", "thead"), self.startTagRowGroup),
  1396. (("td", "th", "tr"), self.startTagImplyTbody),
  1397. ("table", self.startTagTable),
  1398. (("style", "script"), self.startTagStyleScript),
  1399. ("input", self.startTagInput),
  1400. ("form", self.startTagForm)
  1401. ])
  1402. self.startTagHandler.default = self.startTagOther
  1403. self.endTagHandler = utils.MethodDispatcher([
  1404. ("table", self.endTagTable),
  1405. (("body", "caption", "col", "colgroup", "html", "tbody", "td",
  1406. "tfoot", "th", "thead", "tr"), self.endTagIgnore)
  1407. ])
  1408. self.endTagHandler.default = self.endTagOther
  1409. # helper methods
  1410. def clearStackToTableContext(self):
  1411. # "clear the stack back to a table context"
  1412. while self.tree.openElements[-1].name not in ("table", "html"):
  1413. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1414. # {"name": self.tree.openElements[-1].name})
  1415. self.tree.openElements.pop()
  1416. # When the current node is <html> it's an innerHTML case
  1417. # processing methods
  1418. def processEOF(self):
  1419. if self.tree.openElements[-1].name != "html":
  1420. self.parser.parseError("eof-in-table")
  1421. else:
  1422. assert self.parser.innerHTML
  1423. # Stop parsing
  1424. def processSpaceCharacters(self, token):
  1425. originalPhase = self.parser.phase
  1426. self.parser.phase = self.parser.phases["inTableText"]
  1427. self.parser.phase.originalPhase = originalPhase
  1428. self.parser.phase.processSpaceCharacters(token)
  1429. def processCharacters(self, token):
  1430. originalPhase = self.parser.phase
  1431. self.parser.phase = self.parser.phases["inTableText"]
  1432. self.parser.phase.originalPhase = originalPhase
  1433. self.parser.phase.processCharacters(token)
  1434. def insertText(self, token):
  1435. # If we get here there must be at least one non-whitespace character
  1436. # Do the table magic!
  1437. self.tree.insertFromTable = True
  1438. self.parser.phases["inBody"].processCharacters(token)
  1439. self.tree.insertFromTable = False
  1440. def startTagCaption(self, token):
  1441. self.clearStackToTableContext()
  1442. self.tree.activeFormattingElements.append(Marker)
  1443. self.tree.insertElement(token)
  1444. self.parser.phase = self.parser.phases["inCaption"]
  1445. def startTagColgroup(self, token):
  1446. self.clearStackToTableContext()
  1447. self.tree.insertElement(token)
  1448. self.parser.phase = self.parser.phases["inColumnGroup"]
  1449. def startTagCol(self, token):
  1450. self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
  1451. return token
  1452. def startTagRowGroup(self, token):
  1453. self.clearStackToTableContext()
  1454. self.tree.insertElement(token)
  1455. self.parser.phase = self.parser.phases["inTableBody"]
  1456. def startTagImplyTbody(self, token):
  1457. self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
  1458. return token
  1459. def startTagTable(self, token):
  1460. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  1461. {"startName": "table", "endName": "table"})
  1462. self.parser.phase.processEndTag(impliedTagToken("table"))
  1463. if not self.parser.innerHTML:
  1464. return token
  1465. def startTagStyleScript(self, token):
  1466. return self.parser.phases["inHead"].processStartTag(token)
  1467. def startTagInput(self, token):
  1468. if ("type" in token["data"] and
  1469. token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
  1470. self.parser.parseError("unexpected-hidden-input-in-table")
  1471. self.tree.insertElement(token)
  1472. # XXX associate with form
  1473. self.tree.openElements.pop()
  1474. else:
  1475. self.startTagOther(token)
  1476. def startTagForm(self, token):
  1477. self.parser.parseError("unexpected-form-in-table")
  1478. if self.tree.formPointer is None:
  1479. self.tree.insertElement(token)
  1480. self.tree.formPointer = self.tree.openElements[-1]
  1481. self.tree.openElements.pop()
  1482. def startTagOther(self, token):
  1483. self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
  1484. # Do the table magic!
  1485. self.tree.insertFromTable = True
  1486. self.parser.phases["inBody"].processStartTag(token)
  1487. self.tree.insertFromTable = False
  1488. def endTagTable(self, token):
  1489. if self.tree.elementInScope("table", variant="table"):
  1490. self.tree.generateImpliedEndTags()
  1491. if self.tree.openElements[-1].name != "table":
  1492. self.parser.parseError("end-tag-too-early-named",
  1493. {"gotName": "table",
  1494. "expectedName": self.tree.openElements[-1].name})
  1495. while self.tree.openElements[-1].name != "table":
  1496. self.tree.openElements.pop()
  1497. self.tree.openElements.pop()
  1498. self.parser.resetInsertionMode()
  1499. else:
  1500. # innerHTML case
  1501. assert self.parser.innerHTML
  1502. self.parser.parseError()
  1503. def endTagIgnore(self, token):
  1504. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1505. def endTagOther(self, token):
  1506. self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
  1507. # Do the table magic!
  1508. self.tree.insertFromTable = True
  1509. self.parser.phases["inBody"].processEndTag(token)
  1510. self.tree.insertFromTable = False
  1511. class InTableTextPhase(Phase):
  1512. def __init__(self, parser, tree):
  1513. Phase.__init__(self, parser, tree)
  1514. self.originalPhase = None
  1515. self.characterTokens = []
  1516. def flushCharacters(self):
  1517. data = "".join([item["data"] for item in self.characterTokens])
  1518. if any([item not in spaceCharacters for item in data]):
  1519. token = {"type": tokenTypes["Characters"], "data": data}
  1520. self.parser.phases["inTable"].insertText(token)
  1521. elif data:
  1522. self.tree.insertText(data)
  1523. self.characterTokens = []
  1524. def processComment(self, token):
  1525. self.flushCharacters()
  1526. self.parser.phase = self.originalPhase
  1527. return token
  1528. def processEOF(self):
  1529. self.flushCharacters()
  1530. self.parser.phase = self.originalPhase
  1531. return True
  1532. def processCharacters(self, token):
  1533. if token["data"] == "\u0000":
  1534. return
  1535. self.characterTokens.append(token)
  1536. def processSpaceCharacters(self, token):
  1537. # pretty sure we should never reach here
  1538. self.characterTokens.append(token)
  1539. # assert False
  1540. def processStartTag(self, token):
  1541. self.flushCharacters()
  1542. self.parser.phase = self.originalPhase
  1543. return token
  1544. def processEndTag(self, token):
  1545. self.flushCharacters()
  1546. self.parser.phase = self.originalPhase
  1547. return token
  1548. class InCaptionPhase(Phase):
  1549. # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
  1550. def __init__(self, parser, tree):
  1551. Phase.__init__(self, parser, tree)
  1552. self.startTagHandler = utils.MethodDispatcher([
  1553. ("html", self.startTagHtml),
  1554. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1555. "thead", "tr"), self.startTagTableElement)
  1556. ])
  1557. self.startTagHandler.default = self.startTagOther
  1558. self.endTagHandler = utils.MethodDispatcher([
  1559. ("caption", self.endTagCaption),
  1560. ("table", self.endTagTable),
  1561. (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
  1562. "thead", "tr"), self.endTagIgnore)
  1563. ])
  1564. self.endTagHandler.default = self.endTagOther
  1565. def ignoreEndTagCaption(self):
  1566. return not self.tree.elementInScope("caption", variant="table")
  1567. def processEOF(self):
  1568. self.parser.phases["inBody"].processEOF()
  1569. def processCharacters(self, token):
  1570. return self.parser.phases["inBody"].processCharacters(token)
  1571. def startTagTableElement(self, token):
  1572. self.parser.parseError()
  1573. # XXX Have to duplicate logic here to find out if the tag is ignored
  1574. ignoreEndTag = self.ignoreEndTagCaption()
  1575. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1576. if not ignoreEndTag:
  1577. return token
  1578. def startTagOther(self, token):
  1579. return self.parser.phases["inBody"].processStartTag(token)
  1580. def endTagCaption(self, token):
  1581. if not self.ignoreEndTagCaption():
  1582. # AT this code is quite similar to endTagTable in "InTable"
  1583. self.tree.generateImpliedEndTags()
  1584. if self.tree.openElements[-1].name != "caption":
  1585. self.parser.parseError("expected-one-end-tag-but-got-another",
  1586. {"gotName": "caption",
  1587. "expectedName": self.tree.openElements[-1].name})
  1588. while self.tree.openElements[-1].name != "caption":
  1589. self.tree.openElements.pop()
  1590. self.tree.openElements.pop()
  1591. self.tree.clearActiveFormattingElements()
  1592. self.parser.phase = self.parser.phases["inTable"]
  1593. else:
  1594. # innerHTML case
  1595. assert self.parser.innerHTML
  1596. self.parser.parseError()
  1597. def endTagTable(self, token):
  1598. self.parser.parseError()
  1599. ignoreEndTag = self.ignoreEndTagCaption()
  1600. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1601. if not ignoreEndTag:
  1602. return token
  1603. def endTagIgnore(self, token):
  1604. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1605. def endTagOther(self, token):
  1606. return self.parser.phases["inBody"].processEndTag(token)
  1607. class InColumnGroupPhase(Phase):
  1608. # http://www.whatwg.org/specs/web-apps/current-work/#in-column
  1609. def __init__(self, parser, tree):
  1610. Phase.__init__(self, parser, tree)
  1611. self.startTagHandler = utils.MethodDispatcher([
  1612. ("html", self.startTagHtml),
  1613. ("col", self.startTagCol)
  1614. ])
  1615. self.startTagHandler.default = self.startTagOther
  1616. self.endTagHandler = utils.MethodDispatcher([
  1617. ("colgroup", self.endTagColgroup),
  1618. ("col", self.endTagCol)
  1619. ])
  1620. self.endTagHandler.default = self.endTagOther
  1621. def ignoreEndTagColgroup(self):
  1622. return self.tree.openElements[-1].name == "html"
  1623. def processEOF(self):
  1624. if self.tree.openElements[-1].name == "html":
  1625. assert self.parser.innerHTML
  1626. return
  1627. else:
  1628. ignoreEndTag = self.ignoreEndTagColgroup()
  1629. self.endTagColgroup(impliedTagToken("colgroup"))
  1630. if not ignoreEndTag:
  1631. return True
  1632. def processCharacters(self, token):
  1633. ignoreEndTag = self.ignoreEndTagColgroup()
  1634. self.endTagColgroup(impliedTagToken("colgroup"))
  1635. if not ignoreEndTag:
  1636. return token
  1637. def startTagCol(self, token):
  1638. self.tree.insertElement(token)
  1639. self.tree.openElements.pop()
  1640. def startTagOther(self, token):
  1641. ignoreEndTag = self.ignoreEndTagColgroup()
  1642. self.endTagColgroup(impliedTagToken("colgroup"))
  1643. if not ignoreEndTag:
  1644. return token
  1645. def endTagColgroup(self, token):
  1646. if self.ignoreEndTagColgroup():
  1647. # innerHTML case
  1648. assert self.parser.innerHTML
  1649. self.parser.parseError()
  1650. else:
  1651. self.tree.openElements.pop()
  1652. self.parser.phase = self.parser.phases["inTable"]
  1653. def endTagCol(self, token):
  1654. self.parser.parseError("no-end-tag", {"name": "col"})
  1655. def endTagOther(self, token):
  1656. ignoreEndTag = self.ignoreEndTagColgroup()
  1657. self.endTagColgroup(impliedTagToken("colgroup"))
  1658. if not ignoreEndTag:
  1659. return token
  1660. class InTableBodyPhase(Phase):
  1661. # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
  1662. def __init__(self, parser, tree):
  1663. Phase.__init__(self, parser, tree)
  1664. self.startTagHandler = utils.MethodDispatcher([
  1665. ("html", self.startTagHtml),
  1666. ("tr", self.startTagTr),
  1667. (("td", "th"), self.startTagTableCell),
  1668. (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
  1669. self.startTagTableOther)
  1670. ])
  1671. self.startTagHandler.default = self.startTagOther
  1672. self.endTagHandler = utils.MethodDispatcher([
  1673. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1674. ("table", self.endTagTable),
  1675. (("body", "caption", "col", "colgroup", "html", "td", "th",
  1676. "tr"), self.endTagIgnore)
  1677. ])
  1678. self.endTagHandler.default = self.endTagOther
  1679. # helper methods
  1680. def clearStackToTableBodyContext(self):
  1681. while self.tree.openElements[-1].name not in ("tbody", "tfoot",
  1682. "thead", "html"):
  1683. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1684. # {"name": self.tree.openElements[-1].name})
  1685. self.tree.openElements.pop()
  1686. if self.tree.openElements[-1].name == "html":
  1687. assert self.parser.innerHTML
  1688. # the rest
  1689. def processEOF(self):
  1690. self.parser.phases["inTable"].processEOF()
  1691. def processSpaceCharacters(self, token):
  1692. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1693. def processCharacters(self, token):
  1694. return self.parser.phases["inTable"].processCharacters(token)
  1695. def startTagTr(self, token):
  1696. self.clearStackToTableBodyContext()
  1697. self.tree.insertElement(token)
  1698. self.parser.phase = self.parser.phases["inRow"]
  1699. def startTagTableCell(self, token):
  1700. self.parser.parseError("unexpected-cell-in-table-body",
  1701. {"name": token["name"]})
  1702. self.startTagTr(impliedTagToken("tr", "StartTag"))
  1703. return token
  1704. def startTagTableOther(self, token):
  1705. # XXX AT Any ideas on how to share this with endTagTable?
  1706. if (self.tree.elementInScope("tbody", variant="table") or
  1707. self.tree.elementInScope("thead", variant="table") or
  1708. self.tree.elementInScope("tfoot", variant="table")):
  1709. self.clearStackToTableBodyContext()
  1710. self.endTagTableRowGroup(
  1711. impliedTagToken(self.tree.openElements[-1].name))
  1712. return token
  1713. else:
  1714. # innerHTML case
  1715. assert self.parser.innerHTML
  1716. self.parser.parseError()
  1717. def startTagOther(self, token):
  1718. return self.parser.phases["inTable"].processStartTag(token)
  1719. def endTagTableRowGroup(self, token):
  1720. if self.tree.elementInScope(token["name"], variant="table"):
  1721. self.clearStackToTableBodyContext()
  1722. self.tree.openElements.pop()
  1723. self.parser.phase = self.parser.phases["inTable"]
  1724. else:
  1725. self.parser.parseError("unexpected-end-tag-in-table-body",
  1726. {"name": token["name"]})
  1727. def endTagTable(self, token):
  1728. if (self.tree.elementInScope("tbody", variant="table") or
  1729. self.tree.elementInScope("thead", variant="table") or
  1730. self.tree.elementInScope("tfoot", variant="table")):
  1731. self.clearStackToTableBodyContext()
  1732. self.endTagTableRowGroup(
  1733. impliedTagToken(self.tree.openElements[-1].name))
  1734. return token
  1735. else:
  1736. # innerHTML case
  1737. assert self.parser.innerHTML
  1738. self.parser.parseError()
  1739. def endTagIgnore(self, token):
  1740. self.parser.parseError("unexpected-end-tag-in-table-body",
  1741. {"name": token["name"]})
  1742. def endTagOther(self, token):
  1743. return self.parser.phases["inTable"].processEndTag(token)
  1744. class InRowPhase(Phase):
  1745. # http://www.whatwg.org/specs/web-apps/current-work/#in-row
  1746. def __init__(self, parser, tree):
  1747. Phase.__init__(self, parser, tree)
  1748. self.startTagHandler = utils.MethodDispatcher([
  1749. ("html", self.startTagHtml),
  1750. (("td", "th"), self.startTagTableCell),
  1751. (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
  1752. "tr"), self.startTagTableOther)
  1753. ])
  1754. self.startTagHandler.default = self.startTagOther
  1755. self.endTagHandler = utils.MethodDispatcher([
  1756. ("tr", self.endTagTr),
  1757. ("table", self.endTagTable),
  1758. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1759. (("body", "caption", "col", "colgroup", "html", "td", "th"),
  1760. self.endTagIgnore)
  1761. ])
  1762. self.endTagHandler.default = self.endTagOther
  1763. # helper methods (XXX unify this with other table helper methods)
  1764. def clearStackToTableRowContext(self):
  1765. while self.tree.openElements[-1].name not in ("tr", "html"):
  1766. self.parser.parseError("unexpected-implied-end-tag-in-table-row",
  1767. {"name": self.tree.openElements[-1].name})
  1768. self.tree.openElements.pop()
  1769. def ignoreEndTagTr(self):
  1770. return not self.tree.elementInScope("tr", variant="table")
  1771. # the rest
  1772. def processEOF(self):
  1773. self.parser.phases["inTable"].processEOF()
  1774. def processSpaceCharacters(self, token):
  1775. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1776. def processCharacters(self, token):
  1777. return self.parser.phases["inTable"].processCharacters(token)
  1778. def startTagTableCell(self, token):
  1779. self.clearStackToTableRowContext()
  1780. self.tree.insertElement(token)
  1781. self.parser.phase = self.parser.phases["inCell"]
  1782. self.tree.activeFormattingElements.append(Marker)
  1783. def startTagTableOther(self, token):
  1784. ignoreEndTag = self.ignoreEndTagTr()
  1785. self.endTagTr(impliedTagToken("tr"))
  1786. # XXX how are we sure it's always ignored in the innerHTML case?
  1787. if not ignoreEndTag:
  1788. return token
  1789. def startTagOther(self, token):
  1790. return self.parser.phases["inTable"].processStartTag(token)
  1791. def endTagTr(self, token):
  1792. if not self.ignoreEndTagTr():
  1793. self.clearStackToTableRowContext()
  1794. self.tree.openElements.pop()
  1795. self.parser.phase = self.parser.phases["inTableBody"]
  1796. else:
  1797. # innerHTML case
  1798. assert self.parser.innerHTML
  1799. self.parser.parseError()
  1800. def endTagTable(self, token):
  1801. ignoreEndTag = self.ignoreEndTagTr()
  1802. self.endTagTr(impliedTagToken("tr"))
  1803. # Reprocess the current tag if the tr end tag was not ignored
  1804. # XXX how are we sure it's always ignored in the innerHTML case?
  1805. if not ignoreEndTag:
  1806. return token
  1807. def endTagTableRowGroup(self, token):
  1808. if self.tree.elementInScope(token["name"], variant="table"):
  1809. self.endTagTr(impliedTagToken("tr"))
  1810. return token
  1811. else:
  1812. self.parser.parseError()
  1813. def endTagIgnore(self, token):
  1814. self.parser.parseError("unexpected-end-tag-in-table-row",
  1815. {"name": token["name"]})
  1816. def endTagOther(self, token):
  1817. return self.parser.phases["inTable"].processEndTag(token)
  1818. class InCellPhase(Phase):
  1819. # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
  1820. def __init__(self, parser, tree):
  1821. Phase.__init__(self, parser, tree)
  1822. self.startTagHandler = utils.MethodDispatcher([
  1823. ("html", self.startTagHtml),
  1824. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1825. "thead", "tr"), self.startTagTableOther)
  1826. ])
  1827. self.startTagHandler.default = self.startTagOther
  1828. self.endTagHandler = utils.MethodDispatcher([
  1829. (("td", "th"), self.endTagTableCell),
  1830. (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
  1831. (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
  1832. ])
  1833. self.endTagHandler.default = self.endTagOther
  1834. # helper
  1835. def closeCell(self):
  1836. if self.tree.elementInScope("td", variant="table"):
  1837. self.endTagTableCell(impliedTagToken("td"))
  1838. elif self.tree.elementInScope("th", variant="table"):
  1839. self.endTagTableCell(impliedTagToken("th"))
  1840. # the rest
  1841. def processEOF(self):
  1842. self.parser.phases["inBody"].processEOF()
  1843. def processCharacters(self, token):
  1844. return self.parser.phases["inBody"].processCharacters(token)
  1845. def startTagTableOther(self, token):
  1846. if (self.tree.elementInScope("td", variant="table") or
  1847. self.tree.elementInScope("th", variant="table")):
  1848. self.closeCell()
  1849. return token
  1850. else:
  1851. # innerHTML case
  1852. assert self.parser.innerHTML
  1853. self.parser.parseError()
  1854. def startTagOther(self, token):
  1855. return self.parser.phases["inBody"].processStartTag(token)
  1856. def endTagTableCell(self, token):
  1857. if self.tree.elementInScope(token["name"], variant="table"):
  1858. self.tree.generateImpliedEndTags(token["name"])
  1859. if self.tree.openElements[-1].name != token["name"]:
  1860. self.parser.parseError("unexpected-cell-end-tag",
  1861. {"name": token["name"]})
  1862. while True:
  1863. node = self.tree.openElements.pop()
  1864. if node.name == token["name"]:
  1865. break
  1866. else:
  1867. self.tree.openElements.pop()
  1868. self.tree.clearActiveFormattingElements()
  1869. self.parser.phase = self.parser.phases["inRow"]
  1870. else:
  1871. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1872. def endTagIgnore(self, token):
  1873. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1874. def endTagImply(self, token):
  1875. if self.tree.elementInScope(token["name"], variant="table"):
  1876. self.closeCell()
  1877. return token
  1878. else:
  1879. # sometimes innerHTML case
  1880. self.parser.parseError()
  1881. def endTagOther(self, token):
  1882. return self.parser.phases["inBody"].processEndTag(token)
  1883. class InSelectPhase(Phase):
  1884. def __init__(self, parser, tree):
  1885. Phase.__init__(self, parser, tree)
  1886. self.startTagHandler = utils.MethodDispatcher([
  1887. ("html", self.startTagHtml),
  1888. ("option", self.startTagOption),
  1889. ("optgroup", self.startTagOptgroup),
  1890. ("select", self.startTagSelect),
  1891. (("input", "keygen", "textarea"), self.startTagInput),
  1892. ("script", self.startTagScript)
  1893. ])
  1894. self.startTagHandler.default = self.startTagOther
  1895. self.endTagHandler = utils.MethodDispatcher([
  1896. ("option", self.endTagOption),
  1897. ("optgroup", self.endTagOptgroup),
  1898. ("select", self.endTagSelect)
  1899. ])
  1900. self.endTagHandler.default = self.endTagOther
  1901. # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  1902. def processEOF(self):
  1903. if self.tree.openElements[-1].name != "html":
  1904. self.parser.parseError("eof-in-select")
  1905. else:
  1906. assert self.parser.innerHTML
  1907. def processCharacters(self, token):
  1908. if token["data"] == "\u0000":
  1909. return
  1910. self.tree.insertText(token["data"])
  1911. def startTagOption(self, token):
  1912. # We need to imply </option> if <option> is the current node.
  1913. if self.tree.openElements[-1].name == "option":
  1914. self.tree.openElements.pop()
  1915. self.tree.insertElement(token)
  1916. def startTagOptgroup(self, token):
  1917. if self.tree.openElements[-1].name == "option":
  1918. self.tree.openElements.pop()
  1919. if self.tree.openElements[-1].name == "optgroup":
  1920. self.tree.openElements.pop()
  1921. self.tree.insertElement(token)
  1922. def startTagSelect(self, token):
  1923. self.parser.parseError("unexpected-select-in-select")
  1924. self.endTagSelect(impliedTagToken("select"))
  1925. def startTagInput(self, token):
  1926. self.parser.parseError("unexpected-input-in-select")
  1927. if self.tree.elementInScope("select", variant="select"):
  1928. self.endTagSelect(impliedTagToken("select"))
  1929. return token
  1930. else:
  1931. assert self.parser.innerHTML
  1932. def startTagScript(self, token):
  1933. return self.parser.phases["inHead"].processStartTag(token)
  1934. def startTagOther(self, token):
  1935. self.parser.parseError("unexpected-start-tag-in-select",
  1936. {"name": token["name"]})
  1937. def endTagOption(self, token):
  1938. if self.tree.openElements[-1].name == "option":
  1939. self.tree.openElements.pop()
  1940. else:
  1941. self.parser.parseError("unexpected-end-tag-in-select",
  1942. {"name": "option"})
  1943. def endTagOptgroup(self, token):
  1944. # </optgroup> implicitly closes <option>
  1945. if (self.tree.openElements[-1].name == "option" and
  1946. self.tree.openElements[-2].name == "optgroup"):
  1947. self.tree.openElements.pop()
  1948. # It also closes </optgroup>
  1949. if self.tree.openElements[-1].name == "optgroup":
  1950. self.tree.openElements.pop()
  1951. # But nothing else
  1952. else:
  1953. self.parser.parseError("unexpected-end-tag-in-select",
  1954. {"name": "optgroup"})
  def endTagSelect(self, token):
      """Handle a </select> end tag.

      With a select in select scope, open elements are popped up to and
      including the nearest "select" and the parser's insertion mode is
      reset.  Otherwise (innerHTML case) a parse error without an
      explicit error code is reported.
      """
      if not self.tree.elementInScope("select", variant="select"):
          # innerHTML case
          assert self.parser.innerHTML
          self.parser.parseError()
          return

      popCurrentNode = self.tree.openElements.pop
      # Discard everything above the select, then the select itself
      while popCurrentNode().name != "select":
          pass
      self.parser.resetInsertionMode()
  def endTagOther(self, token):
      """Report any other end tag as a parse error and ignore it.

      The error code is "unexpected-end-tag-in-select" and carries the
      tag name; the stack of open elements is not touched.
      """
      errorData = {"name": token["name"]}
      self.parser.parseError("unexpected-end-tag-in-select", errorData)
  class InSelectInTablePhase(Phase):
      """Wrapper around the "inSelect" phase with special table-tag handling.

      Start and end tags for table-structure elements are parse errors
      that may force an implied </select>; every other token is
      forwarded unchanged to the "inSelect" phase.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # The same table-related names are special for both tag kinds.
          tableTagNames = ("caption", "table", "tbody", "tfoot", "thead",
                           "tr", "td", "th")

          self.startTagHandler = utils.MethodDispatcher(
              [(tableTagNames, self.startTagTable)])
          self.startTagHandler.default = self.startTagOther

          self.endTagHandler = utils.MethodDispatcher(
              [(tableTagNames, self.endTagTable)])
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Forward end-of-file handling to the "inSelect" phase.

          The delegate's return value is not propagated.
          """
          inSelect = self.parser.phases["inSelect"]
          inSelect.processEOF()

      def processCharacters(self, token):
          """Forward a character token to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processCharacters(token)

      def startTagTable(self, token):
          """Handle a table-structure start tag.

          Reports a parse error, processes an implied </select> end tag
          and returns ``token`` to the caller.
          """
          errorData = {"name": token["name"]}
          self.parser.parseError(
              "unexpected-table-element-start-tag-in-select-in-table",
              errorData)
          self.endTagOther(impliedTagToken("select"))
          return token

      def startTagOther(self, token):
          """Forward any other start tag to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processStartTag(token)

      def endTagTable(self, token):
          """Handle a table-structure end tag.

          Always a parse error.  Only when the named element is in table
          scope is an implied </select> processed and ``token`` returned;
          otherwise the tag is dropped and None is returned.
          """
          tagName = token["name"]
          self.parser.parseError(
              "unexpected-table-element-end-tag-in-select-in-table",
              {"name": tagName})
          if not self.tree.elementInScope(tagName, variant="table"):
              return None
          self.endTagOther(impliedTagToken("select"))
          return token

      def endTagOther(self, token):
          """Forward any other end tag to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processEndTag(token)
  class InForeignContentPhase(Phase):
      """Phase for tokens handled while the current node is a foreign element.

      Start tags are either "breakout" HTML tags, which pop the foreign
      elements off the stack and hand the token back to the caller, or
      they are inserted in the namespace of the current node after the
      MathML/SVG name and attribute adjustments.
      """

      # HTML tag names that force a break out of foreign content.
      breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                    "center", "code", "dd", "div", "dl", "dt",
                                    "em", "embed", "h1", "h2", "h3",
                                    "h4", "h5", "h6", "head", "hr", "i", "img",
                                    "li", "listing", "menu", "meta", "nobr",
                                    "ol", "p", "pre", "ruby", "s", "small",
                                    "span", "strong", "strike", "sub", "sup",
                                    "table", "tt", "u", "ul", "var"])

      # A <font> start tag only breaks out when it carries one of these
      # attributes.
      fontBreakoutAttributes = frozenset(["color", "face", "size"])

      # Lower-cased SVG tag name -> canonical mixed-case name.  Kept at
      # class level so the table is built once rather than on every call
      # to adjustSVGTagNames().
      svgTagNameReplacements = {
          "altglyph": "altGlyph",
          "altglyphdef": "altGlyphDef",
          "altglyphitem": "altGlyphItem",
          "animatecolor": "animateColor",
          "animatemotion": "animateMotion",
          "animatetransform": "animateTransform",
          "clippath": "clipPath",
          "feblend": "feBlend",
          "fecolormatrix": "feColorMatrix",
          "fecomponenttransfer": "feComponentTransfer",
          "fecomposite": "feComposite",
          "feconvolvematrix": "feConvolveMatrix",
          "fediffuselighting": "feDiffuseLighting",
          "fedisplacementmap": "feDisplacementMap",
          "fedistantlight": "feDistantLight",
          "feflood": "feFlood",
          "fefunca": "feFuncA",
          "fefuncb": "feFuncB",
          "fefuncg": "feFuncG",
          "fefuncr": "feFuncR",
          "fegaussianblur": "feGaussianBlur",
          "feimage": "feImage",
          "femerge": "feMerge",
          "femergenode": "feMergeNode",
          "femorphology": "feMorphology",
          "feoffset": "feOffset",
          "fepointlight": "fePointLight",
          "fespecularlighting": "feSpecularLighting",
          "fespotlight": "feSpotLight",
          "fetile": "feTile",
          "feturbulence": "feTurbulence",
          "foreignobject": "foreignObject",
          "glyphref": "glyphRef",
          "lineargradient": "linearGradient",
          "radialgradient": "radialGradient",
          "textpath": "textPath",
      }

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

      def adjustSVGTagNames(self, token):
          """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

          Names without an entry in ``svgTagNameReplacements`` are left
          untouched.
          """
          canonical = self.svgTagNameReplacements.get(token["name"])
          if canonical is not None:
              token["name"] = canonical

      def processCharacters(self, token):
          """Process a character token.

          Data consisting of exactly U+0000 is replaced by U+FFFD.
          Otherwise, data containing any non-space character clears the
          parser's ``framesetOK`` flag.  The token is then passed to the
          base ``Phase.processCharacters``.
          """
          if token["data"] == "\u0000":
              token["data"] = "\uFFFD"
          elif (self.parser.framesetOK and
                any(char not in spaceCharacters for char in token["data"])):
              self.parser.framesetOK = False
          Phase.processCharacters(self, token)

      def processStartTag(self, token):
          """Process a start tag.

          Returns ``token`` when it is a breakout tag (after reporting a
          parse error and popping foreign elements), so the caller can
          handle it again.  Returns None once the element has been
          inserted as foreign content.
          """
          openElements = self.tree.openElements
          currentNode = openElements[-1]
          name = token["name"]

          if (name in self.breakoutElements or
                  (name == "font" and
                   not self.fontBreakoutAttributes.isdisjoint(token["data"]))):
              self.parser.parseError("unexpected-html-element-in-foreign-content",
                                     {"name": name})
              # Pop until the current node is in the default namespace or
              # is an HTML / MathML text integration point.
              while (openElements[-1].namespace != self.tree.defaultNamespace and
                     not self.parser.isHTMLIntegrationPoint(openElements[-1]) and
                     not self.parser.isMathMLTextIntegrationPoint(openElements[-1])):
                  openElements.pop()
              return token

          if currentNode.namespace == namespaces["mathml"]:
              self.parser.adjustMathMLAttributes(token)
          elif currentNode.namespace == namespaces["svg"]:
              self.adjustSVGTagNames(token)
              self.parser.adjustSVGAttributes(token)
          self.parser.adjustForeignAttributes(token)
          # The new element inherits the namespace of the current node.
          token["namespace"] = currentNode.namespace
          self.tree.insertElement(token)
          if token["selfClosing"]:
              self.tree.openElements.pop()
              token["selfClosingAcknowledged"] = True

      def processEndTag(self, token):
          """Process an end tag.

          Walks down the stack of open elements from the current node.
          If a node whose lower-cased name equals the tag name is found,
          everything up to and including it is popped and None is
          returned.  If a node in the default namespace is reached first,
          the token is handed to the parser's current phase and that
          result is returned.
          """
          nodeIndex = len(self.tree.openElements) - 1
          node = self.tree.openElements[-1]
          if node.name != token["name"]:
              self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

          while True:
              if node.name.translate(asciiUpper2Lower) == token["name"]:
                  # XXX this isn't in the spec but it seems necessary
                  if self.parser.phase == self.parser.phases["inTableText"]:
                      self.parser.phase.flushCharacters()
                      self.parser.phase = self.parser.phase.originalPhase
                  while self.tree.openElements.pop() != node:
                      assert self.tree.openElements
                  return None

              nodeIndex -= 1
              node = self.tree.openElements[nodeIndex]
              if node.namespace == self.tree.defaultNamespace:
                  return self.parser.phase.processEndTag(token)
  class AfterBodyPhase(Phase):
      """Phase whose handlers report "...-after-body" parse errors.

      Apart from comments and the html start/end tags, every token is a
      parse error that switches the parser back to the "inBody" phase
      and returns the token to the caller.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTags = [("html", self.startTagHtml)]
          self.startTagHandler = utils.MethodDispatcher(startTags)
          self.startTagHandler.default = self.startTagOther

          endTags = [("html", self.endTagHtml)]
          self.endTagHandler = utils.MethodDispatcher(endTags)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Do nothing: end of file simply stops parsing."""
          # Stop parsing
          pass

      def processComment(self, token):
          """Insert the comment as a child of the first open element."""
          # The data is to be appended to the <html> element here, not to
          # whatever element is currently open.
          htmlElement = self.tree.openElements[0]
          self.tree.insertComment(token, htmlElement)

      def processCharacters(self, token):
          """Report the characters as an error and switch to "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-char-after-body")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate an <html> start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag as an error and switch to "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-start-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def endTagHtml(self, name):
          """Handle </html>.

          In innerHTML mode this is a parse error; otherwise the parser
          moves on to the "afterAfterBody" phase.  Despite its name, the
          ``name`` argument receives whatever the end-tag dispatcher
          passes in; it is not used here.
          """
          parser = self.parser
          if not parser.innerHTML:
              parser.phase = parser.phases["afterAfterBody"]
              return
          parser.parseError("unexpected-end-tag-after-body-innerhtml")

      def endTagOther(self, token):
          """Report the end tag as an error and switch to "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-end-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class InFramesetPhase(Phase):
      """Phase whose handlers report "...-in-frameset" parse errors."""
      # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTags = [
              ("html", self.startTagHtml),
              ("frameset", self.startTagFrameset),
              ("frame", self.startTagFrame),
              ("noframes", self.startTagNoframes),
          ]
          self.startTagHandler = utils.MethodDispatcher(startTags)
          self.startTagHandler.default = self.startTagOther

          endTags = [("frameset", self.endTagFrameset)]
          self.endTagHandler = utils.MethodDispatcher(endTags)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Report "eof-in-frameset" unless the current node is html."""
          currentNode = self.tree.openElements[-1]
          if currentNode.name == "html":
              # Only expected when parsing a fragment
              assert self.parser.innerHTML
          else:
              self.parser.parseError("eof-in-frameset")

      def processCharacters(self, token):
          """Report the characters as a parse error and drop them."""
          self.parser.parseError("unexpected-char-in-frameset")

      def startTagFrameset(self, token):
          """Insert a (nested) frameset element."""
          self.tree.insertElement(token)

      def startTagFrame(self, token):
          """Insert a frame element and immediately pop it again."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()

      def startTagNoframes(self, token):
          """Delegate a <noframes> start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-in-frameset", errorData)

      def endTagFrameset(self, token):
          """Handle </frameset>.

          Pops the current node unless it is the html element, which is
          a parse error (innerHTML case).  Afterwards, outside innerHTML
          mode, the parser moves to "afterFrameset" once the current
          node is no longer a frameset.
          """
          parser = self.parser
          stack = self.tree.openElements
          if stack[-1].name != "html":
              stack.pop()
          else:
              # innerHTML case
              parser.parseError("unexpected-frameset-in-frameset-innerhtml")
          # This check runs on both paths above.  If we're not in
          # innerHTML mode and the current node is not a "frameset"
          # element (anymore) then switch.
          if not parser.innerHTML and stack[-1].name != "frameset":
              parser.phase = parser.phases["afterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-in-frameset", errorData)
  class AfterFramesetPhase(Phase):
      """Phase whose handlers report "...-after-frameset" parse errors."""
      # http://www.whatwg.org/specs/web-apps/current-work/#after3

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          # startTagHtml is not defined in this class; it is looked up on
          # the instance like the other handlers.
          startTags = [
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoframes),
          ]
          self.startTagHandler = utils.MethodDispatcher(startTags)
          self.startTagHandler.default = self.startTagOther

          endTags = [("html", self.endTagHtml)]
          self.endTagHandler = utils.MethodDispatcher(endTags)
          self.endTagHandler.default = self.endTagOther

      def processEOF(self):
          """Do nothing: end of file simply stops parsing."""
          # Stop parsing
          pass

      def processCharacters(self, token):
          """Report the characters as a parse error and drop them."""
          self.parser.parseError("unexpected-char-after-frameset")

      def startTagNoframes(self, token):
          """Delegate a <noframes> start tag to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-after-frameset",
                                 errorData)

      def endTagHtml(self, token):
          """Switch the parser to the "afterAfterFrameset" phase."""
          parser = self.parser
          parser.phase = parser.phases["afterAfterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-after-frameset",
                                 errorData)
  class AfterAfterBodyPhase(Phase):
      """Phase whose handlers report "expected-eof-but-got-..." errors.

      Comments go onto the document, space characters and <html> start
      tags are delegated to "inBody", and anything else is a parse error
      that switches back to the "inBody" phase and returns the token.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTags = [("html", self.startTagHtml)]
          self.startTagHandler = utils.MethodDispatcher(startTags)
          self.startTagHandler.default = self.startTagOther

      def processEOF(self):
          """Do nothing at end of file."""
          pass

      def processComment(self, token):
          """Insert the comment as a child of the document itself."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Delegate space characters to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report the characters as an error and switch to "inBody"."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-char")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate an <html> start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagOther(self, token):
          """Report the start tag as an error and switch to "inBody"."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-start-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def processEndTag(self, token):
          """Report the end tag as an error and switch to "inBody".

          Defined directly instead of going through an end-tag
          dispatcher, so it applies to every end tag.
          """
          parser = self.parser
          parser.parseError("expected-eof-but-got-end-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token
  class AfterAfterFramesetPhase(Phase):
      """Phase whose handlers report "expected-eof-but-got-..." errors.

      Unlike the handlers that switch back to "inBody", the error
      handlers here never change the parser phase and never return the
      token: unexpected input is reported and dropped.
      """

      def __init__(self, parser, tree):
          Phase.__init__(self, parser, tree)

          startTags = [
              ("html", self.startTagHtml),
              ("noframes", self.startTagNoFrames),
          ]
          self.startTagHandler = utils.MethodDispatcher(startTags)
          self.startTagHandler.default = self.startTagOther

      def processEOF(self):
          """Do nothing at end of file."""
          pass

      def processComment(self, token):
          """Insert the comment as a child of the document itself."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Delegate space characters to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report the characters as a parse error and drop them."""
          self.parser.parseError("expected-eof-but-got-char")

      def startTagHtml(self, token):
          """Delegate an <html> start tag to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagNoFrames(self, token):
          """Delegate a <noframes> start tag to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-start-tag", errorData)

      def processEndTag(self, token):
          """Report every end tag as a parse error and drop it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-end-tag", errorData)
  # Registry returned to the caller: phase key -> phase class.
  # The keys are the names the phase handlers use when they look up
  # another phase through self.parser.phases[...] (for example
  # "inBody", "inHead" or "inSelect").
  # NOTE(review): handlers call methods on parser.phases[key], so these
  # classes are presumably instantiated by the caller -- confirm there.
  return {
      "initial": InitialPhase,
      "beforeHtml": BeforeHtmlPhase,
      "beforeHead": BeforeHeadPhase,
      "inHead": InHeadPhase,
      # XXX "inHeadNoscript": InHeadNoScriptPhase,
      # (left commented out, so no "inHeadNoscript" key is registered)
      "afterHead": AfterHeadPhase,
      "inBody": InBodyPhase,
      "text": TextPhase,
      "inTable": InTablePhase,
      "inTableText": InTableTextPhase,
      "inCaption": InCaptionPhase,
      "inColumnGroup": InColumnGroupPhase,
      "inTableBody": InTableBodyPhase,
      "inRow": InRowPhase,
      "inCell": InCellPhase,
      "inSelect": InSelectPhase,
      "inSelectInTable": InSelectInTablePhase,
      "inForeignContent": InForeignContentPhase,
      "afterBody": AfterBodyPhase,
      "inFrameset": InFramesetPhase,
      "afterFrameset": AfterFramesetPhase,
      "afterAfterBody": AfterAfterBodyPhase,
      "afterAfterFrameset": AfterAfterFramesetPhase,
      # XXX after after frameset
      # NOTE(review): the XXX marker above looks stale, since
      # "afterAfterFrameset" is registered on the preceding line.
  }
  def impliedTagToken(name, type="EndTag", attributes=None,
                      selfClosing=False):
      """Build a tag token dict for a tag that is implied rather than read.

      ``type`` is a key into ``tokenTypes`` (an end tag by default).
      ``attributes`` becomes the token's "data"; a fresh empty dict is
      used when it is None, so callers never share a default.
      """
      data = {} if attributes is None else attributes
      return {
          "type": tokenTypes[type],
          "name": name,
          "data": data,
          "selfClosing": selfClosing,
      }
  class ParseError(Exception):
      """Error in parsed document.

      A plain ``Exception`` subclass with no behaviour of its own.
      """