123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731 |
- from __future__ import absolute_import, division, unicode_literals
- try:
- chr = unichr # flake8: noqa
- except NameError:
- pass
- from collections import deque
- from .constants import spaceCharacters
- from .constants import entities
- from .constants import asciiLetters, asciiUpper2Lower
- from .constants import digits, hexDigits, EOF
- from .constants import tokenTypes, tagTokenTypes
- from .constants import replacementCharacters
- from .inputstream import HTMLInputStream
- from .trie import Trie
- entitiesTrie = Trie(entities)
- class HTMLTokenizer(object):
- """ This class takes care of tokenizing HTML.
- * self.currentToken
- Holds the token that is currently being processed.
- * self.state
- Holds a reference to the method to be invoked... XXX
- * self.stream
- Points to HTMLInputStream object.
- """
- def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
- lowercaseElementName=True, lowercaseAttrName=True, parser=None):
- self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
- self.parser = parser
- # Perform case conversions?
- self.lowercaseElementName = lowercaseElementName
- self.lowercaseAttrName = lowercaseAttrName
- # Setup the initial tokenizer state
- self.escapeFlag = False
- self.lastFourChars = []
- self.state = self.dataState
- self.escape = False
- # The current token being created
- self.currentToken = None
- super(HTMLTokenizer, self).__init__()
- def __iter__(self):
- """ This is where the magic happens.
- We do our usually processing through the states and when we have a token
- to return we yield the token which pauses processing until the next token
- is requested.
- """
- self.tokenQueue = deque([])
- # Start processing. When EOF is reached self.state will return False
- # instead of True and the loop will terminate.
- while self.state():
- while self.stream.errors:
- yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
- while self.tokenQueue:
- yield self.tokenQueue.popleft()
- def consumeNumberEntity(self, isHex):
- """This function returns either U+FFFD or the character based on the
- decimal or hexadecimal representation. It also discards ";" if present.
- If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
- """
- allowed = digits
- radix = 10
- if isHex:
- allowed = hexDigits
- radix = 16
- charStack = []
- # Consume all the characters that are in range while making sure we
- # don't hit an EOF.
- c = self.stream.char()
- while c in allowed and c is not EOF:
- charStack.append(c)
- c = self.stream.char()
- # Convert the set of characters consumed to an int.
- charAsInt = int("".join(charStack), radix)
- # Certain characters get replaced with others
- if charAsInt in replacementCharacters:
- char = replacementCharacters[charAsInt]
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "illegal-codepoint-for-numeric-entity",
- "datavars": {"charAsInt": charAsInt}})
- elif ((0xD800 <= charAsInt <= 0xDFFF) or
- (charAsInt > 0x10FFFF)):
- char = "\uFFFD"
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "illegal-codepoint-for-numeric-entity",
- "datavars": {"charAsInt": charAsInt}})
- else:
- # Should speed up this check somehow (e.g. move the set to a constant)
- if ((0x0001 <= charAsInt <= 0x0008) or
- (0x000E <= charAsInt <= 0x001F) or
- (0x007F <= charAsInt <= 0x009F) or
- (0xFDD0 <= charAsInt <= 0xFDEF) or
- charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
- 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
- 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
- 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
- 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
- 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
- 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
- 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
- 0xFFFFF, 0x10FFFE, 0x10FFFF])):
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data":
- "illegal-codepoint-for-numeric-entity",
- "datavars": {"charAsInt": charAsInt}})
- try:
- # Try/except needed as UCS-2 Python builds' unichar only works
- # within the BMP.
- char = chr(charAsInt)
- except ValueError:
- v = charAsInt - 0x10000
- char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
- # Discard the ; if present. Otherwise, put it back on the queue and
- # invoke parseError on parser.
- if c != ";":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "numeric-entity-without-semicolon"})
- self.stream.unget(c)
- return char
- def consumeEntity(self, allowedChar=None, fromAttribute=False):
- # Initialise to the default output for when no entity is matched
- output = "&"
- charStack = [self.stream.char()]
- if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
- or (allowedChar is not None and allowedChar == charStack[0])):
- self.stream.unget(charStack[0])
- elif charStack[0] == "#":
- # Read the next character to see if it's hex or decimal
- hex = False
- charStack.append(self.stream.char())
- if charStack[-1] in ("x", "X"):
- hex = True
- charStack.append(self.stream.char())
- # charStack[-1] should be the first digit
- if (hex and charStack[-1] in hexDigits) \
- or (not hex and charStack[-1] in digits):
- # At least one digit found, so consume the whole number
- self.stream.unget(charStack[-1])
- output = self.consumeNumberEntity(hex)
- else:
- # No digits found
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "expected-numeric-entity"})
- self.stream.unget(charStack.pop())
- output = "&" + "".join(charStack)
- else:
- # At this point in the process might have named entity. Entities
- # are stored in the global variable "entities".
- #
- # Consume characters and compare to these to a substring of the
- # entity names in the list until the substring no longer matches.
- while (charStack[-1] is not EOF):
- if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
- break
- charStack.append(self.stream.char())
- # At this point we have a string that starts with some characters
- # that may match an entity
- # Try to find the longest entity the string will match to take care
- # of ¬i for instance.
- try:
- entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
- entityLength = len(entityName)
- except KeyError:
- entityName = None
- if entityName is not None:
- if entityName[-1] != ";":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "named-entity-without-semicolon"})
- if (entityName[-1] != ";" and fromAttribute and
- (charStack[entityLength] in asciiLetters or
- charStack[entityLength] in digits or
- charStack[entityLength] == "=")):
- self.stream.unget(charStack.pop())
- output = "&" + "".join(charStack)
- else:
- output = entities[entityName]
- self.stream.unget(charStack.pop())
- output += "".join(charStack[entityLength:])
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-named-entity"})
- self.stream.unget(charStack.pop())
- output = "&" + "".join(charStack)
- if fromAttribute:
- self.currentToken["data"][-1][1] += output
- else:
- if output in spaceCharacters:
- tokenType = "SpaceCharacters"
- else:
- tokenType = "Characters"
- self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
- def processEntityInAttribute(self, allowedChar):
- """This method replaces the need for "entityInAttributeValueState".
- """
- self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
- def emitCurrentToken(self):
- """This method is a generic handler for emitting the tags. It also sets
- the state to "data" because that's what's needed after a token has been
- emitted.
- """
- token = self.currentToken
- # Add token to the queue to be yielded
- if (token["type"] in tagTokenTypes):
- if self.lowercaseElementName:
- token["name"] = token["name"].translate(asciiUpper2Lower)
- if token["type"] == tokenTypes["EndTag"]:
- if token["data"]:
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "attributes-in-end-tag"})
- if token["selfClosing"]:
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "self-closing-flag-on-end-tag"})
- self.tokenQueue.append(token)
- self.state = self.dataState
- # Below are the various tokenizer states worked out.
- def dataState(self):
- data = self.stream.char()
- if data == "&":
- self.state = self.entityDataState
- elif data == "<":
- self.state = self.tagOpenState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\u0000"})
- elif data is EOF:
- # Tokenization ends.
- return False
- elif data in spaceCharacters:
- # Directly after emitting a token you switch back to the "data
- # state". At that point spaceCharacters are important so they are
- # emitted separately.
- self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
- data + self.stream.charsUntil(spaceCharacters, True)})
- # No need to update lastFourChars here, since the first space will
- # have already been appended to lastFourChars and will have broken
- # any <!-- or --> sequences
- else:
- chars = self.stream.charsUntil(("&", "<", "\u0000"))
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + chars})
- return True
- def entityDataState(self):
- self.consumeEntity()
- self.state = self.dataState
- return True
- def rcdataState(self):
- data = self.stream.char()
- if data == "&":
- self.state = self.characterReferenceInRcdata
- elif data == "<":
- self.state = self.rcdataLessThanSignState
- elif data == EOF:
- # Tokenization ends.
- return False
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- elif data in spaceCharacters:
- # Directly after emitting a token you switch back to the "data
- # state". At that point spaceCharacters are important so they are
- # emitted separately.
- self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
- data + self.stream.charsUntil(spaceCharacters, True)})
- # No need to update lastFourChars here, since the first space will
- # have already been appended to lastFourChars and will have broken
- # any <!-- or --> sequences
- else:
- chars = self.stream.charsUntil(("&", "<", "\u0000"))
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + chars})
- return True
- def characterReferenceInRcdata(self):
- self.consumeEntity()
- self.state = self.rcdataState
- return True
- def rawtextState(self):
- data = self.stream.char()
- if data == "<":
- self.state = self.rawtextLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- elif data == EOF:
- # Tokenization ends.
- return False
- else:
- chars = self.stream.charsUntil(("<", "\u0000"))
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + chars})
- return True
- def scriptDataState(self):
- data = self.stream.char()
- if data == "<":
- self.state = self.scriptDataLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- elif data == EOF:
- # Tokenization ends.
- return False
- else:
- chars = self.stream.charsUntil(("<", "\u0000"))
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + chars})
- return True
- def plaintextState(self):
- data = self.stream.char()
- if data == EOF:
- # Tokenization ends.
- return False
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + self.stream.charsUntil("\u0000")})
- return True
- def tagOpenState(self):
- data = self.stream.char()
- if data == "!":
- self.state = self.markupDeclarationOpenState
- elif data == "/":
- self.state = self.closeTagOpenState
- elif data in asciiLetters:
- self.currentToken = {"type": tokenTypes["StartTag"],
- "name": data, "data": [],
- "selfClosing": False,
- "selfClosingAcknowledged": False}
- self.state = self.tagNameState
- elif data == ">":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-right-bracket"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
- self.state = self.dataState
- elif data == "?":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-question-mark"})
- self.stream.unget(data)
- self.state = self.bogusCommentState
- else:
- # XXX
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.stream.unget(data)
- self.state = self.dataState
- return True
- def closeTagOpenState(self):
- data = self.stream.char()
- if data in asciiLetters:
- self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
- "data": [], "selfClosing": False}
- self.state = self.tagNameState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-closing-tag-but-got-right-bracket"})
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-closing-tag-but-got-eof"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
- self.state = self.dataState
- else:
- # XXX data can be _'_...
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-closing-tag-but-got-char",
- "datavars": {"data": data}})
- self.stream.unget(data)
- self.state = self.bogusCommentState
- return True
- def tagNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeAttributeNameState
- elif data == ">":
- self.emitCurrentToken()
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-tag-name"})
- self.state = self.dataState
- elif data == "/":
- self.state = self.selfClosingStartTagState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["name"] += "\uFFFD"
- else:
- self.currentToken["name"] += data
- # (Don't use charsUntil here, because tag names are
- # very short and it's faster to not do anything fancy)
- return True
- def rcdataLessThanSignState(self):
- data = self.stream.char()
- if data == "/":
- self.temporaryBuffer = ""
- self.state = self.rcdataEndTagOpenState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.stream.unget(data)
- self.state = self.rcdataState
- return True
- def rcdataEndTagOpenState(self):
- data = self.stream.char()
- if data in asciiLetters:
- self.temporaryBuffer += data
- self.state = self.rcdataEndTagNameState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
- self.stream.unget(data)
- self.state = self.rcdataState
- return True
- def rcdataEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
- data = self.stream.char()
- if data in spaceCharacters and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.beforeAttributeNameState
- elif data == "/" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.selfClosingStartTagState
- elif data == ">" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.emitCurrentToken()
- self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "</" + self.temporaryBuffer})
- self.stream.unget(data)
- self.state = self.rcdataState
- return True
- def rawtextLessThanSignState(self):
- data = self.stream.char()
- if data == "/":
- self.temporaryBuffer = ""
- self.state = self.rawtextEndTagOpenState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.stream.unget(data)
- self.state = self.rawtextState
- return True
- def rawtextEndTagOpenState(self):
- data = self.stream.char()
- if data in asciiLetters:
- self.temporaryBuffer += data
- self.state = self.rawtextEndTagNameState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
- self.stream.unget(data)
- self.state = self.rawtextState
- return True
- def rawtextEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
- data = self.stream.char()
- if data in spaceCharacters and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.beforeAttributeNameState
- elif data == "/" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.selfClosingStartTagState
- elif data == ">" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.emitCurrentToken()
- self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "</" + self.temporaryBuffer})
- self.stream.unget(data)
- self.state = self.rawtextState
- return True
- def scriptDataLessThanSignState(self):
- data = self.stream.char()
- if data == "/":
- self.temporaryBuffer = ""
- self.state = self.scriptDataEndTagOpenState
- elif data == "!":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
- self.state = self.scriptDataEscapeStartState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.stream.unget(data)
- self.state = self.scriptDataState
- return True
- def scriptDataEndTagOpenState(self):
- data = self.stream.char()
- if data in asciiLetters:
- self.temporaryBuffer += data
- self.state = self.scriptDataEndTagNameState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
- self.stream.unget(data)
- self.state = self.scriptDataState
- return True
- def scriptDataEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
- data = self.stream.char()
- if data in spaceCharacters and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.beforeAttributeNameState
- elif data == "/" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.selfClosingStartTagState
- elif data == ">" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.emitCurrentToken()
- self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "</" + self.temporaryBuffer})
- self.stream.unget(data)
- self.state = self.scriptDataState
- return True
- def scriptDataEscapeStartState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataEscapeStartDashState
- else:
- self.stream.unget(data)
- self.state = self.scriptDataState
- return True
- def scriptDataEscapeStartDashState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataEscapedDashDashState
- else:
- self.stream.unget(data)
- self.state = self.scriptDataState
- return True
- def scriptDataEscapedState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataEscapedDashState
- elif data == "<":
- self.state = self.scriptDataEscapedLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- elif data == EOF:
- self.state = self.dataState
- else:
- chars = self.stream.charsUntil(("<", "-", "\u0000"))
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data + chars})
- return True
- def scriptDataEscapedDashState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataEscapedDashDashState
- elif data == "<":
- self.state = self.scriptDataEscapedLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- self.state = self.scriptDataEscapedState
- elif data == EOF:
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataEscapedDashDashState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- elif data == "<":
- self.state = self.scriptDataEscapedLessThanSignState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
- self.state = self.scriptDataState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- self.state = self.scriptDataEscapedState
- elif data == EOF:
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataEscapedLessThanSignState(self):
- data = self.stream.char()
- if data == "/":
- self.temporaryBuffer = ""
- self.state = self.scriptDataEscapedEndTagOpenState
- elif data in asciiLetters:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
- self.temporaryBuffer = data
- self.state = self.scriptDataDoubleEscapeStartState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.stream.unget(data)
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataEscapedEndTagOpenState(self):
- data = self.stream.char()
- if data in asciiLetters:
- self.temporaryBuffer = data
- self.state = self.scriptDataEscapedEndTagNameState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
- self.stream.unget(data)
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataEscapedEndTagNameState(self):
- appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
- data = self.stream.char()
- if data in spaceCharacters and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.beforeAttributeNameState
- elif data == "/" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.state = self.selfClosingStartTagState
- elif data == ">" and appropriate:
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": self.temporaryBuffer,
- "data": [], "selfClosing": False}
- self.emitCurrentToken()
- self.state = self.dataState
- elif data in asciiLetters:
- self.temporaryBuffer += data
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "</" + self.temporaryBuffer})
- self.stream.unget(data)
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataDoubleEscapeStartState(self):
- data = self.stream.char()
- if data in (spaceCharacters | frozenset(("/", ">"))):
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- if self.temporaryBuffer.lower() == "script":
- self.state = self.scriptDataDoubleEscapedState
- else:
- self.state = self.scriptDataEscapedState
- elif data in asciiLetters:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.temporaryBuffer += data
- else:
- self.stream.unget(data)
- self.state = self.scriptDataEscapedState
- return True
- def scriptDataDoubleEscapedState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataDoubleEscapedDashState
- elif data == "<":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.state = self.scriptDataDoubleEscapedLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- elif data == EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-script-in-script"})
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- return True
- def scriptDataDoubleEscapedDashState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- self.state = self.scriptDataDoubleEscapedDashDashState
- elif data == "<":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.state = self.scriptDataDoubleEscapedLessThanSignState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- self.state = self.scriptDataDoubleEscapedState
- elif data == EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-script-in-script"})
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.state = self.scriptDataDoubleEscapedState
- return True
- def scriptDataDoubleEscapedDashDashState(self):
- data = self.stream.char()
- if data == "-":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
- elif data == "<":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
- self.state = self.scriptDataDoubleEscapedLessThanSignState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
- self.state = self.scriptDataState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": "\uFFFD"})
- self.state = self.scriptDataDoubleEscapedState
- elif data == EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-script-in-script"})
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.state = self.scriptDataDoubleEscapedState
- return True
- def scriptDataDoubleEscapedLessThanSignState(self):
- data = self.stream.char()
- if data == "/":
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
- self.temporaryBuffer = ""
- self.state = self.scriptDataDoubleEscapeEndState
- else:
- self.stream.unget(data)
- self.state = self.scriptDataDoubleEscapedState
- return True
- def scriptDataDoubleEscapeEndState(self):
- data = self.stream.char()
- if data in (spaceCharacters | frozenset(("/", ">"))):
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- if self.temporaryBuffer.lower() == "script":
- self.state = self.scriptDataEscapedState
- else:
- self.state = self.scriptDataDoubleEscapedState
- elif data in asciiLetters:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
- self.temporaryBuffer += data
- else:
- self.stream.unget(data)
- self.state = self.scriptDataDoubleEscapedState
- return True
- def beforeAttributeNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.stream.charsUntil(spaceCharacters, True)
- elif data in asciiLetters:
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- elif data == ">":
- self.emitCurrentToken()
- elif data == "/":
- self.state = self.selfClosingStartTagState
- elif data in ("'", '"', "=", "<"):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "invalid-character-in-attribute-name"})
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"].append(["\uFFFD", ""])
- self.state = self.attributeNameState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-attribute-name-but-got-eof"})
- self.state = self.dataState
- else:
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- return True
- def attributeNameState(self):
- data = self.stream.char()
- leavingThisState = True
- emitToken = False
- if data == "=":
- self.state = self.beforeAttributeValueState
- elif data in asciiLetters:
- self.currentToken["data"][-1][0] += data +\
- self.stream.charsUntil(asciiLetters, True)
- leavingThisState = False
- elif data == ">":
- # XXX If we emit here the attributes are converted to a dict
- # without being checked and when the code below runs we error
- # because data is a dict not a list
- emitToken = True
- elif data in spaceCharacters:
- self.state = self.afterAttributeNameState
- elif data == "/":
- self.state = self.selfClosingStartTagState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"][-1][0] += "\uFFFD"
- leavingThisState = False
- elif data in ("'", '"', "<"):
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data":
- "invalid-character-in-attribute-name"})
- self.currentToken["data"][-1][0] += data
- leavingThisState = False
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "eof-in-attribute-name"})
- self.state = self.dataState
- else:
- self.currentToken["data"][-1][0] += data
- leavingThisState = False
- if leavingThisState:
- # Attributes are not dropped at this stage. That happens when the
- # start tag token is emitted so values can still be safely appended
- # to attributes, but we do want to report the parse error in time.
- if self.lowercaseAttrName:
- self.currentToken["data"][-1][0] = (
- self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
- for name, value in self.currentToken["data"][:-1]:
- if self.currentToken["data"][-1][0] == name:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "duplicate-attribute"})
- break
- # XXX Fix for above XXX
- if emitToken:
- self.emitCurrentToken()
- return True
- def afterAttributeNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.stream.charsUntil(spaceCharacters, True)
- elif data == "=":
- self.state = self.beforeAttributeValueState
- elif data == ">":
- self.emitCurrentToken()
- elif data in asciiLetters:
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- elif data == "/":
- self.state = self.selfClosingStartTagState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"].append(["\uFFFD", ""])
- self.state = self.attributeNameState
- elif data in ("'", '"', "<"):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "invalid-character-after-attribute-name"})
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-end-of-tag-but-got-eof"})
- self.state = self.dataState
- else:
- self.currentToken["data"].append([data, ""])
- self.state = self.attributeNameState
- return True
- def beforeAttributeValueState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.stream.charsUntil(spaceCharacters, True)
- elif data == "\"":
- self.state = self.attributeValueDoubleQuotedState
- elif data == "&":
- self.state = self.attributeValueUnQuotedState
- self.stream.unget(data)
- elif data == "'":
- self.state = self.attributeValueSingleQuotedState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-attribute-value-but-got-right-bracket"})
- self.emitCurrentToken()
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"][-1][1] += "\uFFFD"
- self.state = self.attributeValueUnQuotedState
- elif data in ("=", "<", "`"):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "equals-in-unquoted-attribute-value"})
- self.currentToken["data"][-1][1] += data
- self.state = self.attributeValueUnQuotedState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-attribute-value-but-got-eof"})
- self.state = self.dataState
- else:
- self.currentToken["data"][-1][1] += data
- self.state = self.attributeValueUnQuotedState
- return True
- def attributeValueDoubleQuotedState(self):
- data = self.stream.char()
- if data == "\"":
- self.state = self.afterAttributeValueState
- elif data == "&":
- self.processEntityInAttribute('"')
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"][-1][1] += "\uFFFD"
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-attribute-value-double-quote"})
- self.state = self.dataState
- else:
- self.currentToken["data"][-1][1] += data +\
- self.stream.charsUntil(("\"", "&", "\u0000"))
- return True
- def attributeValueSingleQuotedState(self):
- data = self.stream.char()
- if data == "'":
- self.state = self.afterAttributeValueState
- elif data == "&":
- self.processEntityInAttribute("'")
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"][-1][1] += "\uFFFD"
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-attribute-value-single-quote"})
- self.state = self.dataState
- else:
- self.currentToken["data"][-1][1] += data +\
- self.stream.charsUntil(("'", "&", "\u0000"))
- return True
- def attributeValueUnQuotedState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeAttributeNameState
- elif data == "&":
- self.processEntityInAttribute(">")
- elif data == ">":
- self.emitCurrentToken()
- elif data in ('"', "'", "=", "<", "`"):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-character-in-unquoted-attribute-value"})
- self.currentToken["data"][-1][1] += data
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"][-1][1] += "\uFFFD"
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-attribute-value-no-quotes"})
- self.state = self.dataState
- else:
- self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
- frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
- return True
- def afterAttributeValueState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeAttributeNameState
- elif data == ">":
- self.emitCurrentToken()
- elif data == "/":
- self.state = self.selfClosingStartTagState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-EOF-after-attribute-value"})
- self.stream.unget(data)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-character-after-attribute-value"})
- self.stream.unget(data)
- self.state = self.beforeAttributeNameState
- return True
- def selfClosingStartTagState(self):
- data = self.stream.char()
- if data == ">":
- self.currentToken["selfClosing"] = True
- self.emitCurrentToken()
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data":
- "unexpected-EOF-after-solidus-in-tag"})
- self.stream.unget(data)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-character-after-solidus-in-tag"})
- self.stream.unget(data)
- self.state = self.beforeAttributeNameState
- return True
- def bogusCommentState(self):
- # Make a new comment token and give it as value all the characters
- # until the first > or EOF (charsUntil checks for EOF automatically)
- # and emit it.
- data = self.stream.charsUntil(">")
- data = data.replace("\u0000", "\uFFFD")
- self.tokenQueue.append(
- {"type": tokenTypes["Comment"], "data": data})
- # Eat the character directly after the bogus comment which is either a
- # ">" or an EOF.
- self.stream.char()
- self.state = self.dataState
- return True
- def markupDeclarationOpenState(self):
- charStack = [self.stream.char()]
- if charStack[-1] == "-":
- charStack.append(self.stream.char())
- if charStack[-1] == "-":
- self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
- self.state = self.commentStartState
- return True
- elif charStack[-1] in ('d', 'D'):
- matched = True
- for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
- ('y', 'Y'), ('p', 'P'), ('e', 'E')):
- charStack.append(self.stream.char())
- if charStack[-1] not in expected:
- matched = False
- break
- if matched:
- self.currentToken = {"type": tokenTypes["Doctype"],
- "name": "",
- "publicId": None, "systemId": None,
- "correct": True}
- self.state = self.doctypeState
- return True
- elif (charStack[-1] == "[" and
- self.parser is not None and
- self.parser.tree.openElements and
- self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
- matched = True
- for expected in ["C", "D", "A", "T", "A", "["]:
- charStack.append(self.stream.char())
- if charStack[-1] != expected:
- matched = False
- break
- if matched:
- self.state = self.cdataSectionState
- return True
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-dashes-or-doctype"})
- while charStack:
- self.stream.unget(charStack.pop())
- self.state = self.bogusCommentState
- return True
- def commentStartState(self):
- data = self.stream.char()
- if data == "-":
- self.state = self.commentStartDashState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "incorrect-comment"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-comment"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["data"] += data
- self.state = self.commentState
- return True
- def commentStartDashState(self):
- data = self.stream.char()
- if data == "-":
- self.state = self.commentEndState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "-\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "incorrect-comment"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-comment"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["data"] += "-" + data
- self.state = self.commentState
- return True
- def commentState(self):
- data = self.stream.char()
- if data == "-":
- self.state = self.commentEndDashState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "\uFFFD"
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "eof-in-comment"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["data"] += data + \
- self.stream.charsUntil(("-", "\u0000"))
- return True
- def commentEndDashState(self):
- data = self.stream.char()
- if data == "-":
- self.state = self.commentEndState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "-\uFFFD"
- self.state = self.commentState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-comment-end-dash"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["data"] += "-" + data
- self.state = self.commentState
- return True
- def commentEndState(self):
- data = self.stream.char()
- if data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "--\uFFFD"
- self.state = self.commentState
- elif data == "!":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-bang-after-double-dash-in-comment"})
- self.state = self.commentEndBangState
- elif data == "-":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-dash-after-double-dash-in-comment"})
- self.currentToken["data"] += data
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-comment-double-dash"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- # XXX
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-comment"})
- self.currentToken["data"] += "--" + data
- self.state = self.commentState
- return True
- def commentEndBangState(self):
- data = self.stream.char()
- if data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == "-":
- self.currentToken["data"] += "--!"
- self.state = self.commentEndDashState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["data"] += "--!\uFFFD"
- self.state = self.commentState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-comment-end-bang-state"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["data"] += "--!" + data
- self.state = self.commentState
- return True
- def doctypeState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeDoctypeNameState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-doctype-name-but-got-eof"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "need-space-after-doctype"})
- self.stream.unget(data)
- self.state = self.beforeDoctypeNameState
- return True
- def beforeDoctypeNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-doctype-name-but-got-right-bracket"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["name"] = "\uFFFD"
- self.state = self.doctypeNameState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-doctype-name-but-got-eof"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["name"] = data
- self.state = self.doctypeNameState
- return True
- def doctypeNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
- self.state = self.afterDoctypeNameState
- elif data == ">":
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["name"] += "\uFFFD"
- self.state = self.doctypeNameState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype-name"})
- self.currentToken["correct"] = False
- self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["name"] += data
- return True
- def afterDoctypeNameState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.currentToken["correct"] = False
- self.stream.unget(data)
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- if data in ("p", "P"):
- matched = True
- for expected in (("u", "U"), ("b", "B"), ("l", "L"),
- ("i", "I"), ("c", "C")):
- data = self.stream.char()
- if data not in expected:
- matched = False
- break
- if matched:
- self.state = self.afterDoctypePublicKeywordState
- return True
- elif data in ("s", "S"):
- matched = True
- for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
- ("e", "E"), ("m", "M")):
- data = self.stream.char()
- if data not in expected:
- matched = False
- break
- if matched:
- self.state = self.afterDoctypeSystemKeywordState
- return True
- # All the characters read before the current 'data' will be
- # [a-zA-Z], so they're garbage in the bogus doctype and can be
- # discarded; only the latest character might be '>' or EOF
- # and needs to be ungetted
- self.stream.unget(data)
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-space-or-right-bracket-in-doctype", "datavars":
- {"data": data}})
- self.currentToken["correct"] = False
- self.state = self.bogusDoctypeState
- return True
- def afterDoctypePublicKeywordState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeDoctypePublicIdentifierState
- elif data in ("'", '"'):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.stream.unget(data)
- self.state = self.beforeDoctypePublicIdentifierState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.stream.unget(data)
- self.state = self.beforeDoctypePublicIdentifierState
- return True
- def beforeDoctypePublicIdentifierState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == "\"":
- self.currentToken["publicId"] = ""
- self.state = self.doctypePublicIdentifierDoubleQuotedState
- elif data == "'":
- self.currentToken["publicId"] = ""
- self.state = self.doctypePublicIdentifierSingleQuotedState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-end-of-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["correct"] = False
- self.state = self.bogusDoctypeState
- return True
- def doctypePublicIdentifierDoubleQuotedState(self):
- data = self.stream.char()
- if data == "\"":
- self.state = self.afterDoctypePublicIdentifierState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["publicId"] += "\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-end-of-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["publicId"] += data
- return True
- def doctypePublicIdentifierSingleQuotedState(self):
- data = self.stream.char()
- if data == "'":
- self.state = self.afterDoctypePublicIdentifierState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["publicId"] += "\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-end-of-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["publicId"] += data
- return True
- def afterDoctypePublicIdentifierState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.betweenDoctypePublicAndSystemIdentifiersState
- elif data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == '"':
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
- elif data == "'":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierSingleQuotedState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["correct"] = False
- self.state = self.bogusDoctypeState
- return True
- def betweenDoctypePublicAndSystemIdentifiersState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data == '"':
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
- elif data == "'":
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierSingleQuotedState
- elif data == EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["correct"] = False
- self.state = self.bogusDoctypeState
- return True
- def afterDoctypeSystemKeywordState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- self.state = self.beforeDoctypeSystemIdentifierState
- elif data in ("'", '"'):
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.stream.unget(data)
- self.state = self.beforeDoctypeSystemIdentifierState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.stream.unget(data)
- self.state = self.beforeDoctypeSystemIdentifierState
- return True
- def beforeDoctypeSystemIdentifierState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == "\"":
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierDoubleQuotedState
- elif data == "'":
- self.currentToken["systemId"] = ""
- self.state = self.doctypeSystemIdentifierSingleQuotedState
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.currentToken["correct"] = False
- self.state = self.bogusDoctypeState
- return True
- def doctypeSystemIdentifierDoubleQuotedState(self):
- data = self.stream.char()
- if data == "\"":
- self.state = self.afterDoctypeSystemIdentifierState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["systemId"] += "\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-end-of-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["systemId"] += data
- return True
- def doctypeSystemIdentifierSingleQuotedState(self):
- data = self.stream.char()
- if data == "'":
- self.state = self.afterDoctypeSystemIdentifierState
- elif data == "\u0000":
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- self.currentToken["systemId"] += "\uFFFD"
- elif data == ">":
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-end-of-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.currentToken["systemId"] += data
- return True
- def afterDoctypeSystemIdentifierState(self):
- data = self.stream.char()
- if data in spaceCharacters:
- pass
- elif data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "eof-in-doctype"})
- self.currentToken["correct"] = False
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "unexpected-char-in-doctype"})
- self.state = self.bogusDoctypeState
- return True
- def bogusDoctypeState(self):
- data = self.stream.char()
- if data == ">":
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- elif data is EOF:
- # XXX EMIT
- self.stream.unget(data)
- self.tokenQueue.append(self.currentToken)
- self.state = self.dataState
- else:
- pass
- return True
- def cdataSectionState(self):
- data = []
- while True:
- data.append(self.stream.charsUntil("]"))
- data.append(self.stream.charsUntil(">"))
- char = self.stream.char()
- if char == EOF:
- break
- else:
- assert char == ">"
- if data[-1][-2:] == "]]":
- data[-1] = data[-1][:-2]
- break
- else:
- data.append(char)
- data = "".join(data)
- # Deal with null here rather than in the parser
- nullCount = data.count("\u0000")
- if nullCount > 0:
- for i in range(nullCount):
- self.tokenQueue.append({"type": tokenTypes["ParseError"],
- "data": "invalid-codepoint"})
- data = data.replace("\u0000", "\uFFFD")
- if data:
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": data})
- self.state = self.dataState
- return True
|