# inputstream.py
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. from pip._vendor.six.moves import http_client
  4. import codecs
  5. import re
  6. from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
  7. from .constants import encodings, ReparseException
  8. from . import utils
  9. from io import StringIO
  10. try:
  11. from io import BytesIO
  12. except ImportError:
  13. BytesIO = StringIO
  14. try:
  15. from io import BufferedIOBase
  16. except ImportError:
  17. class BufferedIOBase(object):
  18. pass
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
# Bytes that terminate a tag name during the encoding pre-scan.
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

# Code points that may never appear in an HTML document: C0/C1 controls
# (other than whitespace), lone surrogates, and the permanent
# "noncharacter" pairs U+xFFFE/U+xFFFF of every plane plus U+FDD0-U+FDEF.
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

# Astral-plane noncharacters, as integer code points, for the UCS-2
# (narrow build) error reporter which must reassemble surrogate pairs.
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

# ASCII whitespace and punctuation; stripped out of encoding labels by
# codecName() before lookup.
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil(): maps (characters, opposite) -> compiled regexp.
charsUntilRegEx = {}
  35. class BufferedStream(object):
  36. """Buffering for streams that do not have buffering of their own
  37. The buffer is implemented as a list of chunks on the assumption that
  38. joining many strings will be slow since it is O(n**2)
  39. """
  40. def __init__(self, stream):
  41. self.stream = stream
  42. self.buffer = []
  43. self.position = [-1, 0] # chunk number, offset
  44. def tell(self):
  45. pos = 0
  46. for chunk in self.buffer[:self.position[0]]:
  47. pos += len(chunk)
  48. pos += self.position[1]
  49. return pos
  50. def seek(self, pos):
  51. assert pos <= self._bufferedBytes()
  52. offset = pos
  53. i = 0
  54. while len(self.buffer[i]) < offset:
  55. offset -= len(self.buffer[i])
  56. i += 1
  57. self.position = [i, offset]
  58. def read(self, bytes):
  59. if not self.buffer:
  60. return self._readStream(bytes)
  61. elif (self.position[0] == len(self.buffer) and
  62. self.position[1] == len(self.buffer[-1])):
  63. return self._readStream(bytes)
  64. else:
  65. return self._readFromBuffer(bytes)
  66. def _bufferedBytes(self):
  67. return sum([len(item) for item in self.buffer])
  68. def _readStream(self, bytes):
  69. data = self.stream.read(bytes)
  70. self.buffer.append(data)
  71. self.position[0] += 1
  72. self.position[1] = len(data)
  73. return data
  74. def _readFromBuffer(self, bytes):
  75. remainingBytes = bytes
  76. rv = []
  77. bufferIndex = self.position[0]
  78. bufferOffset = self.position[1]
  79. while bufferIndex < len(self.buffer) and remainingBytes != 0:
  80. assert remainingBytes > 0
  81. bufferedData = self.buffer[bufferIndex]
  82. if remainingBytes <= len(bufferedData) - bufferOffset:
  83. bytesToRead = remainingBytes
  84. self.position = [bufferIndex, bufferOffset + bytesToRead]
  85. else:
  86. bytesToRead = len(bufferedData) - bufferOffset
  87. self.position = [bufferIndex, len(bufferedData)]
  88. bufferIndex += 1
  89. rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
  90. remainingBytes -= bytesToRead
  91. bufferOffset = 0
  92. if remainingBytes:
  93. rv.append(self._readStream(remainingBytes))
  94. return b"".join(rv)
  95. def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
  96. if isinstance(source, http_client.HTTPResponse):
  97. # Work around Python bug #20007: read(0) closes the connection.
  98. # http://bugs.python.org/issue20007
  99. isUnicode = False
  100. elif hasattr(source, "read"):
  101. isUnicode = isinstance(source.read(0), text_type)
  102. else:
  103. isUnicode = isinstance(source, text_type)
  104. if isUnicode:
  105. if encoding is not None:
  106. raise TypeError("Cannot explicitly set an encoding with a unicode string")
  107. return HTMLUnicodeInputStream(source)
  108. else:
  109. return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.
    """

    # Number of characters requested from the data stream per readChunk().
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information
        """
        # Craziness: len("\U0010FFFF") == 1 only on wide (UCS-4) builds;
        # on narrow (UCS-2) builds astral characters are surrogate pairs
        # and need the pair-aware error reporter and replacement regexp.
        if len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            # Replace any lone surrogate.
            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            # Replace a high surrogate not followed by a low one, or a low
            # surrogate not preceded by a high one (i.e. unpaired halves).
            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        """Reset all chunk/position/error state to the start of the stream."""
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        """Return (line, column) for *offset* within the current chunk,
        both relative to the start of the stream (0-based)."""
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line carried over from the previous chunk.
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        # Lines are reported 1-based; columns stay 0-based.
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Refill self.chunk from the data stream; return False at EOF.

        Also normalises line endings, reports/replaces invalid characters,
        and holds back a trailing CR or high surrogate that might pair with
        the first character of the next chunk.
        """
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Fold the finished chunk's line/column counts into the running
        # totals before discarding it.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # Defer a trailing CR (may be half of CRLF) or high surrogate
            # (may be half of a surrogate pair) to the next chunk.
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub("\ufffd", data)

        # Normalise CRLF and lone CR to LF.
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record one parse error per invalid code point (wide builds)."""
        for i in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        """Record parse errors for invalid code points on narrow builds,
        where astral characters appear as surrogate pairs."""
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                # The low half of the pair will match too; skip it.
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone high surrogate at end of data: may be completed by
                # the next chunk, so report once without skipping.
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """
        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            # Build a character class of the (ASCII-only) stop characters;
            # negate it unless 'opposite' is requested.
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        """Push one character back onto the stream; it must be the character
        most recently read (only one level of unget is supported)."""
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
  297. class HTMLBinaryInputStream(HTMLUnicodeInputStream):
  298. """Provides a unicode stream of characters to the HTMLTokenizer.
  299. This class takes care of character encoding and removing or replacing
  300. incorrect byte-sequences and also provides column and line tracking.
  301. """
  302. def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
  303. """Initialises the HTMLInputStream.
  304. HTMLInputStream(source, [encoding]) -> Normalized stream from source
  305. for use by html5lib.
  306. source can be either a file-object, local filename or a string.
  307. The optional encoding parameter must be a string that indicates
  308. the encoding. If specified, that encoding will be used,
  309. regardless of any BOM or later declaration (such as in a meta
  310. element)
  311. parseMeta - Look for a <meta> element containing encoding information
  312. """
  313. # Raw Stream - for unicode objects this will encode to utf-8 and set
  314. # self.charEncoding as appropriate
  315. self.rawStream = self.openStream(source)
  316. HTMLUnicodeInputStream.__init__(self, self.rawStream)
  317. self.charEncoding = (codecName(encoding), "certain")
  318. # Encoding Information
  319. # Number of bytes to use when looking for a meta element with
  320. # encoding information
  321. self.numBytesMeta = 512
  322. # Number of bytes to use when using detecting encoding using chardet
  323. self.numBytesChardet = 100
  324. # Encoding to use if no other information can be found
  325. self.defaultEncoding = "windows-1252"
  326. # Detect encoding iff no explicit "transport level" encoding is supplied
  327. if (self.charEncoding[0] is None):
  328. self.charEncoding = self.detectEncoding(parseMeta, chardet)
  329. # Call superclass
  330. self.reset()
  331. def reset(self):
  332. self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
  333. 'replace')
  334. HTMLUnicodeInputStream.reset(self)
  335. def openStream(self, source):
  336. """Produces a file object from source.
  337. source can be either a file object, local filename or a string.
  338. """
  339. # Already a file object
  340. if hasattr(source, 'read'):
  341. stream = source
  342. else:
  343. stream = BytesIO(source)
  344. try:
  345. stream.seek(stream.tell())
  346. except:
  347. stream = BufferedStream(stream)
  348. return stream
  349. def detectEncoding(self, parseMeta=True, chardet=True):
  350. # First look for a BOM
  351. # This will also read past the BOM if present
  352. encoding = self.detectBOM()
  353. confidence = "certain"
  354. # If there is no BOM need to look for meta elements with encoding
  355. # information
  356. if encoding is None and parseMeta:
  357. encoding = self.detectEncodingMeta()
  358. confidence = "tentative"
  359. # Guess with chardet, if avaliable
  360. if encoding is None and chardet:
  361. confidence = "tentative"
  362. try:
  363. try:
  364. from charade.universaldetector import UniversalDetector
  365. except ImportError:
  366. from chardet.universaldetector import UniversalDetector
  367. buffers = []
  368. detector = UniversalDetector()
  369. while not detector.done:
  370. buffer = self.rawStream.read(self.numBytesChardet)
  371. assert isinstance(buffer, bytes)
  372. if not buffer:
  373. break
  374. buffers.append(buffer)
  375. detector.feed(buffer)
  376. detector.close()
  377. encoding = detector.result['encoding']
  378. self.rawStream.seek(0)
  379. except ImportError:
  380. pass
  381. # If all else fails use the default encoding
  382. if encoding is None:
  383. confidence = "tentative"
  384. encoding = self.defaultEncoding
  385. # Substitute for equivalent encodings:
  386. encodingSub = {"iso-8859-1": "windows-1252"}
  387. if encoding.lower() in encodingSub:
  388. encoding = encodingSub[encoding.lower()]
  389. return encoding, confidence
  390. def changeEncoding(self, newEncoding):
  391. assert self.charEncoding[1] != "certain"
  392. newEncoding = codecName(newEncoding)
  393. if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
  394. newEncoding = "utf-8"
  395. if newEncoding is None:
  396. return
  397. elif newEncoding == self.charEncoding[0]:
  398. self.charEncoding = (self.charEncoding[0], "certain")
  399. else:
  400. self.rawStream.seek(0)
  401. self.reset()
  402. self.charEncoding = (newEncoding, "certain")
  403. raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
  404. def detectBOM(self):
  405. """Attempts to detect at BOM at the start of the stream. If
  406. an encoding can be determined from the BOM return the name of the
  407. encoding otherwise return None"""
  408. bomDict = {
  409. codecs.BOM_UTF8: 'utf-8',
  410. codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
  411. codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
  412. }
  413. # Go to beginning of file and read in 4 bytes
  414. string = self.rawStream.read(4)
  415. assert isinstance(string, bytes)
  416. # Try detecting the BOM using bytes from the string
  417. encoding = bomDict.get(string[:3]) # UTF-8
  418. seek = 3
  419. if not encoding:
  420. # Need to detect UTF-32 before UTF-16
  421. encoding = bomDict.get(string) # UTF-32
  422. seek = 4
  423. if not encoding:
  424. encoding = bomDict.get(string[:2]) # UTF-16
  425. seek = 2
  426. # Set the read position past the BOM if one was found, otherwise
  427. # set it to the start of the stream
  428. self.rawStream.seek(encoding and seek or 0)
  429. return encoding
  430. def detectEncodingMeta(self):
  431. """Report the encoding declared by the meta element
  432. """
  433. buffer = self.rawStream.read(self.numBytesMeta)
  434. assert isinstance(buffer, bytes)
  435. parser = EncodingParser(buffer)
  436. self.rawStream.seek(0)
  437. encoding = parser.getEncoding()
  438. if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
  439. encoding = "utf-8"
  440. return encoding
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""

    def __new__(self, value):
        assert isinstance(value, bytes)
        # Lower-case once up front so all subsequent matching is
        # case-insensitive.
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        # Position starts one *before* the first byte; the first next()
        # advances to index 0.
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        # Advance and return the byte at the new position as a length-1
        # bytes object (indexing bytes directly would give an int on Py3).
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        # Step back one byte and return the byte at the *old* position.
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        # Past-the-end positions are sticky errors: any further access
        # raises StopIteration, which EncodingParser uses to stop cleanly.
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            # Before the first next(): there is no current position.
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        # Returns the first byte NOT in *chars*, or None at end of data.
        p = self.position               # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        # Mirror image of skip(): stop at the first byte IN *chars*.
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            # Not found: signal the caller to stop parsing.
            raise StopIteration
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        # Filled in by handleMeta() when a declaration is found.
        self.encoding = None

    def getEncoding(self):
        """Scan the data; return the declared encoding name or None."""
        # Prefix dispatch table; order matters (b"<!--" before b"<!",
        # b"</" before b"<", etc.).  The first matching prefix wins.
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        # Handlers return False to stop the scan (encoding
                        # found) and raise StopIteration at end of data.
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    # A content attribute may have been seen before the
                    # pragma; commit it now that the pragma is confirmed.
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                # Hold until an http-equiv pragma appears.
                                pendingEncoding = codec

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        # Consume the byte after "</" before checking for a tag name.
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                # Attribute names are ASCII case-insensitive.
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            # No value: rewind so the byte is reprocessed as the start of
            # the next attribute.
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
class ContentAttrParser(object):
    """Extract the charset from a meta "content" attribute value,
    e.g. b"text/html; charset=utf-8" -> b"utf-8"."""

    def __init__(self, data):
        # data is expected to be an EncodingBytes (a bytes subclass) —
        # parse() relies on its jumpTo/skip/position API.
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # Ran off the end of the data at any point: no charset found.
            return None
  732. def codecName(encoding):
  733. """Return the python codec name corresponding to an encoding or None if the
  734. string doesn't correspond to a valid encoding."""
  735. if isinstance(encoding, bytes):
  736. try:
  737. encoding = encoding.decode("ascii")
  738. except UnicodeDecodeError:
  739. return None
  740. if encoding:
  741. canonicalName = ascii_punctuation_re.sub("", encoding).lower()
  742. return encodings.get(canonicalName, None)
  743. else:
  744. return None