# web_reader_tool.py

import hashlib
import json
import mimetypes
import os
import re
import site
import subprocess
import tempfile
import unicodedata
from contextlib import contextmanager
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article
from regex import regex

from core.rag.extractor import extract_processor
from core.rag.extractor.extract_processor import ExtractProcessor

FULL_TEMPLATE = """
TITLE: {title}
AUTHORS: {authors}
PUBLISH DATE: {publish_date}
TOP_IMAGE_URL: {top_image}
TEXT:
{text}
"""


def page_result(text: str, cursor: int, max_length: int) -> str:
    """Page through `text` and return a substring of `max_length` characters starting from `cursor`."""
    return text[cursor:cursor + max_length]


def get_url(url: str, user_agent: str = None) -> str:
    """Fetch URL and return the contents as a string."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    if user_agent:
        headers["User-Agent"] = user_agent

    main_content_type = None
    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]

    # Use a HEAD request to inspect status and content type before downloading the body
    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
    if response.status_code != 200:
        return "URL returned status code {}.".format(response.status_code)

    # Determine the content type, falling back to the Content-Disposition filename if needed
    content_type = response.headers.get('Content-Type')
    if content_type:
        main_content_type = content_type.split(';')[0].strip()
    else:
        content_disposition = response.headers.get('Content-Disposition')
        if content_disposition:
            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
            if filename_match:
                filename = unquote(filename_match.group(1))
                extension = re.search(r'\.(\w+)$', filename)
                if extension:
                    main_content_type = mimetypes.guess_type(filename)[0]

    if main_content_type not in supported_content_types:
        return "Unsupported content-type [{}] of URL.".format(main_content_type)

    # Non-HTML documents (PDF, DOCX, ...) are handled by the extract processor
    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
        return ExtractProcessor.load_from_url(url, return_text=True)

    # HTML pages: try Readability.js first, then fall back to newspaper3k
    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    article = extract_using_readabilipy(response.text)

    if not article['plain_text'] or not article['plain_text'].strip():
        return get_url_from_newspaper3k(url)

    return FULL_TEMPLATE.format(
        title=article['title'],
        authors=article['byline'],
        publish_date=article['date'],
        top_image="",
        text=article['plain_text'] if article['plain_text'] else "",
    )


def get_url_from_newspaper3k(url: str) -> str:
    """Fallback extraction using newspaper3k when Readability.js returns no text."""
    a = Article(url)
    a.download()
    a.parse()

    return FULL_TEMPLATE.format(
        title=a.title,
        authors=a.authors,
        publish_date=a.publish_date,
        top_image=a.top_image,
        text=a.text,
    )


def extract_using_readabilipy(html):
    """Run Mozilla's Readability.js (via readabilipy's Node helper) over raw HTML and return a dict of article fields."""
    with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
        f_html.write(html)
        f_html.close()
    html_path = f_html.name

    # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file
    article_json_path = html_path + ".json"
    jsdir = os.path.join(find_module_path('readabilipy'), 'javascript')
    with chdir(jsdir):
        subprocess.check_call(["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path])

    # Read output of call to Readability.parse() from JSON file and return as Python dictionary
    with open(article_json_path, encoding="utf-8") as json_file:
        input_json = json.loads(json_file.read())

    # Delete the temporary files after processing
    os.unlink(article_json_path)
    os.unlink(html_path)

    article_json = {
        "title": None,
        "byline": None,
        "date": None,
        "content": None,
        "plain_content": None,
        "plain_text": None
    }
    # Populate article fields from readability fields where present
    if input_json:
        if input_json.get("title"):
            article_json["title"] = input_json["title"]
        if input_json.get("byline"):
            article_json["byline"] = input_json["byline"]
        if input_json.get("date"):
            article_json["date"] = input_json["date"]
        if input_json.get("content"):
            article_json["content"] = input_json["content"]
            article_json["plain_content"] = plain_content(article_json["content"], False, False)
            article_json["plain_text"] = extract_text_blocks_as_plain_text(article_json["plain_content"])
        if input_json.get("textContent"):
            article_json["plain_text"] = input_json["textContent"]
            article_json["plain_text"] = re.sub(r'\n\s*\n', '\n', article_json["plain_text"])

    return article_json


def find_module_path(module_name):
    """Return the installation path of `module_name` within site-packages, or None if it is not installed."""
    for package_path in site.getsitepackages():
        potential_path = os.path.join(package_path, module_name)
        if os.path.exists(potential_path):
            return potential_path

    return None


@contextmanager
def chdir(path):
    """Change directory in context and return to original on exit"""
    # From https://stackoverflow.com/a/37996581, couldn't find a built-in
    original_path = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(original_path)


def extract_text_blocks_as_plain_text(paragraph_html):
    # Load article as DOM
    soup = BeautifulSoup(paragraph_html, 'html.parser')
    # Select all lists
    list_elements = soup.find_all(['ul', 'ol'])
    # Prefix text in all list items with "* " and make lists paragraphs
    for list_element in list_elements:
        plain_items = "".join(list(filter(None, [plain_text_leaf_node(li)["text"] for li in list_element.find_all('li')])))
        list_element.string = plain_items
        list_element.name = "p"
    # Select all text blocks
    text_blocks = [s.parent for s in soup.find_all(string=True)]
    text_blocks = [plain_text_leaf_node(block) for block in text_blocks]
    # Drop empty paragraphs
    text_blocks = list(filter(lambda p: p["text"] is not None, text_blocks))
    return text_blocks
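

# Illustrative sketch: shows the shape of the data extract_text_blocks_as_plain_text
# returns for a small HTML fragment. The sample markup and the expected-output comment
# below are assumptions for illustration only, not part of the extraction pipeline.
def _demo_extract_text_blocks():
    sample_html = "<p>Intro paragraph.</p><ul><li>first item</li><li>second item</li></ul>"
    blocks = extract_text_blocks_as_plain_text(sample_html)
    # Roughly: [{'text': 'Intro paragraph.'}, {'text': '* first item, * second item,'}]
    return blocks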


def plain_text_leaf_node(element):
    # Extract all text, stripped of any child HTML elements, and normalise it
    plain_text = normalise_text(element.get_text())
    if plain_text != "" and element.name == "li":
        plain_text = "* {}, ".format(plain_text)
    if plain_text == "":
        plain_text = None
    if "data-node-index" in element.attrs:
        plain = {"node_index": element["data-node-index"], "text": plain_text}
    else:
        plain = {"text": plain_text}
    return plain


def plain_content(readability_content, content_digests, node_indexes):
    # Load article as DOM
    soup = BeautifulSoup(readability_content, 'html.parser')
    # Make all elements plain
    elements = plain_elements(soup.contents, content_digests, node_indexes)
    if node_indexes:
        # Add node index attributes to nodes
        elements = [add_node_indexes(element) for element in elements]
    # Replace article contents with plain elements
    soup.contents = elements
    return str(soup)


def plain_elements(elements, content_digests, node_indexes):
    # Get plain content versions of all elements
    elements = [plain_element(element, content_digests, node_indexes)
                for element in elements]
    if content_digests:
        # Add content digest attribute to nodes
        elements = [add_content_digest(element) for element in elements]
    return elements


def plain_element(element, content_digests, node_indexes):
    # For lists, we make each item plain text
    if is_leaf(element):
        # For leaf node elements, extract the text content, discarding any HTML tags
        # 1. Get element contents as text
        plain_text = element.get_text()
        # 2. Normalise the extracted text string to a canonical representation
        plain_text = normalise_text(plain_text)
        # 3. Update element content to be plain text
        element.string = plain_text
    elif is_text(element):
        if is_non_printing(element):
            # The simplified HTML may have come from Readability.js so might have
            # non-printing text (e.g. Comment or CData). In this case, we keep the
            # structure, but ensure that the string is empty.
            element = type(element)("")
        else:
            plain_text = element.string
            plain_text = normalise_text(plain_text)
            element = type(element)(plain_text)
    else:
        # If not a leaf node or leaf type, call recursively on child nodes, replacing them
        element.contents = plain_elements(element.contents, content_digests, node_indexes)
    return element


def add_node_indexes(element, node_index="0"):
    # Can't add attributes to string types
    if is_text(element):
        return element
    # Add index to current element
    element["data-node-index"] = node_index
    # Add index to child elements
    for local_idx, child in enumerate(
            [c for c in element.contents if not is_text(c)], start=1):
        # Can't add attributes to leaf string types
        child_index = "{stem}.{local}".format(
            stem=node_index, local=local_idx)
        add_node_indexes(child, node_index=child_index)
    return element
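

# Illustrative sketch: demonstrates how plain_content annotates elements with hierarchical
# data-node-index attributes when node_indexes=True. The sample markup and the
# expected-output comment are assumptions for illustration only.
def _demo_node_indexes():
    simplified_html = "<div><p>Hello</p><p>World</p></div>"
    annotated = plain_content(simplified_html, content_digests=False, node_indexes=True)
    # Roughly: '<div data-node-index="0"><p data-node-index="0.1">Hello</p>
    #           <p data-node-index="0.2">World</p></div>'
    return annotated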


def normalise_text(text):
    """Normalise unicode and whitespace."""
    # Normalise unicode first to try and standardise whitespace characters as much as possible before normalising them
    text = strip_control_characters(text)
    text = normalise_unicode(text)
    text = normalise_whitespace(text)
    return text


def strip_control_characters(text):
    """Strip out unicode control characters which might break the parsing."""
    # Unicode control characters:
    #   [Cc]: Other, Control [includes new lines]
    #   [Cf]: Other, Format
    #   [Cn]: Other, Not Assigned
    #   [Co]: Other, Private Use
    #   [Cs]: Other, Surrogate
    control_chars = set(['Cc', 'Cf', 'Cn', 'Co', 'Cs'])
    retained_chars = ['\t', '\n', '\r', '\f']

    # Remove non-printing control characters
    return "".join(["" if (unicodedata.category(char) in control_chars) and (char not in retained_chars) else char for char in text])


def normalise_unicode(text):
    """Normalise unicode such that things that are visually equivalent map to the same unicode string where possible."""
    normal_form = "NFKC"
    text = unicodedata.normalize(normal_form, text)
    return text


def normalise_whitespace(text):
    """Replace runs of whitespace characters with a single space as this is what happens when HTML text is displayed."""
    text = regex.sub(r"\s+", " ", text)
    # Remove leading and trailing whitespace
    text = text.strip()
    return text
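

# Illustrative sketch: shows the combined effect of the normalisation pipeline. The sample
# string and expected result are assumptions for illustration only; control characters are
# stripped, NFKC folds the "fi" ligature, and whitespace runs collapse to single spaces.
def _demo_normalise_text():
    sample = "\ufb01le\u200b  name\n"  # "fi" ligature + zero-width space + extra whitespace
    normalised = normalise_text(sample)
    # Expected to yield roughly: "file name"
    return normalised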


def is_leaf(element):
    return (element.name in ['p', 'li'])


def is_text(element):
    return isinstance(element, NavigableString)


def is_non_printing(element):
    return any(isinstance(element, _e) for _e in [Comment, CData])


def add_content_digest(element):
    if not is_text(element):
        element["data-content-digest"] = content_digest(element)
    return element


def content_digest(element):
    """Compute a SHA-256 digest of an element's trimmed text, or of its children's digests for nested nodes."""
    if is_text(element):
        # Hash the trimmed string directly
        trimmed_string = element.string.strip()
        if trimmed_string == "":
            digest = ""
        else:
            digest = hashlib.sha256(trimmed_string.encode('utf-8')).hexdigest()
    else:
        contents = element.contents
        num_contents = len(contents)
        if num_contents == 0:
            # No hash when no child elements exist
            digest = ""
        elif num_contents == 1:
            # If single child, use digest of child
            digest = content_digest(contents[0])
        else:
            # Build content digest from the "non-empty" digests of child nodes
            digest = hashlib.sha256()
            child_digests = list(
                filter(lambda x: x != "", [content_digest(content) for content in contents]))
            for child in child_digests:
                digest.update(child.encode('utf-8'))
            digest = digest.hexdigest()
    return digest
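

# Illustrative usage sketch. Assumes the surrounding project (core.rag.extractor) is
# importable and that network access is available; the URL and page size below are
# placeholders for illustration only.
if __name__ == "__main__":
    full_text = get_url("https://example.com/some-article")
    # Page through the extracted text in 4000-character chunks
    first_page = page_result(full_text, cursor=0, max_length=4000)
    second_page = page_result(full_text, cursor=4000, max_length=4000)
    print(first_page)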