@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
 
+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
-    if response.status_code != 200:
-        return "URL returned status code {}.".format(response.status_code)
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
 
-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
 
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
+    if response.status_code != 200:
+        return "URL returned status code {}.".format(response.status_code)
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
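The hunk moves the content-type inference inside the 200 branch: it prefers the Content-Type header and, when that header is absent, falls back to the filename advertised in Content-Disposition, letting mimetypes guess a type from the extension. A minimal standalone sketch of that inference; the helper name sniff_content_type and the final fallback value are assumptions, not part of this patch:

import mimetypes
import re
from urllib.parse import unquote

import requests


def sniff_content_type(response: requests.Response) -> str:
    # Hypothetical helper; mirrors the inference in the hunk above.
    content_type = response.headers.get('Content-Type')
    if content_type:
        # "text/html; charset=utf-8" -> "text/html"
        return content_type.split(';')[0].strip()

    # No Content-Type header: fall back to the attachment filename, if any.
    content_disposition = response.headers.get('Content-Disposition', '')
    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
    if filename_match:
        filename = unquote(filename_match.group(1))
        guessed = mimetypes.guess_type(filename)[0]
        if guessed:
            return guessed
    # Assumed default; the patch itself leaves main_content_type unset here.
    return 'application/octet-stream'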
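The other behavioural change is the 403 fallback: the HEAD probe decides whether plain requests is enough or whether the page sits behind an anti-bot challenge that cloudscraper must solve first. A self-contained sketch of that pattern, assuming cloudscraper is installed; the helper name fetch_html is illustrative, not part of the codebase:

import cloudscraper
import requests


def fetch_html(url: str, headers: dict = None) -> str:
    # Hypothetical helper; mirrors the fallback added in the hunk above.
    headers = headers or {}

    # Cheap probe first: HEAD follows redirects without downloading the body.
    probe = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

    if probe.status_code == 403:
        # A 403 on the probe usually means an anti-bot challenge
        # (e.g. Cloudflare); cloudscraper solves the JavaScript challenge
        # and retries with the resulting clearance cookies.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    else:
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))

    response.raise_for_status()
    return response.text

cloudscraper.create_scraper() returns a requests.Session subclass, which is why the existing headers, allow_redirects and timeout arguments carry over unchanged in the patch.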
 |