|
@@ -10,6 +10,7 @@ import unicodedata
|
|
|
from contextlib import contextmanager
|
|
|
from urllib.parse import unquote
|
|
|
|
|
|
+import chardet
|
|
|
import cloudscraper
|
|
|
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
|
|
from regex import regex
|
|
@@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
|
|
|
if response.status_code != 200:
|
|
|
return "URL returned status code {}.".format(response.status_code)
|
|
|
|
|
|
- a = extract_using_readabilipy(response.text)
|
|
|
+ # Detect encoding using chardet
|
|
|
+ detected_encoding = chardet.detect(response.content)
|
|
|
+ encoding = detected_encoding['encoding']
|
|
|
+ if encoding:
|
|
|
+ try:
|
|
|
+ content = response.content.decode(encoding)
|
|
|
+ except (UnicodeDecodeError, TypeError):
|
|
|
+ content = response.text
|
|
|
+ else:
|
|
|
+ content = response.text
|
|
|
+
|
|
|
+ a = extract_using_readabilipy(content)
|
|
|
|
|
|
if not a['plain_text'] or not a['plain_text'].strip():
|
|
|
return ''
|