@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
 
+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
-    if response.status_code != 200:
-        return "URL returned status code {}.".format(response.status_code)
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
 
-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
 
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
+    if response.status_code != 200:
+        return "URL returned status code {}.".format(response.status_code)
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
			 |