@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote
 
+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
 
-    if response.status_code != 200:
-        return "URL returned status code {}.".format(response.status_code)
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
 
-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
 
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
 
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
+    if response.status_code != 200:
+        return "URL returned status code {}.".format(response.status_code)
 
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
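The hunk moves the content-type inference inside the 200 branch: it prefers the Content-Type header and, when that header is absent, falls back to the filename advertised in Content-Disposition, letting mimetypes guess a type from the extension. A minimal standalone sketch of that inference; the helper name sniff_content_type and the final fallback value are assumptions, not part of this patch:

import mimetypes
import re
from urllib.parse import unquote

import requests


def sniff_content_type(response: requests.Response) -> str:
    # Hypothetical helper; mirrors the inference in the hunk above.
    content_type = response.headers.get('Content-Type')
    if content_type:
        # "text/html; charset=utf-8" -> "text/html"
        return content_type.split(';')[0].strip()

    # No Content-Type header: fall back to the attachment filename, if any.
    content_disposition = response.headers.get('Content-Disposition', '')
    filename_match = re.search(r'filename="([^"]+)"', content_disposition)
    if filename_match:
        filename = unquote(filename_match.group(1))
        guessed = mimetypes.guess_type(filename)[0]
        if guessed:
            return guessed
    # Assumed default; the patch itself leaves main_content_type unset here.
    return 'application/octet-stream'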
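The other behavioural change is the 403 fallback: the HEAD probe decides whether plain requests is enough or whether the page sits behind an anti-bot challenge that cloudscraper must solve first. A self-contained sketch of that pattern, assuming cloudscraper is installed; the helper name fetch_html is illustrative, not part of the codebase:

import cloudscraper
import requests


def fetch_html(url: str, headers: dict = None) -> str:
    # Hypothetical helper; mirrors the fallback added in the hunk above.
    headers = headers or {}

    # Cheap probe first: HEAD follows redirects without downloading the body.
    probe = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

    if probe.status_code == 403:
        # A 403 on the probe usually means an anti-bot challenge
        # (e.g. Cloudflare); cloudscraper solves the JavaScript challenge
        # and retries with the resulting clearance cookies.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    else:
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))

    response.raise_for_status()
    return response.text

cloudscraper.create_scraper() returns a requests.Session subclass, which is why the existing headers, allow_redirects and timeout arguments carry over unchanged in the patch.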
 |