@@ -1,98 +1,93 @@
 import time
+from collections.abc import Mapping
+from typing import Any
 
 import requests
+from requests.exceptions import HTTPError
 
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")
 
-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
 
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None
 
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response
 
-    def check_crawl_status(self, job_id) -> dict:
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response
 
-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id
 
-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response
 
-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
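
For reference, a minimal usage sketch of the client as refactored in this hunk; the API key and target URL are placeholders, not values from the change itself.

# Usage sketch for the refactored FirecrawlApp above.
# 'fc-YOUR-API-KEY' and the example.com URLs are placeholders.
app = FirecrawlApp(api_key='fc-YOUR-API-KEY')

# scrape_url forwards any extra keyword arguments into the JSON request body.
page = app.scrape_url('https://example.com')

# crawl_url returns the job id immediately; with wait=True it polls
# check_crawl_status every poll_interval seconds until the job completes or fails.
result = app.crawl_url('https://example.com', wait=True, poll_interval=5)

Each call goes through _request, which retries up to three times with exponential backoff (sleeping backoff_factor * 2**i seconds, i.e. 0.3 s and then 0.6 s between the three attempts at the defaults) before re-raising the last RequestException.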