# spiderApp.py - Python client for the Spider cloud API (https://api.spider.cloud/v1).
  1. import os
  2. from typing import Literal, Optional, TypedDict
  3. import requests
  4. class RequestParamsDict(TypedDict, total=False):
  5. url: Optional[str]
  6. request: Optional[Literal["http", "chrome", "smart"]]
  7. limit: Optional[int]
  8. return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
  9. tld: Optional[bool]
  10. depth: Optional[int]
  11. cache: Optional[bool]
  12. budget: Optional[dict[str, int]]
  13. locale: Optional[str]
  14. cookies: Optional[str]
  15. stealth: Optional[bool]
  16. headers: Optional[dict[str, str]]
  17. anti_bot: Optional[bool]
  18. metadata: Optional[bool]
  19. viewport: Optional[dict[str, int]]
  20. encoding: Optional[str]
  21. subdomains: Optional[bool]
  22. user_agent: Optional[str]
  23. store_data: Optional[bool]
  24. gpt_config: Optional[list[str]]
  25. fingerprint: Optional[bool]
  26. storageless: Optional[bool]
  27. readability: Optional[bool]
  28. proxy_enabled: Optional[bool]
  29. respect_robots: Optional[bool]
  30. query_selector: Optional[str]
  31. full_resources: Optional[bool]
  32. request_timeout: Optional[int]
  33. run_in_background: Optional[bool]
  34. skip_config_checks: Optional[bool]
  35. class Spider:
  36. def __init__(self, api_key: Optional[str] = None):
  37. """
  38. Initialize the Spider with an API key.
  39. :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
  40. :raises ValueError: If no API key is provided.
  41. """
  42. self.api_key = api_key or os.getenv("SPIDER_API_KEY")
  43. if self.api_key is None:
  44. raise ValueError("No API key provided")
  45. def api_post(
  46. self,
  47. endpoint: str,
  48. data: dict,
  49. stream: bool,
  50. content_type: str = "application/json",
  51. ):
  52. """
  53. Send a POST request to the specified API endpoint.
  54. :param endpoint: The API endpoint to which the POST request is sent.
  55. :param data: The data (dictionary) to be sent in the POST request.
  56. :param stream: Boolean indicating if the response should be streamed.
  57. :return: The JSON response or the raw response stream if stream is True.
  58. """
  59. headers = self._prepare_headers(content_type)
  60. response = self._post_request(
  61. f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
  62. )
  63. if stream:
  64. return response
  65. elif response.status_code == 200:
  66. return response.json()
  67. else:
  68. self._handle_error(response, f"post to {endpoint}")
  69. def api_get(
  70. self, endpoint: str, stream: bool, content_type: str = "application/json"
  71. ):
  72. """
  73. Send a GET request to the specified endpoint.
  74. :param endpoint: The API endpoint from which to retrieve data.
  75. :return: The JSON decoded response.
  76. """
  77. headers = self._prepare_headers(content_type)
  78. response = self._get_request(
  79. f"https://api.spider.cloud/v1/{endpoint}", headers, stream
  80. )
  81. if response.status_code == 200:
  82. return response.json()
  83. else:
  84. self._handle_error(response, f"get from {endpoint}")
  85. def get_credits(self):
  86. """
  87. Retrieve the account's remaining credits.
  88. :return: JSON response containing the number of credits left.
  89. """
  90. return self.api_get("credits", stream=False)
  91. def scrape_url(
  92. self,
  93. url: str,
  94. params: Optional[RequestParamsDict] = None,
  95. stream: bool = False,
  96. content_type: str = "application/json",
  97. ):
  98. """
  99. Scrape data from the specified URL.
  100. :param url: The URL from which to scrape data.
  101. :param params: Optional dictionary of additional parameters for the scrape request.
  102. :return: JSON response containing the scraping results.
  103. """
  104. params = params or {}
  105. # Add { "return_format": "markdown" } to the params if not already present
  106. if "return_format" not in params:
  107. params["return_format"] = "markdown"
  108. # Set limit to 1
  109. params["limit"] = 1
  110. return self.api_post(
  111. "crawl", {"url": url, **(params or {})}, stream, content_type
  112. )
  113. def crawl_url(
  114. self,
  115. url: str,
  116. params: Optional[RequestParamsDict] = None,
  117. stream: bool = False,
  118. content_type: str = "application/json",
  119. ):
  120. """
  121. Start crawling at the specified URL.
  122. :param url: The URL to begin crawling.
  123. :param params: Optional dictionary with additional parameters to customize the crawl.
  124. :param stream: Boolean indicating if the response should be streamed. Defaults to False.
  125. :return: JSON response or the raw response stream if streaming enabled.
  126. """
  127. params = params or {}
  128. # Add { "return_format": "markdown" } to the params if not already present
  129. if "return_format" not in params:
  130. params["return_format"] = "markdown"
  131. return self.api_post(
  132. "crawl", {"url": url, **(params or {})}, stream, content_type
  133. )
  134. def links(
  135. self,
  136. url: str,
  137. params: Optional[RequestParamsDict] = None,
  138. stream: bool = False,
  139. content_type: str = "application/json",
  140. ):
  141. """
  142. Retrieve links from the specified URL.
  143. :param url: The URL from which to extract links.
  144. :param params: Optional parameters for the link retrieval request.
  145. :return: JSON response containing the links.
  146. """
  147. return self.api_post(
  148. "links", {"url": url, **(params or {})}, stream, content_type
  149. )
  150. def extract_contacts(
  151. self,
  152. url: str,
  153. params: Optional[RequestParamsDict] = None,
  154. stream: bool = False,
  155. content_type: str = "application/json",
  156. ):
  157. """
  158. Extract contact information from the specified URL.
  159. :param url: The URL from which to extract contact information.
  160. :param params: Optional parameters for the contact extraction.
  161. :return: JSON response containing extracted contact details.
  162. """
  163. return self.api_post(
  164. "pipeline/extract-contacts",
  165. {"url": url, **(params or {})},
  166. stream,
  167. content_type,
  168. )
  169. def label(
  170. self,
  171. url: str,
  172. params: Optional[RequestParamsDict] = None,
  173. stream: bool = False,
  174. content_type: str = "application/json",
  175. ):
  176. """
  177. Apply labeling to data extracted from the specified URL.
  178. :param url: The URL to label data from.
  179. :param params: Optional parameters to guide the labeling process.
  180. :return: JSON response with labeled data.
  181. """
  182. return self.api_post(
  183. "pipeline/label", {"url": url, **(params or {})}, stream, content_type
  184. )
  185. def _prepare_headers(self, content_type: str = "application/json"):
  186. return {
  187. "Content-Type": content_type,
  188. "Authorization": f"Bearer {self.api_key}",
  189. "User-Agent": "Spider-Client/0.0.27",
  190. }
  191. def _post_request(self, url: str, data, headers, stream=False):
  192. return requests.post(url, headers=headers, json=data, stream=stream)
  193. def _get_request(self, url: str, headers, stream=False):
  194. return requests.get(url, headers=headers, stream=stream)
  195. def _delete_request(self, url: str, headers, stream=False):
  196. return requests.delete(url, headers=headers, stream=stream)
  197. def _handle_error(self, response, action):
  198. if response.status_code in [402, 409, 500]:
  199. error_message = response.json().get("error", "Unknown error occurred")
  200. raise Exception(
  201. f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}"
  202. )
  203. else:
  204. raise Exception(
  205. f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}"
  206. )