website_service.py

import datetime
import json

import requests
from flask_login import current_user

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        """Validate the arguments used to create a website-crawl document."""
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        """Start a crawl job with the configured provider and return its initial status."""
        provider = args.get("provider")
        url = args.get("url")
        options = args.get("options")
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                # crawl only the given page
                params = {
                    "crawlerOptions": {
                        "includes": [],
                        "excludes": [],
                        "generateImgAltText": True,
                        "limit": 1,
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "crawlerOptions": {
                        "includes": includes or [],
                        "excludes": excludes or [],
                        "generateImgAltText": True,
                        "limit": options.get("limit", 1),
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
                if options.get("max_depth"):
                    params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            # cache the start time so get_crawl_status can report how long the crawl took
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                # single-page read via the Jina Reader endpoint
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                # multi-page crawl via the Jina adaptive-crawl endpoint
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        """Return the progress of a crawl job and, once completed, its results."""
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                # report the elapsed time based on the start time cached in crawl_url
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                # fetch the content of every processed URL and normalize it
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict | None:
        """Return the crawled content of a single URL from a finished crawl job."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        if provider == "firecrawl":
            # prefer the crawl result cached in storage; otherwise ask Firecrawl directly
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                data = storage.load_once(file_key)
                if data:
                    data = json.loads(data.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return item
            return None
        elif provider == "jinareader":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                data = storage.load_once(file_key)
                if data:
                    # cached results are assumed to use the same normalized shape as the
                    # firecrawl branch (title / source_url / description / markdown)
                    data = json.loads(data.decode("utf-8"))
                    for item in data:
                        if item.get("source_url") == url:
                            return item
                return None
            elif not job_id:
                # no job id: read the single page directly via Jina Reader
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return response.json().get("data")
            else:
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return item.get("data", {})
        else:
            raise ValueError("Invalid provider")

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict | None:
        """Scrape a single URL synchronously and return the result."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
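

# A minimal usage sketch, assuming a Flask request context with an authenticated
# `current_user` and "website" credentials for the tenant already stored through
# ApiKeyAuthService; the URL and option values below are placeholders.
#
#     args = {
#         "provider": "firecrawl",
#         "url": "https://example.com",
#         "options": {"crawl_sub_pages": True, "limit": 5, "only_main_content": True},
#     }
#     WebsiteService.document_create_args_validate(args)
#     job = WebsiteService.crawl_url(args)  # -> {"status": "active", "job_id": "..."}
#     status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")
#     if status["status"] == "completed":
#         page = WebsiteService.get_crawl_url_data(
#             job["job_id"], "firecrawl", "https://example.com", current_user.current_tenant_id
#         )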