website_service.py

import datetime
import json

from flask_login import current_user

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:

    @classmethod
    def document_create_args_validate(cls, args: dict):
        if 'url' not in args or not args['url']:
            raise ValueError('url is required')
        if 'options' not in args or not args['options']:
            raise ValueError('options is required')
        if 'limit' not in args['options'] or not args['options']['limit']:
            raise ValueError('limit is required')
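
    # Illustrative shape of the args dict validated above and consumed by
    # crawl_url below (values are examples, not from the original source):
    #   {
    #       'provider': 'firecrawl',
    #       'url': 'https://example.com',
    #       'options': {'crawl_sub_pages': True, 'limit': 10,
    #                   'includes': 'blog/*,docs/*', 'max_depth': 2,
    #                   'only_main_content': True}
    #   }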

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        provider = args.get('provider')
        url = args.get('url')
        options = args.get('options')
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id,
                                                             'website',
                                                             provider)
        if provider == 'firecrawl':
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id,
                token=credentials.get('config').get('api_key')
            )
            firecrawl_app = FirecrawlApp(api_key=api_key,
                                         base_url=credentials.get('config').get('base_url', None))
            crawl_sub_pages = options.get('crawl_sub_pages', False)
            only_main_content = options.get('only_main_content', False)
            if not crawl_sub_pages:
                # single-page crawl: limit the job to the given url only
                params = {
                    'crawlerOptions': {
                        'includes': [],
                        'excludes': [],
                        'generateImgAltText': True,
                        'limit': 1,
                        'returnOnlyUrls': False,
                        'pageOptions': {
                            'onlyMainContent': only_main_content,
                            'includeHtml': False
                        }
                    }
                }
            else:
                # includes/excludes arrive as comma-separated strings
                includes = options.get('includes').split(',') if options.get('includes') else []
                excludes = options.get('excludes').split(',') if options.get('excludes') else []
                params = {
                    'crawlerOptions': {
                        'includes': includes,
                        'excludes': excludes,
                        'generateImgAltText': True,
                        'limit': options.get('limit', 1),
                        'returnOnlyUrls': False,
                        'pageOptions': {
                            'onlyMainContent': only_main_content,
                            'includeHtml': False
                        }
                    }
                }
                if options.get('max_depth'):
                    params['crawlerOptions']['maxDepth'] = options.get('max_depth')
            job_id = firecrawl_app.crawl_url(url, params)
            # cache the start time for an hour so get_crawl_status can report
            # how long the crawl took
            website_crawl_time_cache_key = f'website_crawl_{job_id}'
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {
                'status': 'active',
                'job_id': job_id
            }
        else:
            raise ValueError('Invalid provider')
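
    # Note: Firecrawl crawls run asynchronously. crawl_url above only submits
    # the job and returns its job_id; callers are expected to poll
    # get_crawl_status below until the reported status is 'completed'.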

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id,
                                                             'website',
                                                             provider)
        if provider == 'firecrawl':
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id,
                token=credentials.get('config').get('api_key')
            )
            firecrawl_app = FirecrawlApp(api_key=api_key,
                                         base_url=credentials.get('config').get('base_url', None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                'status': result.get('status', 'active'),
                'job_id': job_id,
                'total': result.get('total', 0),
                'current': result.get('current', 0),
                'data': result.get('data', [])
            }
            if crawl_status_data['status'] == 'completed':
                # report elapsed time based on the start time cached by crawl_url
                website_crawl_time_cache_key = f'website_crawl_{job_id}'
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data['time_consuming'] = f'{time_consuming:.2f}'
                    redis_client.delete(website_crawl_time_cache_key)
        else:
            raise ValueError('Invalid provider')
        return crawl_status_data
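
    # Example of the dict get_crawl_status returns once a job finishes
    # (values illustrative): {'status': 'completed', 'job_id': '...',
    # 'total': 5, 'current': 5, 'data': [...], 'time_consuming': '42.00'}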

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id,
                                                             'website',
                                                             provider)
        if provider == 'firecrawl':
            # prefer a crawl result already persisted to storage; otherwise
            # fall back to fetching the finished job from the Firecrawl API
            file_key = 'website_files/' + job_id + '.txt'
            if storage.exists(file_key):
                data = storage.load_once(file_key)
                if data:
                    data = json.loads(data.decode('utf-8'))
            else:
                # decrypt api_key
                api_key = encrypter.decrypt_token(
                    tenant_id=tenant_id,
                    token=credentials.get('config').get('api_key')
                )
                firecrawl_app = FirecrawlApp(api_key=api_key,
                                             base_url=credentials.get('config').get('base_url', None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get('status') != 'completed':
                    raise ValueError('Crawl job is not completed')
                data = result.get('data')
            if data:
                for item in data:
                    if item.get('source_url') == url:
                        return item
            return None
        else:
            raise ValueError('Invalid provider')

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id,
                                                             'website',
                                                             provider)
        if provider == 'firecrawl':
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=tenant_id,
                token=credentials.get('config').get('api_key')
            )
            firecrawl_app = FirecrawlApp(api_key=api_key,
                                         base_url=credentials.get('config').get('base_url', None))
            params = {
                'pageOptions': {
                    'onlyMainContent': only_main_content,
                    'includeHtml': False
                }
            }
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError('Invalid provider')
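

# Illustrative usage sketch (an assumption, not part of the original module):
# assumes a Flask request context with an authenticated current_user and a
# Firecrawl API key already saved via ApiKeyAuthService.
#
#   args = {
#       'provider': 'firecrawl',
#       'url': 'https://example.com',
#       'options': {'crawl_sub_pages': True, 'limit': 10, 'only_main_content': True}
#   }
#   WebsiteService.document_create_args_validate(args)
#   job = WebsiteService.crawl_url(args)        # -> {'status': 'active', 'job_id': '...'}
#   status = WebsiteService.get_crawl_status(job['job_id'], 'firecrawl')
#   if status['status'] == 'completed':
#       page = WebsiteService.get_crawl_url_data(job['job_id'], 'firecrawl',
#                                                'https://example.com', tenant_id)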