# external_knowledge_service.py

import json
import logging
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union

import httpx
import validators

# from tasks.external_document_indexing_task import external_document_indexing_task
from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
    Dataset,
    ExternalKnowledgeApis,
    ExternalKnowledgeBindings,
)
from services.entities.external_knowledge_entities.external_knowledge_entities import (
    Authorization,
    ExternalKnowledgeApiSetting,
)
from services.errors.dataset import DatasetNameDuplicateError


  20. class ExternalDatasetService:
  21. @staticmethod
  22. def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
  23. query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
  24. ExternalKnowledgeApis.created_at.desc()
  25. )
  26. if search:
  27. query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))
  28. external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
  29. return external_knowledge_apis.items, external_knowledge_apis.total
  30. @classmethod
  31. def validate_api_list(cls, api_settings: dict):
  32. if not api_settings:
  33. raise ValueError("api list is empty")
  34. if "endpoint" not in api_settings and not api_settings["endpoint"]:
  35. raise ValueError("endpoint is required")
  36. if "api_key" not in api_settings and not api_settings["api_key"]:
  37. raise ValueError("api_key is required")
  38. @staticmethod
  39. def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
  40. ExternalDatasetService.check_endpoint_and_api_key(args.get("settings"))
  41. external_knowledge_api = ExternalKnowledgeApis(
  42. tenant_id=tenant_id,
  43. created_by=user_id,
  44. updated_by=user_id,
  45. name=args.get("name"),
  46. description=args.get("description", ""),
  47. settings=json.dumps(args.get("settings"), ensure_ascii=False),
  48. )
  49. db.session.add(external_knowledge_api)
  50. db.session.commit()
  51. return external_knowledge_api
  52. @staticmethod
  53. def check_endpoint_and_api_key(settings: dict):
  54. if "endpoint" not in settings or not settings["endpoint"]:
  55. raise ValueError("endpoint is required")
  56. if "api_key" not in settings or not settings["api_key"]:
  57. raise ValueError("api_key is required")
  58. endpoint = f"{settings['endpoint']}/retrieval"
  59. api_key = settings["api_key"]
  60. if not validators.url(endpoint):
  61. raise ValueError(f"invalid endpoint: {endpoint}")
  62. try:
  63. response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
  64. except Exception as e:
  65. raise ValueError(f"failed to connect to the endpoint: {endpoint}")
  66. if response.status_code == 502:
  67. raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
  68. if response.status_code == 404:
  69. raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
  70. if response.status_code == 403:
  71. raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")
  72. @staticmethod
  73. def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
  74. return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()
  75. @staticmethod
  76. def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
  77. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  78. id=external_knowledge_api_id, tenant_id=tenant_id
  79. ).first()
  80. if external_knowledge_api is None:
  81. raise ValueError("api template not found")
  82. external_knowledge_api.name = args.get("name")
  83. external_knowledge_api.description = args.get("description", "")
  84. external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
  85. external_knowledge_api.updated_by = user_id
  86. external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
  87. db.session.commit()
  88. return external_knowledge_api
  89. @staticmethod
  90. def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
  91. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  92. id=external_knowledge_api_id, tenant_id=tenant_id
  93. ).first()
  94. if external_knowledge_api is None:
  95. raise ValueError("api template not found")
  96. db.session.delete(external_knowledge_api)
  97. db.session.commit()
  98. @staticmethod
  99. def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
  100. count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
  101. if count > 0:
  102. return True, count
  103. return False, 0
  104. @staticmethod
  105. def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
  106. external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
  107. dataset_id=dataset_id, tenant_id=tenant_id
  108. ).first()
  109. if not external_knowledge_binding:
  110. raise ValueError("external knowledge binding not found")
  111. return external_knowledge_binding
  112. @staticmethod
  113. def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
  114. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  115. id=external_knowledge_api_id, tenant_id=tenant_id
  116. ).first()
  117. if external_knowledge_api is None:
  118. raise ValueError("api template not found")
  119. settings = json.loads(external_knowledge_api.settings)
  120. for setting in settings:
  121. custom_parameters = setting.get("document_process_setting")
  122. if custom_parameters:
  123. for parameter in custom_parameters:
  124. if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
  125. raise ValueError(f'{parameter.get("name")} is required')
  126. @staticmethod
  127. def process_external_api(
  128. settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
  129. ) -> httpx.Response:
  130. """
  131. do http request depending on api bundle
  132. """
  133. kwargs = {
  134. "url": settings.url,
  135. "headers": settings.headers,
  136. "follow_redirects": True,
  137. }
  138. response = getattr(ssrf_proxy, settings.request_method)(data=json.dumps(settings.params), files=files, **kwargs)
  139. return response
  140. @staticmethod
  141. def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
  142. authorization = deepcopy(authorization)
  143. if headers:
  144. headers = deepcopy(headers)
  145. else:
  146. headers = {}
  147. if authorization.type == "api-key":
  148. if authorization.config is None:
  149. raise ValueError("authorization config is required")
  150. if authorization.config.api_key is None:
  151. raise ValueError("api_key is required")
  152. if not authorization.config.header:
  153. authorization.config.header = "Authorization"
  154. if authorization.config.type == "bearer":
  155. headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
  156. elif authorization.config.type == "basic":
  157. headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
  158. elif authorization.config.type == "custom":
  159. headers[authorization.config.header] = authorization.config.api_key
  160. return headers
  161. @staticmethod
  162. def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
  163. return ExternalKnowledgeApiSetting.parse_obj(settings)
  164. @staticmethod
  165. def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
  166. # check if dataset name already exists
  167. if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
  168. raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
  169. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  170. id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
  171. ).first()
  172. if external_knowledge_api is None:
  173. raise ValueError("api template not found")
  174. dataset = Dataset(
  175. tenant_id=tenant_id,
  176. name=args.get("name"),
  177. description=args.get("description", ""),
  178. provider="external",
  179. retrieval_model=args.get("external_retrieval_model"),
  180. created_by=user_id,
  181. )
  182. db.session.add(dataset)
  183. db.session.flush()
  184. external_knowledge_binding = ExternalKnowledgeBindings(
  185. tenant_id=tenant_id,
  186. dataset_id=dataset.id,
  187. external_knowledge_api_id=args.get("external_knowledge_api_id"),
  188. external_knowledge_id=args.get("external_knowledge_id"),
  189. created_by=user_id,
  190. )
  191. db.session.add(external_knowledge_binding)
  192. db.session.commit()
  193. return dataset
  194. @staticmethod
  195. def fetch_external_knowledge_retrieval(
  196. tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
  197. ) -> list:
  198. external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
  199. dataset_id=dataset_id, tenant_id=tenant_id
  200. ).first()
  201. if not external_knowledge_binding:
  202. raise ValueError("external knowledge binding not found")
  203. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  204. id=external_knowledge_binding.external_knowledge_api_id
  205. ).first()
  206. if not external_knowledge_api:
  207. raise ValueError("external api template not found")
  208. settings = json.loads(external_knowledge_api.settings)
  209. headers = {"Content-Type": "application/json"}
  210. if settings.get("api_key"):
  211. headers["Authorization"] = f"Bearer {settings.get('api_key')}"
  212. score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False
  213. score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0
  214. request_params = {
  215. "retrieval_setting": {
  216. "top_k": external_retrieval_parameters.get("top_k"),
  217. "score_threshold": score_threshold,
  218. },
  219. "query": query,
  220. "knowledge_id": external_knowledge_binding.external_knowledge_id,
  221. }
  222. external_knowledge_api_setting = {
  223. "url": f"{settings.get('endpoint')}/retrieval",
  224. "request_method": "post",
  225. "headers": headers,
  226. "params": request_params,
  227. }
  228. response = ExternalDatasetService.process_external_api(
  229. ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None
  230. )
  231. if response.status_code == 200:
  232. return response.json().get("records", [])
  233. return []