message_file_parser.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. import re
  2. from collections.abc import Mapping, Sequence
  3. from typing import Any, Union
  4. from urllib.parse import parse_qs, urlparse
  5. import requests
  6. from core.file.file_obj import FileBelongsTo, FileExtraConfig, FileTransferMethod, FileType, FileVar
  7. from extensions.ext_database import db
  8. from models.account import Account
  9. from models.model import EndUser, MessageFile, UploadFile
  10. from services.file_service import IMAGE_EXTENSIONS
  11. class MessageFileParser:
  12. def __init__(self, tenant_id: str, app_id: str) -> None:
  13. self.tenant_id = tenant_id
  14. self.app_id = app_id
  15. def validate_and_transform_files_arg(self, files: Sequence[Mapping[str, Any]], file_extra_config: FileExtraConfig,
  16. user: Union[Account, EndUser]) -> list[FileVar]:
  17. """
  18. validate and transform files arg
  19. :param files:
  20. :param file_extra_config:
  21. :param user:
  22. :return:
  23. """
  24. for file in files:
  25. if not isinstance(file, dict):
  26. raise ValueError('Invalid file format, must be dict')
  27. if not file.get('type'):
  28. raise ValueError('Missing file type')
  29. FileType.value_of(file.get('type'))
  30. if not file.get('transfer_method'):
  31. raise ValueError('Missing file transfer method')
  32. FileTransferMethod.value_of(file.get('transfer_method'))
  33. if file.get('transfer_method') == FileTransferMethod.REMOTE_URL.value:
  34. if not file.get('url'):
  35. raise ValueError('Missing file url')
  36. if not file.get('url').startswith('http'):
  37. raise ValueError('Invalid file url')
  38. if file.get('transfer_method') == FileTransferMethod.LOCAL_FILE.value and not file.get('upload_file_id'):
  39. raise ValueError('Missing file upload_file_id')
  40. if file.get('transform_method') == FileTransferMethod.TOOL_FILE.value and not file.get('tool_file_id'):
  41. raise ValueError('Missing file tool_file_id')
  42. # transform files to file objs
  43. type_file_objs = self._to_file_objs(files, file_extra_config)
  44. # validate files
  45. new_files = []
  46. for file_type, file_objs in type_file_objs.items():
  47. if file_type == FileType.IMAGE:
  48. # parse and validate files
  49. image_config = file_extra_config.image_config
  50. # check if image file feature is enabled
  51. if not image_config:
  52. continue
  53. # Validate number of files
  54. if len(files) > image_config['number_limits']:
  55. raise ValueError(f"Number of image files exceeds the maximum limit {image_config['number_limits']}")
  56. for file_obj in file_objs:
  57. # Validate transfer method
  58. if file_obj.transfer_method.value not in image_config['transfer_methods']:
  59. raise ValueError(f'Invalid transfer method: {file_obj.transfer_method.value}')
  60. # Validate file type
  61. if file_obj.type != FileType.IMAGE:
  62. raise ValueError(f'Invalid file type: {file_obj.type}')
  63. if file_obj.transfer_method == FileTransferMethod.REMOTE_URL:
  64. # check remote url valid and is image
  65. result, error = self._check_image_remote_url(file_obj.url)
  66. if result is False:
  67. raise ValueError(error)
  68. elif file_obj.transfer_method == FileTransferMethod.LOCAL_FILE:
  69. # get upload file from upload_file_id
  70. upload_file = (db.session.query(UploadFile)
  71. .filter(
  72. UploadFile.id == file_obj.related_id,
  73. UploadFile.tenant_id == self.tenant_id,
  74. UploadFile.created_by == user.id,
  75. UploadFile.created_by_role == ('account' if isinstance(user, Account) else 'end_user'),
  76. UploadFile.extension.in_(IMAGE_EXTENSIONS)
  77. ).first())
  78. # check upload file is belong to tenant and user
  79. if not upload_file:
  80. raise ValueError('Invalid upload file')
  81. new_files.append(file_obj)
  82. # return all file objs
  83. return new_files
  84. def transform_message_files(self, files: list[MessageFile], file_extra_config: FileExtraConfig):
  85. """
  86. transform message files
  87. :param files:
  88. :param file_extra_config:
  89. :return:
  90. """
  91. # transform files to file objs
  92. type_file_objs = self._to_file_objs(files, file_extra_config)
  93. # return all file objs
  94. return [file_obj for file_objs in type_file_objs.values() for file_obj in file_objs]
  95. def _to_file_objs(self, files: list[Union[dict, MessageFile]],
  96. file_extra_config: FileExtraConfig) -> dict[FileType, list[FileVar]]:
  97. """
  98. transform files to file objs
  99. :param files:
  100. :param file_extra_config:
  101. :return:
  102. """
  103. type_file_objs: dict[FileType, list[FileVar]] = {
  104. # Currently only support image
  105. FileType.IMAGE: []
  106. }
  107. if not files:
  108. return type_file_objs
  109. # group by file type and convert file args or message files to FileObj
  110. for file in files:
  111. if isinstance(file, MessageFile):
  112. if file.belongs_to == FileBelongsTo.ASSISTANT.value:
  113. continue
  114. file_obj = self._to_file_obj(file, file_extra_config)
  115. if file_obj.type not in type_file_objs:
  116. continue
  117. type_file_objs[file_obj.type].append(file_obj)
  118. return type_file_objs
  119. def _to_file_obj(self, file: Union[dict, MessageFile], file_extra_config: FileExtraConfig):
  120. """
  121. transform file to file obj
  122. :param file:
  123. :return:
  124. """
  125. if isinstance(file, dict):
  126. transfer_method = FileTransferMethod.value_of(file.get('transfer_method'))
  127. if transfer_method != FileTransferMethod.TOOL_FILE:
  128. return FileVar(
  129. tenant_id=self.tenant_id,
  130. type=FileType.value_of(file.get('type')),
  131. transfer_method=transfer_method,
  132. url=file.get('url') if transfer_method == FileTransferMethod.REMOTE_URL else None,
  133. related_id=file.get('upload_file_id') if transfer_method == FileTransferMethod.LOCAL_FILE else None,
  134. extra_config=file_extra_config
  135. )
  136. return FileVar(
  137. tenant_id=self.tenant_id,
  138. type=FileType.value_of(file.get('type')),
  139. transfer_method=transfer_method,
  140. url=None,
  141. related_id=file.get('tool_file_id'),
  142. extra_config=file_extra_config
  143. )
  144. else:
  145. return FileVar(
  146. id=file.id,
  147. tenant_id=self.tenant_id,
  148. type=FileType.value_of(file.type),
  149. transfer_method=FileTransferMethod.value_of(file.transfer_method),
  150. url=file.url,
  151. related_id=file.upload_file_id or None,
  152. extra_config=file_extra_config
  153. )
  154. def _check_image_remote_url(self, url):
  155. try:
  156. headers = {
  157. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
  158. }
  159. def is_s3_presigned_url(url):
  160. try:
  161. parsed_url = urlparse(url)
  162. if 'amazonaws.com' not in parsed_url.netloc:
  163. return False
  164. query_params = parse_qs(parsed_url.query)
  165. required_params = ['Signature', 'Expires']
  166. for param in required_params:
  167. if param not in query_params:
  168. return False
  169. if not query_params['Expires'][0].isdigit():
  170. return False
  171. signature = query_params['Signature'][0]
  172. if not re.match(r'^[A-Za-z0-9+/]+={0,2}$', signature):
  173. return False
  174. return True
  175. except Exception:
  176. return False
  177. if is_s3_presigned_url(url):
  178. response = requests.get(url, headers=headers, allow_redirects=True)
  179. if response.status_code in {200, 304}:
  180. return True, ""
  181. response = requests.head(url, headers=headers, allow_redirects=True)
  182. if response.status_code in {200, 304}:
  183. return True, ""
  184. else:
  185. return False, "URL does not exist."
  186. except requests.RequestException as e:
  187. return False, f"Error checking URL: {e}"