file_extractor.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
import tempfile
import uuid
from pathlib import Path
from typing import List, Optional, Union
from urllib.parse import urlparse

import requests
from flask import current_app
from langchain.document_loaders import Docx2txtLoader, TextLoader
from langchain.schema import Document

from core.data_loader.loader.csv_loader import CSVLoader
from core.data_loader.loader.excel import ExcelLoader
from core.data_loader.loader.html import HTMLLoader
from core.data_loader.loader.markdown import MarkdownLoader
from core.data_loader.loader.pdf import PdfLoader
from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
from extensions.ext_storage import storage
from models.model import UploadFile
  22. SUPPORT_URL_CONTENT_TYPES = ['application/pdf', 'text/plain']
  23. USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
  24. class FileExtractor:
  25. @classmethod
  26. def load(cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False) -> Union[List[Document] | str]:
  27. with tempfile.TemporaryDirectory() as temp_dir:
  28. suffix = Path(upload_file.key).suffix
  29. file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
  30. storage.download(upload_file.key, file_path)
  31. return cls.load_from_file(file_path, return_text, upload_file, is_automatic)
  32. @classmethod
  33. def load_from_url(cls, url: str, return_text: bool = False) -> Union[List[Document] | str]:
  34. response = requests.get(url, headers={
  35. "User-Agent": USER_AGENT
  36. })
  37. with tempfile.TemporaryDirectory() as temp_dir:
  38. suffix = Path(url).suffix
  39. file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
  40. with open(file_path, 'wb') as file:
  41. file.write(response.content)
  42. return cls.load_from_file(file_path, return_text)
  43. @classmethod
  44. def load_from_file(cls, file_path: str, return_text: bool = False,
  45. upload_file: Optional[UploadFile] = None,
  46. is_automatic: bool = False) -> Union[List[Document] | str]:
  47. input_file = Path(file_path)
  48. delimiter = '\n'
  49. file_extension = input_file.suffix.lower()
  50. etl_type = current_app.config['ETL_TYPE']
  51. unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
  52. if etl_type == 'Unstructured':
  53. if file_extension == '.xlsx':
  54. loader = ExcelLoader(file_path)
  55. elif file_extension == '.pdf':
  56. loader = PdfLoader(file_path, upload_file=upload_file)
  57. elif file_extension in ['.md', '.markdown']:
  58. loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
  59. else MarkdownLoader(file_path, autodetect_encoding=True)
  60. elif file_extension in ['.htm', '.html']:
  61. loader = HTMLLoader(file_path)
  62. elif file_extension == '.docx':
  63. loader = Docx2txtLoader(file_path)
  64. elif file_extension == '.csv':
  65. loader = CSVLoader(file_path, autodetect_encoding=True)
  66. elif file_extension == '.msg':
  67. loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
  68. elif file_extension == '.eml':
  69. loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
  70. elif file_extension == '.ppt':
  71. loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
  72. elif file_extension == '.pptx':
  73. loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
  74. elif file_extension == '.xml':
  75. loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
  76. else:
  77. # txt
  78. loader = UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
  79. else TextLoader(file_path, autodetect_encoding=True)
  80. else:
  81. if file_extension == '.xlsx':
  82. loader = ExcelLoader(file_path)
  83. elif file_extension == '.pdf':
  84. loader = PdfLoader(file_path, upload_file=upload_file)
  85. elif file_extension in ['.md', '.markdown']:
  86. loader = MarkdownLoader(file_path, autodetect_encoding=True)
  87. elif file_extension in ['.htm', '.html']:
  88. loader = HTMLLoader(file_path)
  89. elif file_extension == '.docx':
  90. loader = Docx2txtLoader(file_path)
  91. elif file_extension == '.csv':
  92. loader = CSVLoader(file_path, autodetect_encoding=True)
  93. else:
  94. # txt
  95. loader = TextLoader(file_path, autodetect_encoding=True)
  96. return delimiter.join([document.page_content for document in loader.load()]) if return_text else loader.load()