pdf.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. import logging
  2. from typing import List, Optional
  3. from langchain.document_loaders import PyPDFium2Loader
  4. from langchain.document_loaders.base import BaseLoader
  5. from langchain.schema import Document
  6. from extensions.ext_storage import storage
  7. from models.model import UploadFile
  8. logger = logging.getLogger(__name__)
  class PdfLoader(BaseLoader):
      """Load a PDF file as documents, caching the extracted plaintext.

      When an ``upload_file`` with a non-empty ``hash`` is supplied, the text
      extracted from the PDF is saved to ``storage`` under a key built from the
      upload's ``tenant_id`` and ``hash``. Later loads of the same upload read
      that cached text instead of parsing the PDF again.

      Args:
          file_path: Path to the file to load.
          upload_file: Optional upload record whose ``tenant_id`` and ``hash``
              identify the cache entry. When it is missing, or has no hash,
              no cache lookup or save is performed.
      """

      def __init__(
          self,
          file_path: str,
          upload_file: Optional[UploadFile] = None
      ):
          """Initialize with file path and optional upload record."""
          self._file_path = file_path
          self._upload_file = upload_file

      def _plaintext_cache_key(self) -> str:
          """Return the storage key of the cached plaintext, or '' if none applies."""
          upload_file = self._upload_file
          if not upload_file or not upload_file.hash:
              return ''
          # NOTE(review): '.0625' looks like a cache-format version tag; its
          # meaning is not visible from this file -- confirm before changing.
          return ('upload_files/' + upload_file.tenant_id + '/'
                  + upload_file.hash + '.0625.plaintext')

      def load(self) -> List[Document]:
          """Load the PDF, preferring previously cached plaintext.

          Returns:
              On a cache hit, a single-element list holding one ``Document``
              whose ``page_content`` is the cached text. Otherwise the list
              returned by ``PyPDFium2Loader(...).load()``.

          NOTE(review): the two paths return differently shaped lists (one
          joined document vs. the parser's documents); kept as-is so existing
          callers see no change.
          """
          cache_key = self._plaintext_cache_key()

          if cache_key:
              try:
                  cached_text = storage.load(cache_key).decode('utf-8')
              except FileNotFoundError:
                  # Cache miss: fall through and parse the PDF.
                  pass
              else:
                  return [Document(page_content=cached_text)]

          documents = PyPDFium2Loader(file_path=self._file_path).load()

          # Save plaintext file for caching. Reaching this point with a key
          # means the cached copy was not found above, so no separate
          # "already exists" flag is needed.
          if cache_key:
              text = "\n\n".join(document.page_content for document in documents)
              storage.save(cache_key, text.encode('utf-8'))

          return documents