excel.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. import logging
  2. from typing import List
  3. from langchain.document_loaders.base import BaseLoader
  4. from langchain.schema import Document
  5. from openpyxl.reader.excel import load_workbook
  6. logger = logging.getLogger(__name__)
  7. class ExcelLoader(BaseLoader):
  8. """Load xlxs files.
  9. Args:
  10. file_path: Path to the file to load.
  11. """
  12. def __init__(
  13. self,
  14. file_path: str
  15. ):
  16. """Initialize with file path."""
  17. self._file_path = file_path
  18. def load(self) -> List[Document]:
  19. data = []
  20. keys = []
  21. wb = load_workbook(filename=self._file_path, read_only=True)
  22. # loop over all sheets
  23. for sheet in wb:
  24. if 'A1:A1' == sheet.calculate_dimension():
  25. sheet.reset_dimensions()
  26. for row in sheet.iter_rows(values_only=True):
  27. if all(v is None for v in row):
  28. continue
  29. if keys == []:
  30. keys = list(map(str, row))
  31. else:
  32. row_dict = dict(zip(keys, list(map(str, row))))
  33. row_dict = {k: v for k, v in row_dict.items() if v}
  34. item = ''.join(f'{k}:{v};' for k, v in row_dict.items())
  35. document = Document(page_content=item, metadata={'source': self._file_path})
  36. data.append(document)
  37. return data