excel.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. import json
  2. import logging
  3. from typing import List
  4. from langchain.document_loaders.base import BaseLoader
  5. from langchain.schema import Document
  6. from openpyxl.reader.excel import load_workbook
  7. logger = logging.getLogger(__name__)
  8. class ExcelLoader(BaseLoader):
  9. """Load xlxs files.
  10. Args:
  11. file_path: Path to the file to load.
  12. """
  13. def __init__(
  14. self,
  15. file_path: str
  16. ):
  17. """Initialize with file path."""
  18. self._file_path = file_path
  19. def load(self) -> List[Document]:
  20. data = []
  21. keys = []
  22. wb = load_workbook(filename=self._file_path, read_only=True)
  23. # loop over all sheets
  24. for sheet in wb:
  25. if 'A1:A1' == sheet.calculate_dimension():
  26. sheet.reset_dimensions()
  27. for row in sheet.iter_rows(values_only=True):
  28. if all(v is None for v in row):
  29. continue
  30. if keys == []:
  31. keys = list(map(str, row))
  32. else:
  33. row_dict = dict(zip(keys, list(map(str, row))))
  34. row_dict = {k: v for k, v in row_dict.items() if v}
  35. item = ''.join(f'{k}:{v};' for k, v in row_dict.items())
  36. document = Document(page_content=item, metadata={'source': self._file_path})
  37. data.append(document)
  38. return data