# html.py
  1. import logging
  2. from typing import List
  3. from bs4 import BeautifulSoup
  4. from langchain.document_loaders.base import BaseLoader
  5. from langchain.schema import Document
  6. logger = logging.getLogger(__name__)
  7. class HTMLLoader(BaseLoader):
  8. """Load html files.
  9. Args:
  10. file_path: Path to the file to load.
  11. """
  12. def __init__(
  13. self,
  14. file_path: str
  15. ):
  16. """Initialize with file path."""
  17. self._file_path = file_path
  18. def load(self) -> List[Document]:
  19. return [Document(page_content=self._load_as_text())]
  20. def _load_as_text(self) -> str:
  21. with open(self._file_path, "rb") as fp:
  22. soup = BeautifulSoup(fp, 'html.parser')
  23. text = soup.get_text()
  24. text = text.strip() if text else ''
  25. return text