1234567891011121314151617181920212223242526272829303132333435 |
- import logging
- from typing import List
- from bs4 import BeautifulSoup
- from langchain.document_loaders.base import BaseLoader
- from langchain.schema import Document
- logger = logging.getLogger(__name__)
class HTMLLoader(BaseLoader):
    """Load an HTML file as a single plain-text document.

    The file is parsed with BeautifulSoup's built-in ``html.parser`` and
    only the text content is kept; the markup itself is discarded.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(
        self,
        file_path: str
    ):
        """Initialize with file path."""
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Read the file and return its text wrapped in one ``Document``.

        Returns:
            A single-element list. The document's ``page_content`` is the
            stripped text of the page, and its ``metadata`` carries the
            originating path under the ``"source"`` key.
        """
        return [
            Document(
                page_content=self._load_as_text(),
                # Record where the text came from so downstream consumers
                # can trace the document back to its file.
                metadata={"source": str(self._file_path)},
            )
        ]

    def _load_as_text(self) -> str:
        """Return the file's text with leading/trailing whitespace removed.

        Returns:
            The text extracted by ``soup.get_text()``, stripped; an empty
            string when the page yields no text.
        """
        # Binary mode hands raw bytes to BeautifulSoup, leaving the choice
        # of text encoding to the parser rather than to ``open()``.
        with open(self._file_path, "rb") as fp:
            soup = BeautifulSoup(fp, "html.parser")

        text = soup.get_text()
        return text.strip() if text else ""
|