html_parser.py 566 B

12345678910111213141516171819202122
  1. from pathlib import Path
  2. from typing import Dict
  3. from bs4 import BeautifulSoup
  4. from llama_index.readers.file.base_parser import BaseParser
  5. class HTMLParser(BaseParser):
  6. """HTML parser."""
  7. def _init_parser(self) -> Dict:
  8. """Init parser."""
  9. return {}
  10. def parse_file(self, file: Path, errors: str = "ignore") -> str:
  11. """Parse file."""
  12. with open(file, "rb") as fp:
  13. soup = BeautifulSoup(fp, 'html.parser')
  14. text = soup.get_text()
  15. text = text.strip() if text else ''
  16. return text