|  | @@ -0,0 +1,111 @@
 | 
	
		
			
				|  |  | +"""Markdown parser.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +Contains parser for md files.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +"""
 | 
	
		
			
				|  |  | +import re
 | 
	
		
			
				|  |  | +from pathlib import Path
 | 
	
		
			
				|  |  | +from typing import Any, Dict, List, Optional, Tuple, Union, cast
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +from llama_index.readers.file.base_parser import BaseParser
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +class MarkdownParser(BaseParser):
 | 
	
		
			
				|  |  | +    """Markdown parser.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    Extract text from markdown files.
 | 
	
		
			
				|  |  | +    Returns dictionary with keys as headers and values as the text between headers.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    """
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def __init__(
 | 
	
		
			
				|  |  | +        self,
 | 
	
		
			
				|  |  | +        *args: Any,
 | 
	
		
			
				|  |  | +        remove_hyperlinks: bool = True,
 | 
	
		
			
				|  |  | +        remove_images: bool = True,
 | 
	
		
			
				|  |  | +        **kwargs: Any,
 | 
	
		
			
				|  |  | +    ) -> None:
 | 
	
		
			
				|  |  | +        """Init params."""
 | 
	
		
			
				|  |  | +        super().__init__(*args, **kwargs)
 | 
	
		
			
				|  |  | +        self._remove_hyperlinks = remove_hyperlinks
 | 
	
		
			
				|  |  | +        self._remove_images = remove_images
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
 | 
	
		
			
				|  |  | +        """Convert a markdown file to a dictionary.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        The keys are the headers and the values are the text under each header.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        markdown_tups: List[Tuple[Optional[str], str]] = []
 | 
	
		
			
				|  |  | +        lines = markdown_text.split("\n")
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        current_header = None
 | 
	
		
			
				|  |  | +        current_text = ""
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        for line in lines:
 | 
	
		
			
				|  |  | +            header_match = re.match(r"^#+\s", line)
 | 
	
		
			
				|  |  | +            if header_match:
 | 
	
		
			
				|  |  | +                if current_header is not None:
 | 
	
		
			
				|  |  | +                    markdown_tups.append((current_header, current_text))
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +                current_header = line
 | 
	
		
			
				|  |  | +                current_text = ""
 | 
	
		
			
				|  |  | +            else:
 | 
	
		
			
				|  |  | +                current_text += line + "\n"
 | 
	
		
			
				|  |  | +        markdown_tups.append((current_header, current_text))
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if current_header is not None:
 | 
	
		
			
				|  |  | +            # pass linting, assert keys are defined
 | 
	
		
			
				|  |  | +            markdown_tups = [
 | 
	
		
			
				|  |  | +                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
 | 
	
		
			
				|  |  | +                for key, value in markdown_tups
 | 
	
		
			
				|  |  | +            ]
 | 
	
		
			
				|  |  | +        else:
 | 
	
		
			
				|  |  | +            markdown_tups = [
 | 
	
		
			
				|  |  | +                (key, re.sub("\n", "", value)) for key, value in markdown_tups
 | 
	
		
			
				|  |  | +            ]
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return markdown_tups
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def remove_images(self, content: str) -> str:
 | 
	
		
			
				|  |  | +        """Get a dictionary of a markdown file from its path."""
 | 
	
		
			
				|  |  | +        pattern = r"!{1}\[\[(.*)\]\]"
 | 
	
		
			
				|  |  | +        content = re.sub(pattern, "", content)
 | 
	
		
			
				|  |  | +        return content
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def remove_hyperlinks(self, content: str) -> str:
 | 
	
		
			
				|  |  | +        """Get a dictionary of a markdown file from its path."""
 | 
	
		
			
				|  |  | +        pattern = r"\[(.*?)\]\((.*?)\)"
 | 
	
		
			
				|  |  | +        content = re.sub(pattern, r"\1", content)
 | 
	
		
			
				|  |  | +        return content
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def _init_parser(self) -> Dict:
 | 
	
		
			
				|  |  | +        """Initialize the parser with the config."""
 | 
	
		
			
				|  |  | +        return {}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_tups(
 | 
	
		
			
				|  |  | +        self, filepath: Path, errors: str = "ignore"
 | 
	
		
			
				|  |  | +    ) -> List[Tuple[Optional[str], str]]:
 | 
	
		
			
				|  |  | +        """Parse file into tuples."""
 | 
	
		
			
				|  |  | +        with open(filepath, "r", encoding="utf-8") as f:
 | 
	
		
			
				|  |  | +            content = f.read()
 | 
	
		
			
				|  |  | +        if self._remove_hyperlinks:
 | 
	
		
			
				|  |  | +            content = self.remove_hyperlinks(content)
 | 
	
		
			
				|  |  | +        if self._remove_images:
 | 
	
		
			
				|  |  | +            content = self.remove_images(content)
 | 
	
		
			
				|  |  | +        markdown_tups = self.markdown_to_tups(content)
 | 
	
		
			
				|  |  | +        return markdown_tups
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def parse_file(
 | 
	
		
			
				|  |  | +        self, filepath: Path, errors: str = "ignore"
 | 
	
		
			
				|  |  | +    ) -> Union[str, List[str]]:
 | 
	
		
			
				|  |  | +        """Parse file into string."""
 | 
	
		
			
				|  |  | +        tups = self.parse_tups(filepath, errors=errors)
 | 
	
		
			
				|  |  | +        results = []
 | 
	
		
			
				|  |  | +        # TODO: don't include headers right now
 | 
	
		
			
				|  |  | +        for header, value in tups:
 | 
	
		
			
				|  |  | +            if header is None:
 | 
	
		
			
				|  |  | +                results.append(value)
 | 
	
		
			
				|  |  | +            else:
 | 
	
		
			
				|  |  | +                results.append(f"\n\n{header}\n{value}")
 | 
	
		
			
				|  |  | +        return results
 |