|  | @@ -1,12 +1,20 @@
 | 
	
		
			
				|  |  |  """Abstract interface for document loader implementations."""
 | 
	
		
			
				|  |  | +import datetime
 | 
	
		
			
				|  |  | +import mimetypes
 | 
	
		
			
				|  |  |  import os
 | 
	
		
			
				|  |  |  import tempfile
 | 
	
		
			
				|  |  | +import uuid
 | 
	
		
			
				|  |  |  from urllib.parse import urlparse
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  import requests
 | 
	
		
			
				|  |  | +from docx import Document as DocxDocument
 | 
	
		
			
				|  |  | +from flask import current_app
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  from core.rag.extractor.extractor_base import BaseExtractor
 | 
	
		
			
				|  |  |  from core.rag.models.document import Document
 | 
	
		
			
				|  |  | +from extensions.ext_database import db
 | 
	
		
			
				|  |  | +from extensions.ext_storage import storage
 | 
	
		
			
				|  |  | +from models.model import UploadFile
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  class WordExtractor(BaseExtractor):
 | 
	
	
		
			
				|  | @@ -17,9 +25,12 @@ class WordExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |          file_path: Path to the file to load.
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def __init__(self, file_path: str):
 | 
	
		
			
				|  |  | +    def __init__(self, file_path: str, tenant_id: str, user_id: str):
 | 
	
		
			
				|  |  |          """Initialize with file path."""
 | 
	
		
			
				|  |  |          self.file_path = file_path
 | 
	
		
			
				|  |  | +        self.tenant_id = tenant_id
 | 
	
		
			
				|  |  | +        self.user_id = user_id
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |          if "~" in self.file_path:
 | 
	
		
			
				|  |  |              self.file_path = os.path.expanduser(self.file_path)
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -45,12 +56,7 @@ class WordExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def extract(self) -> list[Document]:
 | 
	
		
			
				|  |  |          """Load given path as single page."""
 | 
	
		
			
				|  |  | -        from docx import Document as docx_Document
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -        document = docx_Document(self.file_path)
 | 
	
		
			
				|  |  | -        doc_texts = [paragraph.text for paragraph in document.paragraphs]
 | 
	
		
			
				|  |  | -        content = '\n'.join(doc_texts)
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | +        content = self.parse_docx(self.file_path, 'storage')
 | 
	
		
			
				|  |  |          return [Document(
 | 
	
		
			
				|  |  |              page_content=content,
 | 
	
		
			
				|  |  |              metadata={"source": self.file_path},
 | 
	
	
		
			
				|  | @@ -61,3 +67,111 @@ class WordExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |          """Check if the url is valid."""
 | 
	
		
			
				|  |  |          parsed = urlparse(url)
 | 
	
		
			
				|  |  |          return bool(parsed.netloc) and bool(parsed.scheme)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
    def _extract_images_from_docx(self, doc, image_folder):
        """Store every embedded image of ``doc`` and map it to a markdown link.

        For each image relationship of the main document part, the image
        bytes are written to ``storage`` under
        ``image_files/<tenant_id>/<uuid>.<ext>``, an ``UploadFile`` row is
        committed, and a markdown image reference pointing at the file's
        ``image-preview`` endpoint is recorded.

        Args:
            doc: The opened python-docx document.
            image_folder: Directory that is created if missing. The image
                bytes themselves are written through ``storage``, not into
                this folder.

        Returns:
            dict mapping each image part (``rel.target_part``) to its
            markdown image reference string.
        """
        # Kept from the original implementation so existing callers still see
        # the folder created.
        os.makedirs(image_folder, exist_ok=True)
        image_map = {}
        config = current_app.config

        for rel in doc.part.rels.values():
            if "image" not in rel.target_ref:
                continue
            # Linked (external) images have no package part; python-docx
            # raises ValueError when `target_part` is read on them.
            if getattr(rel, "is_external", False):
                continue

            image_part = rel.target_part
            blob = image_part.blob
            image_ext = rel.target_ref.rsplit('.', 1)[-1]
            # use a uuid as the file name
            file_uuid = str(uuid.uuid4())
            file_key = 'image_files/' + self.tenant_id + '/' + file_uuid + '.' + image_ext
            mime_type, _ = mimetypes.guess_type(file_key)

            storage.save(file_key, blob)

            # save file to db (naive UTC timestamp, as in the original)
            now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            upload_file = UploadFile(
                tenant_id=self.tenant_id,
                storage_type=config['STORAGE_TYPE'],
                key=file_key,
                name=file_key,
                size=len(blob),  # record the real payload size instead of 0
                extension=image_ext,
                mime_type=mime_type,
                created_by=self.user_id,
                created_at=now,
                used=True,
                used_by=self.user_id,
                used_at=now,
            )

            db.session.add(upload_file)
            # Commit per image so `upload_file.id` is populated for the link.
            db.session.commit()

            # NOTE(review): the original line here was a malformed f-string
            # (`f"}/files/..."`), which cannot compile. The markdown image
            # prefix was reconstructed; confirm that 'CONSOLE_API_URL' is the
            # intended base-URL config key.
            base_url = config.get('CONSOLE_API_URL')
            image_map[image_part] = f"![image]({base_url}/files/{upload_file.id}/image-preview)"

        return image_map
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def _table_to_markdown(self, table):
 | 
	
		
			
				|  |  | +        markdown = ""
 | 
	
		
			
				|  |  | +        # deal with table headers
 | 
	
		
			
				|  |  | +        header_row = table.rows[0]
 | 
	
		
			
				|  |  | +        headers = [cell.text for cell in header_row.cells]
 | 
	
		
			
				|  |  | +        markdown += "| " + " | ".join(headers) + " |\n"
 | 
	
		
			
				|  |  | +        markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
 | 
	
		
			
				|  |  | +        # deal with table rows
 | 
	
		
			
				|  |  | +        for row in table.rows[1:]:
 | 
	
		
			
				|  |  | +            row_cells = [cell.text for cell in row.cells]
 | 
	
		
			
				|  |  | +            markdown += "| " + " | ".join(row_cells) + " |\n"
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return markdown
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    def _parse_paragraph(self, paragraph, image_map):
 | 
	
		
			
				|  |  | +        paragraph_content = []
 | 
	
		
			
				|  |  | +        for run in paragraph.runs:
 | 
	
		
			
				|  |  | +            if run.element.xpath('.//a:blip'):
 | 
	
		
			
				|  |  | +                for blip in run.element.xpath('.//a:blip'):
 | 
	
		
			
				|  |  | +                    embed_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
 | 
	
		
			
				|  |  | +                    if embed_id:
 | 
	
		
			
				|  |  | +                        rel_target = run.part.rels[embed_id].target_ref
 | 
	
		
			
				|  |  | +                        if rel_target in image_map:
 | 
	
		
			
				|  |  | +                            paragraph_content.append(image_map[rel_target])
 | 
	
		
			
				|  |  | +            if run.text.strip():
 | 
	
		
			
				|  |  | +                paragraph_content.append(run.text.strip())
 | 
	
		
			
				|  |  | +        return ' '.join(paragraph_content) if paragraph_content else ''
 | 
	
		
			
				|  |  | +
 | 
	
		
			
    def parse_docx(self, docx_path, image_folder):
        """Convert a .docx file into text with markdown tables and image links.

        Embedded images are handed to ``_extract_images_from_docx`` first.
        The document body is then walked in order: each paragraph becomes one
        line (its run text plus any image references), and each table is
        rendered through ``_table_to_markdown``.

        Args:
            docx_path: Path (or file-like object) accepted by python-docx.
            image_folder: Forwarded unchanged to
                ``_extract_images_from_docx``.

        Returns:
            The non-empty paragraph lines and table blocks joined by
            newlines.
        """
        word_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        drawing_ns = '{http://schemas.openxmlformats.org/drawingml/2006/main}'
        embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

        doc = DocxDocument(docx_path)
        image_map = self._extract_images_from_docx(doc, image_folder)

        def parse_paragraph(paragraph):
            pieces = []
            for run in paragraph.runs:
                for drawing in run.element.findall('.//' + word_ns + 'drawing'):
                    for blip in drawing.findall('.//' + drawing_ns + 'blip'):
                        embed_id = blip.get(embed_attr)
                        if not embed_id:
                            continue
                        image_part = doc.part.related_parts.get(embed_id)
                        if image_part in image_map:
                            pieces.append(image_map[image_part])
                # Keep the run's own whitespace: runs split on formatting
                # boundaries, so stripping each one glued words together
                # ("This is " + "bold" -> "This isbold").
                if run.text:
                    pieces.append(run.text)
            return ''.join(pieces).strip()

        # doc.paragraphs / doc.tables are consumed in lockstep with the body
        # children; iterators avoid the O(n) cost of list.pop(0) per element.
        paragraphs = iter(doc.paragraphs)
        tables = iter(doc.tables)
        paragraph_tag = word_ns + 'p'
        table_tag = word_ns + 'tbl'

        content = []
        for element in doc.element.body:
            # Exact tag comparison: suffix matching can hit unrelated
            # elements and fails on nodes whose tag is not a string.
            if element.tag == paragraph_tag:
                para = next(paragraphs, None)
                if para is None:
                    continue
                parsed_paragraph = parse_paragraph(para)
                if parsed_paragraph:
                    content.append(parsed_paragraph)
            elif element.tag == table_tag:
                table = next(tables, None)
                if table is not None:
                    content.append(self._table_to_markdown(table))
        return '\n'.join(content)
 | 
	
		
			
				|  |  | +
 |