소스 검색

feat: Add hyperlink parsing to the DOCX document. (#7017)

chenxu9741 8 달 전
부모
커밋
72c75b75cf
1개의 변경된 파일, 28개의 추가작업 그리고 0개의 파일을 삭제
  1. 28 0
      api/core/rag/extractor/word_extractor.py

+ 28 - 0
api/core/rag/extractor/word_extractor.py

@@ -1,9 +1,12 @@
 """Abstract interface for document loader implementations."""
 import datetime
+import logging
 import mimetypes
 import os
+import re
 import tempfile
 import uuid
+import xml.etree.ElementTree as ET
 from urllib.parse import urlparse
 
 import requests
@@ -16,6 +19,7 @@ from extensions.ext_database import db
 from extensions.ext_storage import storage
 from models.model import UploadFile
 
+logger = logging.getLogger(__name__)
 
 class WordExtractor(BaseExtractor):
     """Load docx files.
@@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
 
         image_map = self._extract_images_from_docx(doc, image_folder)
 
+        hyperlinks_url = None
+        url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
+        for para in doc.paragraphs:
+            for run in para.runs:
+                if run.text and hyperlinks_url:
+                    result = f'  [{run.text}]({hyperlinks_url})  '
+                    run.text = result
+                    hyperlinks_url = None
+                if 'HYPERLINK' in run.element.xml:
+                    try:
+                        xml = ET.XML(run.element.xml)
+                        x_child = [c for c in xml.iter() if c is not None]
+                        for x in x_child:
+                            if x_child is None:
+                                continue
+                            if x.tag.endswith('instrText'):
+                                for i in url_pattern.findall(x.text):
+                                    hyperlinks_url = str(i)
+                    except Exception as e:
+                        logger.error(e)
+
+
+
+
         def parse_paragraph(paragraph):
             paragraph_content = []
             for run in paragraph.runs: