|
@@ -1,9 +1,12 @@
|
|
|
"""Abstract interface for document loader implementations."""
|
|
|
import datetime
|
|
|
+import logging
|
|
|
import mimetypes
|
|
|
import os
|
|
|
+import re
|
|
|
import tempfile
|
|
|
import uuid
|
|
|
+import xml.etree.ElementTree as ET
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
import requests
|
|
@@ -16,6 +19,7 @@ from extensions.ext_database import db
|
|
|
from extensions.ext_storage import storage
|
|
|
from models.model import UploadFile
|
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
|
class WordExtractor(BaseExtractor):
|
|
|
"""Load docx files.
|
|
@@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
|
image_map = self._extract_images_from_docx(doc, image_folder)
|
|
|
|
|
|
+ hyperlinks_url = None
|
|
|
+ url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
|
|
|
+ for para in doc.paragraphs:
|
|
|
+ for run in para.runs:
|
|
|
+ if run.text and hyperlinks_url:
|
|
|
+ result = f' [{run.text}]({hyperlinks_url}) '
|
|
|
+ run.text = result
|
|
|
+ hyperlinks_url = None
|
|
|
+ if 'HYPERLINK' in run.element.xml:
|
|
|
+ try:
|
|
|
+ xml = ET.XML(run.element.xml)
|
|
|
+ x_child = [c for c in xml.iter() if c is not None]
|
|
|
+ for x in x_child:
|
|
|
+ if x_child is None:
|
|
|
+ continue
|
|
|
+ if x.tag.endswith('instrText'):
|
|
|
+ for i in url_pattern.findall(x.text):
|
|
|
+ hyperlinks_url = str(i)
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(e)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def parse_paragraph(paragraph):
|
|
|
paragraph_content = []
|
|
|
for run in paragraph.runs:
|