|  | @@ -1,9 +1,12 @@
 | 
	
		
			
				|  |  |  """Abstract interface for document loader implementations."""
 | 
	
		
			
				|  |  |  import datetime
 | 
	
		
			
				|  |  | +import logging
 | 
	
		
			
				|  |  |  import mimetypes
 | 
	
		
			
				|  |  |  import os
 | 
	
		
			
				|  |  | +import re
 | 
	
		
			
				|  |  |  import tempfile
 | 
	
		
			
				|  |  |  import uuid
 | 
	
		
			
				|  |  | +import xml.etree.ElementTree as ET
 | 
	
		
			
				|  |  |  from urllib.parse import urlparse
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  import requests
 | 
	
	
		
			
				|  | @@ -16,6 +19,7 @@ from extensions.ext_database import db
 | 
	
		
			
				|  |  |  from extensions.ext_storage import storage
 | 
	
		
			
				|  |  |  from models.model import UploadFile
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +logger = logging.getLogger(__name__)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  class WordExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |      """Load docx files.
 | 
	
	
		
			
				|  | @@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          image_map = self._extract_images_from_docx(doc, image_folder)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +        hyperlinks_url = None
 | 
	
		
			
				|  |  | +        url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
 | 
	
		
			
				|  |  | +        for para in doc.paragraphs:
 | 
	
		
			
				|  |  | +            for run in para.runs:
 | 
	
		
			
				|  |  | +                if run.text and hyperlinks_url:
 | 
	
		
			
				|  |  | +                    result = f'  [{run.text}]({hyperlinks_url})  '
 | 
	
		
			
				|  |  | +                    run.text = result
 | 
	
		
			
				|  |  | +                    hyperlinks_url = None
 | 
	
		
			
				|  |  | +                if 'HYPERLINK' in run.element.xml:
 | 
	
		
			
				|  |  | +                    try:
 | 
	
		
			
				|  |  | +                        xml = ET.XML(run.element.xml)
 | 
	
		
			
				|  |  | +                        x_child = [c for c in xml.iter() if c is not None]
 | 
	
		
			
				|  |  | +                        for x in x_child:
 | 
	
		
			
				|  |  | +                            if x_child is None:
 | 
	
		
			
				|  |  | +                                continue
 | 
	
		
			
				|  |  | +                            if x.tag.endswith('instrText'):
 | 
	
		
			
				|  |  | +                                for i in url_pattern.findall(x.text):
 | 
	
		
			
				|  |  | +                                    hyperlinks_url = str(i)
 | 
	
		
			
				|  |  | +                    except Exception as e:
 | 
	
		
			
				|  |  | +                        logger.error(e)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |          def parse_paragraph(paragraph):
 | 
	
		
			
				|  |  |              paragraph_content = []
 | 
	
		
			
				|  |  |              for run in paragraph.runs:
 |