| 
					
				 | 
			
			
				@@ -3,7 +3,8 @@ from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from typing import List, Union, Optional 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import requests 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from flask import current_app 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from langchain.document_loaders import TextLoader, Docx2txtLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from langchain.schema import Document 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.data_loader.loader.csv_loader import CSVLoader 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.data_loader.loader.html import HTMLLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.data_loader.loader.markdown import MarkdownLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.data_loader.loader.pdf import PdfLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from extensions.ext_storage import storage 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from models.model import UploadFile 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -49,14 +57,34 @@ class FileExtractor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         input_file = Path(file_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         delimiter = '\n' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         file_extension = input_file.suffix.lower() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if is_automatic: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            loader = UnstructuredFileLoader( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                file_path, strategy="hi_res", mode="elements" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # loader = UnstructuredAPIFileLoader( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            #     file_path=filenames[0], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            #     api_key="FAKE_API_KEY", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        etl_type = current_app.config['ETL_TYPE'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if etl_type == 'Unstructured': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if file_extension == '.xlsx': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = ExcelLoader(file_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.pdf': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = PdfLoader(file_path, upload_file=upload_file) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension in ['.md', '.markdown']: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension in ['.htm', '.html']: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = HTMLLoader(file_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.docx': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = Docx2txtLoader(file_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.csv': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = CSVLoader(file_path, autodetect_encoding=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.msg': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.eml': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.ppt': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.pptx': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            elif file_extension == '.xml': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                # txt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                loader = UnstructuredTextLoader(file_path, unstructured_api_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if file_extension == '.xlsx': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 loader = ExcelLoader(file_path) 
			 |