2 年之前 · 31070ffbca
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -62,7 +62,8 @@ class IndexingRunner:
 
				                 text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
			
 
				 
			
 
				                 # transform
			
 
				-                documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
			
 
				+                documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
			
 
				+                                            processing_rule.to_dict())
			
 
				                 # save segment
			
 
				                 self._load_segments(dataset, dataset_document, documents)
			
 
				 
			
@@ -120,7 +121,8 @@ class IndexingRunner:
 
				             text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
			
 
				 
			
 
				             # transform
			
 
				-            documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
			
 
				+            documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
			
 
				+                                        processing_rule.to_dict())
			
 
				             # save segment
			
 
				             self._load_segments(dataset, dataset_document, documents)
			
 
				 
			
@@ -750,7 +752,7 @@ class IndexingRunner:
 
				         index_processor.load(dataset, documents)
			
 
				 
			
 
				     def _transform(self, index_processor: BaseIndexProcessor, dataset: Dataset,
			
 
				-                   text_docs: list[Document], process_rule: dict) -> list[Document]:
			
 
				+                   text_docs: list[Document], doc_language: str, process_rule: dict) -> list[Document]:
			
 
				         # get embedding model instance
			
 
				         embedding_model_instance = None
			
 
				         if dataset.indexing_technique == 'high_quality':
			
@@ -768,7 +770,8 @@ class IndexingRunner:
 
				                 )
			
 
				 
			
 
				         documents = index_processor.transform(text_docs, embedding_model_instance=embedding_model_instance,
			
 
				-                                              process_rule=process_rule)
			
 
				+                                              process_rule=process_rule, tenant_id=dataset.tenant_id,
			
 
				+                                              doc_language=doc_language)
			
 
				 
			
 
				         return documents
			
 
				 
			
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -7,7 +7,6 @@ from typing import Optional
 
				 
			
 
				 import pandas as pd
			
 
				 from flask import Flask, current_app
			
 
				-from flask_login import current_user
			
 
				 from werkzeug.datastructures import FileStorage
			
 
				 
			
 
				 from core.generator.llm_generator import LLMGenerator
			
@@ -31,7 +30,7 @@ class QAIndexProcessor(BaseIndexProcessor):
 
				 
			
 
				     def transform(self, documents: list[Document], **kwargs) -> list[Document]:
			
 
				         splitter = self._get_splitter(processing_rule=kwargs.get('process_rule'),
			
 
				-                                      embedding_model_instance=None)
			
 
				+                                      embedding_model_instance=kwargs.get('embedding_model_instance'))
			
 
				 
			
 
				         # Split the text documents into nodes.
			
 
				         all_documents = []
			
@@ -66,10 +65,10 @@ class QAIndexProcessor(BaseIndexProcessor):
 
				             for doc in sub_documents:
			
 
				                 document_format_thread = threading.Thread(target=self._format_qa_document, kwargs={
			
 
				                     'flask_app': current_app._get_current_object(),
			
 
				-                    'tenant_id': current_user.current_tenant.id,
			
 
				+                    'tenant_id': kwargs.get('tenant_id'),
			
 
				                     'document_node': doc,
			
 
				                     'all_qa_documents': all_qa_documents,
			
 
				-                    'document_language': kwargs.get('document_language', 'English')})
			
 
				+                    'document_language': kwargs.get('doc_language', 'English')})
			
 
				                 threads.append(document_format_thread)
			
 
				                 document_format_thread.start()
			
 
				             for thread in threads: