| 
					
				 | 
			
			
				@@ -494,6 +494,7 @@ class IndexingRunner: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         Split the text documents into nodes. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         all_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        all_qa_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for text_doc in text_docs: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # document clean 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             document_text = self._document_clean(text_doc.page_content, processing_rule) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -502,58 +503,56 @@ class IndexingRunner: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # parse document to nodes 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             documents = splitter.split_documents([text_doc]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             split_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for document_node in documents: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                doc_id = str(uuid.uuid4()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                hash = helper.generate_text_hash(document_node.page_content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                document_node.metadata['doc_id'] = doc_id 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                document_node.metadata['doc_hash'] = hash 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                split_documents.append(document_node) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            all_documents.extend(split_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # processing qa document 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if document_form == 'qa_model': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             llm: StreamableOpenAI = LLMBuilder.to_llm( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 tenant_id=tenant_id, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 model_name='gpt-3.5-turbo', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 max_tokens=2000 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            for i in range(0, len(documents), 10): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for i in range(0, len(all_documents), 10): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 threads = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                sub_documents = documents[i:i + 10] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                sub_documents = all_documents[i:i + 10] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 for doc in sub_documents: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    document_format_thread = threading.Thread(target=self.format_document, kwargs={ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                        'llm': llm, 'document_node': doc, 'split_documents': split_documents, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                        'document_form': document_form}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    document_format_thread = threading.Thread(target=self.format_qa_document, kwargs={ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        'llm': llm, 'document_node': doc, 'all_qa_documents': all_qa_documents}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     threads.append(document_format_thread) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     document_format_thread.start() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 for thread in threads: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     thread.join() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            all_documents.extend(split_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return all_qa_documents 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return all_documents 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def format_qa_document(self, llm: StreamableOpenAI, document_node, all_qa_documents): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         format_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if document_node.page_content is None or not document_node.page_content.strip(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            return format_documents 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if document_form == 'text_model': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # text model document 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            doc_id = str(uuid.uuid4()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            hash = helper.generate_text_hash(document_node.page_content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            document_node.metadata['doc_id'] = doc_id 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            document_node.metadata['doc_hash'] = hash 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # qa model document 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            document_qa_list = self.format_split_text(response) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            qa_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for result in document_qa_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                qa_document = Document(page_content=result['question'], metadata=document_node.metadata.copy()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                doc_id = str(uuid.uuid4()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                hash = helper.generate_text_hash(result['question']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                qa_document.metadata['answer'] = result['answer'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                qa_document.metadata['doc_id'] = doc_id 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                qa_document.metadata['doc_hash'] = hash 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                qa_documents.append(qa_document) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            format_documents.extend(qa_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            logging.error(str(e)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            format_documents.append(document_node) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        elif document_form == 'qa_model': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                # qa model document 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                document_qa_list = self.format_split_text(response) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                qa_documents = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                for result in document_qa_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    qa_document = Document(page_content=result['question'], metadata=document_node.metadata.copy()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    doc_id = str(uuid.uuid4()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    hash = helper.generate_text_hash(result['question']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    qa_document.metadata['answer'] = result['answer'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    qa_document.metadata['doc_id'] = doc_id 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    qa_document.metadata['doc_hash'] = hash 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    qa_documents.append(qa_document) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                format_documents.extend(qa_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                logging.error(str(e)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        split_documents.extend(format_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        all_qa_documents.extend(format_documents) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter, 
			 |