|  | @@ -411,7 +411,8 @@ class IndexingRunner:
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |          return text_docs
 |  |          return text_docs
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def filter_string(self, text):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def filter_string(text):
 | 
											
												
													
														|  |          text = re.sub(r'<\|', '<', text)
 |  |          text = re.sub(r'<\|', '<', text)
 | 
											
												
													
														|  |          text = re.sub(r'\|>', '>', text)
 |  |          text = re.sub(r'\|>', '>', text)
 | 
											
												
													
														|  |          text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\xEF\xBF\xBE]', '', text)
 |  |          text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\xEF\xBF\xBE]', '', text)
 | 
											
										
											
												
													
														|  | @@ -419,7 +420,8 @@ class IndexingRunner:
 | 
											
												
													
														|  |          text = re.sub('\uFFFE', '', text)
 |  |          text = re.sub('\uFFFE', '', text)
 | 
											
												
													
														|  |          return text
 |  |          return text
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _get_splitter(self, processing_rule: DatasetProcessRule,
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _get_splitter(processing_rule: DatasetProcessRule,
 | 
											
												
													
														|  |                        embedding_model_instance: Optional[ModelInstance]) -> TextSplitter:
 |  |                        embedding_model_instance: Optional[ModelInstance]) -> TextSplitter:
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          Get the NodeParser object according to the processing rule.
 |  |          Get the NodeParser object according to the processing rule.
 | 
											
										
											
												
													
														|  | @@ -611,7 +613,8 @@ class IndexingRunner:
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |          return all_documents
 |  |          return all_documents
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _document_clean(self, text: str, processing_rule: DatasetProcessRule) -> str:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _document_clean(text: str, processing_rule: DatasetProcessRule) -> str:
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          Clean the document text according to the processing rules.
 |  |          Clean the document text according to the processing rules.
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
										
											
												
													
														|  | @@ -640,7 +643,8 @@ class IndexingRunner:
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |          return text
 |  |          return text
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def format_split_text(self, text):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def format_split_text(text):
 | 
											
												
													
														|  |          regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
 |  |          regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
 | 
											
												
													
														|  |          matches = re.findall(regex, text, re.UNICODE)
 |  |          matches = re.findall(regex, text, re.UNICODE)
 | 
											
												
													
														|  |  
 |  |  
 | 
											
										
											
												
													
														|  | @@ -704,7 +708,8 @@ class IndexingRunner:
 | 
											
												
													
														|  |              }
 |  |              }
 | 
											
												
													
														|  |          )
 |  |          )
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _process_keyword_index(self, flask_app, dataset_id, document_id, documents):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _process_keyword_index(flask_app, dataset_id, document_id, documents):
 | 
											
												
													
														|  |          with flask_app.app_context():
 |  |          with flask_app.app_context():
 | 
											
												
													
														|  |              dataset = Dataset.query.filter_by(id=dataset_id).first()
 |  |              dataset = Dataset.query.filter_by(id=dataset_id).first()
 | 
											
												
													
														|  |              if not dataset:
 |  |              if not dataset:
 | 
											
										
											
												
													
														|  | @@ -758,13 +763,15 @@ class IndexingRunner:
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |              return tokens
 |  |              return tokens
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _check_document_paused_status(self, document_id: str):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _check_document_paused_status(document_id: str):
 | 
											
												
													
														|  |          indexing_cache_key = 'document_{}_is_paused'.format(document_id)
 |  |          indexing_cache_key = 'document_{}_is_paused'.format(document_id)
 | 
											
												
													
														|  |          result = redis_client.get(indexing_cache_key)
 |  |          result = redis_client.get(indexing_cache_key)
 | 
											
												
													
														|  |          if result:
 |  |          if result:
 | 
											
												
													
														|  |              raise DocumentIsPausedException()
 |  |              raise DocumentIsPausedException()
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _update_document_index_status(self, document_id: str, after_indexing_status: str,
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _update_document_index_status(document_id: str, after_indexing_status: str,
 | 
											
												
													
														|  |                                        extra_update_params: Optional[dict] = None) -> None:
 |  |                                        extra_update_params: Optional[dict] = None) -> None:
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          Update the document indexing status.
 |  |          Update the document indexing status.
 | 
											
										
											
												
													
														|  | @@ -786,14 +793,16 @@ class IndexingRunner:
 | 
											
												
													
														|  |          DatasetDocument.query.filter_by(id=document_id).update(update_params)
 |  |          DatasetDocument.query.filter_by(id=document_id).update(update_params)
 | 
											
												
													
														|  |          db.session.commit()
 |  |          db.session.commit()
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def _update_segments_by_document(self, dataset_document_id: str, update_params: dict) -> None:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def _update_segments_by_document(dataset_document_id: str, update_params: dict) -> None:
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          Update the document segment by document id.
 |  |          Update the document segment by document id.
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          DocumentSegment.query.filter_by(document_id=dataset_document_id).update(update_params)
 |  |          DocumentSegment.query.filter_by(document_id=dataset_document_id).update(update_params)
 | 
											
												
													
														|  |          db.session.commit()
 |  |          db.session.commit()
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    def batch_add_segments(self, segments: list[DocumentSegment], dataset: Dataset):
 |  | 
 | 
											
												
													
														|  | 
 |  | +    @staticmethod
 | 
											
												
													
														|  | 
 |  | +    def batch_add_segments(segments: list[DocumentSegment], dataset: Dataset):
 | 
											
												
													
														|  |          """
 |  |          """
 | 
											
												
													
														|  |          Batch add segments index processing
 |  |          Batch add segments index processing
 | 
											
												
													
														|  |          """
 |  |          """
 |