@@ -235,7 +235,8 @@ class IndexingRunner:
             if len(preview_texts) < 5:
                 preview_texts.append(document.page_content)
 
-            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content)
+            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
+                                                     self.filter_string(document.page_content))
 
         return {
             "total_segments": total_segments,
@@ -345,6 +346,8 @@ class IndexingRunner:
         return text_docs
 
     def filter_string(self, text):
+        text = text.replace('<|', '<')
+        text = text.replace('|>', '>')
         pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
         return pattern.sub('', text)
 
@@ -425,7 +428,7 @@ class IndexingRunner:
         return documents
 
     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                           processing_rule: DatasetProcessRule) -> List[Document]:
+                            processing_rule: DatasetProcessRule) -> List[Document]:
         """
         Split the text documents into nodes.
         """
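
For reference, a minimal standalone sketch of the updated filter_string behavior as this diff defines it. The sample input and the print call are illustrative only, not part of the PR:

import re

def filter_string(text):
    # Rewrite '<|' / '|>' delimiters (as in markers like '<|endoftext|>') so the
    # text cannot be interpreted as tokenizer special tokens downstream.
    text = text.replace('<|', '<')
    text = text.replace('|>', '>')
    # Drop control characters (keeping \t, \n, \r) and code points \x7F-\xFF.
    pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
    return pattern.sub('', text)

print(filter_string('Hello\x07 <|endoftext|> world'))
# -> 'Hello <endoftext> world'

The first hunk applies this same filter before token counting, presumably because tiktoken-style encoders can reject raw special-token markers such as '<|endoftext|>' when counting tokens over user-supplied text.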