Browse Source

ascii filter Unicode U+FFFE (#2038)

Co-authored-by: jyong <jyong@dify.ai>
Jyong 1 year ago
parent
commit
cb7a608d75
1 changed file with 3 additions and 1 deletion
  1. 3 1
      api/core/indexing_runner.py

+ 3 - 1
api/core/indexing_runner.py

@@ -531,7 +531,9 @@ class IndexingRunner:
     def filter_string(self, text: str) -> str:
         """Return *text* with unwanted marker and control characters removed.

         The following rewrites are applied, in order:

         * ``<|`` becomes ``<`` and ``|>`` becomes ``>``
           (NOTE(review): presumably to neutralise ``<|...|>``-style
           special-token markers -- confirm against the callers).
         * C0 control characters other than tab, line feed and carriage
           return, plus DEL (U+007F), are deleted.
         * The Unicode noncharacter U+FFFE is deleted.

         :param text: the string to clean.
         :return: the cleaned string.
         """
         text = re.sub(r'<\|', '<', text)
         text = re.sub(r'\|>', '>', text)
         # U+FFFE must be matched as a single code point.  Listing its UTF-8
         # bytes (\xEF \xBF \xBE) in a str pattern would instead delete the
         # unrelated characters U+00EF, U+00BF and U+00BE.
         text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uFFFE]', '', text)
         return text
 
     def _get_splitter(self, processing_rule: DatasetProcessRule,