Browse Source

refactor(file extractor): match file extensions case-insensitively (#1059)

yezhwi 1 year ago
parent
commit
d33a269548

+ 2 - 2
api/controllers/console/datasets/file.py

@@ -83,7 +83,7 @@ class FileApi(Resource):
             raise FileTooLargeError(message)
 
         extension = file.filename.split('.')[-1]
-        if extension not in ALLOWED_EXTENSIONS:
+        if extension.lower() not in ALLOWED_EXTENSIONS:
             raise UnsupportedFileTypeError()
 
         # user uuid as file name
@@ -136,7 +136,7 @@ class FilePreviewApi(Resource):
 
         # extract text from file
         extension = upload_file.extension
-        if extension not in ALLOWED_EXTENSIONS:
+        if extension.lower() not in ALLOWED_EXTENSIONS:
             raise UnsupportedFileTypeError()
 
         text = FileExtractor.load(upload_file, return_text=True)

+ 7 - 6
api/core/data_loader/file_extractor.py

@@ -47,17 +47,18 @@ class FileExtractor:
                        upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]:
         input_file = Path(file_path)
         delimiter = '\n'
-        if input_file.suffix == '.xlsx':
+        file_extension = input_file.suffix.lower()
+        if file_extension == '.xlsx':
             loader = ExcelLoader(file_path)
-        elif input_file.suffix == '.pdf':
+        elif file_extension == '.pdf':
             loader = PdfLoader(file_path, upload_file=upload_file)
-        elif input_file.suffix in ['.md', '.markdown']:
+        elif file_extension in ['.md', '.markdown']:
             loader = MarkdownLoader(file_path, autodetect_encoding=True)
-        elif input_file.suffix in ['.htm', '.html']:
+        elif file_extension in ['.htm', '.html']:
             loader = HTMLLoader(file_path)
-        elif input_file.suffix == '.docx':
+        elif file_extension == '.docx':
             loader = Docx2txtLoader(file_path)
-        elif input_file.suffix == '.csv':
+        elif file_extension == '.csv':
             loader = CSVLoader(file_path, autodetect_encoding=True)
         else:
             # txt

+ 1 - 1
web/app/components/datasets/create/file-uploader/index.tsx

@@ -78,7 +78,7 @@ const FileUploader = ({
   const isValid = useCallback((file: File) => {
     const { size } = file
     const ext = `.${getFileType(file)}`
-    const isValidType = ACCEPTS.includes(ext)
+    const isValidType = ACCEPTS.includes(ext.toLowerCase())
     if (!isValidType)
       notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })
 

+ 1 - 1
web/app/components/datasets/create/index.tsx

@@ -151,4 +151,4 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   )
 }
 
-export default DatasetUpdateForm
+export default DatasetUpdateForm