1 year ago · 39c14ec7c1
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -2,7 +2,6 @@
 from typing import Optional
 
 import pandas as pd
-import xlrd
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -28,61 +27,32 @@ class ExcelExtractor(BaseExtractor):
         self._autodetect_encoding = autodetect_encoding
 
     def extract(self) -> list[Document]:
-        """ parse excel file"""
-        if self._file_path.endswith('.xls'):
-            return self._extract4xls()
-        elif self._file_path.endswith('.xlsx'):
-            return self._extract4xlsx()
-
-    def _extract4xls(self) -> list[Document]:
-        wb = xlrd.open_workbook(filename=self._file_path)
+        """Load an Excel workbook (.xls or .xlsx) with pandas.
+
+        Every worksheet is read, rows whose cells are all NaN are dropped, and
+        each remaining row becomes one Document whose page_content joins the
+        row's non-NaN cells as '"column":"value"' pairs separated by ';'.
+
+        :return: one Document per remaining row, in worksheet order, each
+                 carrying metadata {'source': self._file_path}.
+        """
         documents = []
-        # loop over all sheets
-        for sheet in wb.sheets():
-            row_header = None
-            for row_index, row in enumerate(sheet.get_rows(), start=1):                
-                if self.is_blank_row(row):
-                    continue
-                if row_header is None:
-                    row_header = row
-                    continue
-                item_arr = []
-                for index, cell in enumerate(row):
-                    txt_value = str(cell.value)
-                    item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
-                item_str = ",".join(item_arr)
-                document = Document(page_content=item_str, metadata={'source': self._file_path})
-                documents.append(document)
-        return documents
-
-    def _extract4xlsx(self) -> list[Document]:
-        """Load from file path using Pandas."""
-        data = []
         # Read each worksheet of an Excel file using Pandas
-        xls = pd.ExcelFile(self._file_path)
-        for sheet_name in xls.sheet_names:
-            df = pd.read_excel(xls, sheet_name=sheet_name)
-
-            # filter out rows with all NaN values
-            df.dropna(how='all', inplace=True)
-
-            # transform each row into a Document
-            for _, row in df.iterrows():
-                item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
-                document = Document(page_content=item, metadata={'source': self._file_path})
-                data.append(document)
-        return data
+        # (the context manager closes the workbook handle even if a sheet fails to parse)
+        with pd.ExcelFile(self._file_path) as excel_file:
+            for sheet_name in excel_file.sheet_names:
+                df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
+
+                # filter out rows with all NaN values
+                df.dropna(how='all', inplace=True)
+
+                # transform each row into a Document
+                documents += [
+                    Document(
+                        page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
+                        metadata={'source': self._file_path},
+                    )
+                    for _, row in df.iterrows()
+                ]
 
-    @staticmethod
-    def is_blank_row(row):
-        """
-
-        Determine whether the specified line is a blank line.
-        :param row: row object。
-        :return: Returns True if the row is blank, False otherwise.
-        """
-        # Iterates through the cells and returns False if a non-empty cell is found
-        for cell in row:
-            if cell.value is not None and cell.value != '':
-                return False
-        return True
+        return documents