|  | @@ -2,7 +2,6 @@
 | 
	
		
			
				|  |  |  from typing import Optional
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  import pandas as pd
 | 
	
		
			
				|  |  | -import xlrd
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  from core.rag.extractor.extractor_base import BaseExtractor
 | 
	
		
			
				|  |  |  from core.rag.models.document import Document
 | 
	
	
		
			
				|  | @@ -28,61 +27,19 @@ class ExcelExtractor(BaseExtractor):
 | 
	
		
			
				|  |  |          self._autodetect_encoding = autodetect_encoding
 | 
	
		
			
				|  |  |  
 | 
	
		
			
    def extract(self) -> list[Document]:
        """Load an Excel file in xls or xlsx format using Pandas.

        Every worksheet is parsed into a DataFrame and rows whose cells are
        all NaN are dropped. Each remaining row becomes one ``Document`` whose
        ``page_content`` is the row's ``"column":"value"`` pairs joined with
        ``;`` (NaN cells are omitted) and whose metadata stores the source
        file path.

        :return: the documents of all worksheets, in sheet order then row order.
        """
        documents = []
        # Open the workbook as a context manager so its file handle is
        # released even when parsing one of the sheets raises.
        with pd.ExcelFile(self._file_path) as excel_file:
            # Read each worksheet of an Excel file using Pandas
            for sheet_name in excel_file.sheet_names:
                df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)

                # filter out rows with all NaN values
                df.dropna(how='all', inplace=True)

                # transform each row into a Document
                documents.extend(
                    Document(
                        page_content=';'.join(
                            f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)
                        ),
                        metadata={'source': self._file_path},
                    )
                    for _, row in df.iterrows()
                )

        return documents
 |