před 2 roky · 5b953c1ef2
--- a/api/controllers/console/datasets/data_source.py
+++ b/api/controllers/console/datasets/data_source.py
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
 
				             notion_workspace_id=workspace_id,
			
 
				             notion_obj_id=page_id,
			
 
				             notion_page_type=page_type,
			
 
				-            notion_access_token=data_source_binding.access_token
			
 
				+            notion_access_token=data_source_binding.access_token,
			
 
				+            tenant_id=current_user.current_tenant_id
			
 
				         )
			
 
				 
			
 
				         text_docs = extractor.extract()
			
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
 
				                     notion_info={
			
 
				                         "notion_workspace_id": workspace_id,
			
 
				                         "notion_obj_id": page['page_id'],
			
 
				-                        "notion_page_type": page['type']
			
 
				+                        "notion_page_type": page['type'],
			
 
				+                        "tenant_id": current_user.current_tenant_id
			
 
				                     },
			
 
				                     document_model=args['doc_form']
			
 
				                 )
			
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
 
				                         notion_info={
			
 
				                             "notion_workspace_id": workspace_id,
			
 
				                             "notion_obj_id": page['page_id'],
			
 
				-                            "notion_page_type": page['type']
			
 
				+                            "notion_page_type": page['type'],
			
 
				+                            "tenant_id": current_user.current_tenant_id
			
 
				                         },
			
 
				                         document_model=args['doc_form']
			
 
				                     )
			
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
 
				                     notion_info={
			
 
				                         "notion_workspace_id": data_source_info['notion_workspace_id'],
			
 
				                         "notion_obj_id": data_source_info['notion_page_id'],
			
 
				-                        "notion_page_type": data_source_info['type']
			
 
				+                        "notion_page_type": data_source_info['type'],
			
 
				+                        "tenant_id": current_user.current_tenant_id
			
 
				                     },
			
 
				                     document_model=document.doc_form
			
 
				                 )
			
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -366,7 +366,8 @@ class IndexingRunner:
 
				                     "notion_workspace_id": data_source_info['notion_workspace_id'],
			
 
				                     "notion_obj_id": data_source_info['notion_page_id'],
			
 
				                     "notion_page_type": data_source_info['type'],
			
 
				-                    "document": dataset_document
			
 
				+                    "document": dataset_document,
			
 
				+                    "tenant_id": dataset_document.tenant_id
			
 
				                 },
			
 
				                 document_model=dataset_document.doc_form
			
 
				             )
			
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@@ -39,7 +39,8 @@ class RetrievalService:
 
				                 'flask_app': current_app._get_current_object(),
			
 
				                 'dataset_id': dataset_id,
			
 
				                 'query': query,
			
 
				-                'top_k': top_k
			
 
				+                'top_k': top_k,
			
 
				+                'all_documents': all_documents
			
 
				             })
			
 
				             threads.append(keyword_thread)
			
 
				             keyword_thread.start()
			
--- a/api/core/rag/extractor/entity/extract_setting.py
+++ b/api/core/rag/extractor/entity/extract_setting.py
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
 
				     notion_obj_id: str
			
 
				     notion_page_type: str
			
 
				     document: Document = None
			
 
				+    tenant_id: str
			
 
				 
			
 
				     class Config:
			
 
				         arbitrary_types_allowed = True
			
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -132,7 +132,8 @@ class ExtractProcessor:
 
				                 notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
			
 
				                 notion_obj_id=extract_setting.notion_info.notion_obj_id,
			
 
				                 notion_page_type=extract_setting.notion_info.notion_page_type,
			
 
				-                document_model=extract_setting.notion_info.document
			
 
				+                document_model=extract_setting.notion_info.document,
			
 
				+                tenant_id=extract_setting.notion_info.tenant_id,
			
 
				             )
			
 
				             return extractor.extract()
			
 
				         else:
			
--- a/api/core/rag/extractor/html_extractor.py
+++ b/api/core/rag/extractor/html_extractor.py
@@ -1,13 +1,14 @@
 
				 """Abstract interface for document loader implementations."""
			
 
				-from typing import Optional
			
 
				+from bs4 import BeautifulSoup
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				-from core.rag.extractor.helpers import detect_file_encodings
			
 
				 from core.rag.models.document import Document
			
 
				 
			
 
				 
			
 
				 class HtmlExtractor(BaseExtractor):
			
 
				-    """Load html files.
			
 
				+
			
 
				+    """
			
 
				+    Load html files.
			
 
				 
			
 
				 
			
 
				     Args:
			
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
 
				     """
			
 
				 
			
 
				     def __init__(
			
 
				-            self,
			
 
				-            file_path: str,
			
 
				-            encoding: Optional[str] = None,
			
 
				-            autodetect_encoding: bool = False,
			
 
				-            source_column: Optional[str] = None,
			
 
				-            csv_args: Optional[dict] = None,
			
 
				+        self,
			
 
				+        file_path: str
			
 
				     ):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				-        self._encoding = encoding
			
 
				-        self._autodetect_encoding = autodetect_encoding
			
 
				-        self.source_column = source_column
			
 
				-        self.csv_args = csv_args or {}
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        """Load data into document objects."""
			
 
				-        try:
			
 
				-            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
			
 
				-                docs = self._read_from_file(csvfile)
			
 
				-        except UnicodeDecodeError as e:
			
 
				-            if self._autodetect_encoding:
			
 
				-                detected_encodings = detect_file_encodings(self._file_path)
			
 
				-                for encoding in detected_encodings:
			
 
				-                    try:
			
 
				-                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
			
 
				-                            docs = self._read_from_file(csvfile)
			
 
				-                        break
			
 
				-                    except UnicodeDecodeError:
			
 
				-                        continue
			
 
				-            else:
			
 
				-                raise RuntimeError(f"Error loading {self._file_path}") from e
			
 
				-
			
 
				-        return docs
			
 
				+        return [Document(page_content=self._load_as_text())]
			
 
				 
			
 
				-    def _read_from_file(self, csvfile) -> list[Document]:
			
 
				-        docs = []
			
 
				-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
			
 
				-        for i, row in enumerate(csv_reader):
			
 
				-            content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
			
 
				-            try:
			
 
				-                source = (
			
 
				-                    row[self.source_column]
			
 
				-                    if self.source_column is not None
			
 
				-                    else ''
			
 
				-                )
			
 
				-            except KeyError:
			
 
				-                raise ValueError(
			
 
				-                    f"Source column '{self.source_column}' not found in CSV file."
			
 
				-                )
			
 
				-            metadata = {"source": source, "row": i}
			
 
				-            doc = Document(page_content=content, metadata=metadata)
			
 
				-            docs.append(doc)
			
 
				+    def _load_as_text(self) -> str:
			
 
				+        with open(self._file_path, "rb") as fp:
			
 
				+            soup = BeautifulSoup(fp, 'html.parser')
			
 
				+            text = soup.get_text()
			
 
				+            text = text.strip() if text else ''
			
 
				 
			
 
				-        return docs
			
 
				+        return text
			
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
 
				             notion_workspace_id: str,
			
 
				             notion_obj_id: str,
			
 
				             notion_page_type: str,
			
 
				+            tenant_id: str,
			
 
				             document_model: Optional[DocumentModel] = None,
			
 
				-            notion_access_token: Optional[str] = None
			
 
				+            notion_access_token: Optional[str] = None,
			
 
				+
			
 
				     ):
			
 
				         self._notion_access_token = None
			
 
				         self._document_model = document_model
			
--- a/api/tasks/document_indexing_sync_task.py
+++ b/api/tasks/document_indexing_sync_task.py
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
 
				             notion_workspace_id=workspace_id,
			
 
				             notion_obj_id=page_id,
			
 
				             notion_page_type=page_type,
			
 
				-            notion_access_token=data_source_binding.access_token
			
 
				+            notion_access_token=data_source_binding.access_token,
			
 
				+            tenant_id=document.tenant_id
			
 
				         )
			
 
				 
			
 
				         last_edited_time = loader.get_notion_last_edited_time()