1 年之前 · 3e9d271b52
--- a/api/constants/__init__.py
+++ b/api/constants/__init__.py
@@ -15,7 +15,9 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
 
				 
			
 
				 if dify_config.ETL_TYPE == "Unstructured":
			
 
				     DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
			
 
				-    DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
			
 
				+    DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
			
 
				+    if dify_config.UNSTRUCTURED_API_URL:
			
 
				+        DOCUMENT_EXTENSIONS.append("ppt")
			
 
				     DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
			
 
				 else:
			
 
				     DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
			
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -21,6 +21,7 @@ from core.rag.extractor.unstructured.unstructured_eml_extractor import Unstructu
 
				 from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
			
 
				 from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
			
 
				 from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
			
 
				+from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor
			
 
				 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
			
 
				 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
			
 
				 from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
			
@@ -102,10 +103,10 @@ class ExtractProcessor:
 
				                     if file_extension in {".xlsx", ".xls"}:
			
 
				                         extractor = ExcelExtractor(file_path)
			
 
				                     elif file_extension == ".pdf":
			
 
				-                        extractor = PdfExtractor(file_path)
			
 
				+                        extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension in {".md", ".markdown"}:
			
 
				                         extractor = (
			
 
				-                            UnstructuredMarkdownExtractor(file_path, unstructured_api_url)
			
 
				+                            UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                             if is_automatic
			
 
				                             else MarkdownExtractor(file_path, autodetect_encoding=True)
			
 
				                         )
			
@@ -116,17 +117,17 @@ class ExtractProcessor:
 
				                     elif file_extension == ".csv":
			
 
				                         extractor = CSVExtractor(file_path, autodetect_encoding=True)
			
 
				                     elif file_extension == ".msg":
			
 
				-                        extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url)
			
 
				+                        extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension == ".eml":
			
 
				-                        extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
			
 
				+                        extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension == ".ppt":
			
 
				                         extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension == ".pptx":
			
 
				-                        extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
			
 
				+                        extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension == ".xml":
			
 
				-                        extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
			
 
				+                        extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     elif file_extension == ".epub":
			
 
				-                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
			
 
				+                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
			
 
				                     else:
			
 
				                         # txt
			
 
				                         extractor = (
			
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -10,24 +10,26 @@ logger = logging.getLogger(__name__)
 
				 
			
 
				 
			
 
				 class UnstructuredEmailExtractor(BaseExtractor):
			
 
				-    """Load msg files.
			
 
				+    """Load eml files.
			
 
				     Args:
			
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        file_path: str,
			
 
				-        api_url: str,
			
 
				-    ):
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.email import partition_email
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_email(filename=self._file_path)
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.email import partition_email
			
 
				+
			
 
				+            elements = partition_email(filename=self._file_path)
			
 
				 
			
 
				         # noinspection PyBroadException
			
 
				         try:
			
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -19,15 +19,23 @@ class UnstructuredEpubExtractor(BaseExtractor):
 
				         self,
			
 
				         file_path: str,
			
 
				         api_url: Optional[str] = None,
			
 
				+        api_key: Optional[str] = None,
			
 
				     ):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.epub import partition_epub
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				+
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.epub import partition_epub
			
 
				+
			
 
				+            elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
			
 
				 
			
 
				-        elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				 
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -24,19 +24,21 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
 
				             if the specified encoding fails.
			
 
				     """
			
 
				 
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        file_path: str,
			
 
				-        api_url: str,
			
 
				-    ):
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.md import partition_md
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_md(filename=self._file_path)
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.md import partition_md
			
 
				+
			
 
				+            elements = partition_md(filename=self._file_path)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				 
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -14,15 +14,21 @@ class UnstructuredMsgExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str):
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.msg import partition_msg
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_msg(filename=self._file_path)
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.msg import partition_msg
			
 
				+
			
 
				+            elements = partition_msg(filename=self._file_path)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				 
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
--- a/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
@@ -0,0 +1,47 @@
 
				+import logging
			
 
				+
			
 
				+from core.rag.extractor.extractor_base import BaseExtractor
			
 
				+from core.rag.models.document import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredPDFExtractor(BaseExtractor):
			
 
				+    """Load pdf files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+
			
 
				+        api_url: Unstructured API URL
			
 
				+
			
 
				+        api_key: Unstructured API Key
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+        """Initialize with file path."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				+
			
 
				+    def extract(self) -> list[Document]:
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				+
			
 
				+            elements = partition_via_api(
			
 
				+                filename=self._file_path, api_url=self._api_url, api_key=self._api_key, strategy="auto"
			
 
				+            )
			
 
				+        else:
			
 
				+            from unstructured.partition.pdf import partition_pdf
			
 
				+
			
 
				+            elements = partition_pdf(filename=self._file_path, strategy="auto")
			
 
				+
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -7,7 +7,7 @@ logger = logging.getLogger(__name__)
 
				 
			
 
				 
			
 
				 class UnstructuredPPTExtractor(BaseExtractor):
			
 
				-    """Load msg files.
			
 
				+    """Load ppt files.
			
 
				 
			
 
				 
			
 
				     Args:
			
@@ -21,9 +21,12 @@ class UnstructuredPPTExtractor(BaseExtractor):
 
				         self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.api import partition_via_api
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            raise NotImplementedError("Unstructured API Url is not configured")
			
 
				         text_by_page = {}
			
 
				         for element in elements:
			
 
				             page = element.metadata.page_number
			
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -7,22 +7,28 @@ logger = logging.getLogger(__name__)
 
				 
			
 
				 
			
 
				 class UnstructuredPPTXExtractor(BaseExtractor):
			
 
				-    """Load msg files.
			
 
				+    """Load pptx files.
			
 
				 
			
 
				 
			
 
				     Args:
			
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str):
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.pptx import partition_pptx
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_pptx(filename=self._file_path)
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.pptx import partition_pptx
			
 
				+
			
 
				+            elements = partition_pptx(filename=self._file_path)
			
 
				         text_by_page = {}
			
 
				         for element in elements:
			
 
				             page = element.metadata.page_number
			
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -7,22 +7,29 @@ logger = logging.getLogger(__name__)
 
				 
			
 
				 
			
 
				 class UnstructuredXmlExtractor(BaseExtractor):
			
 
				-    """Load msg files.
			
 
				+    """Load xml files.
			
 
				 
			
 
				 
			
 
				     Args:
			
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str):
			
 
				+    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
 
				+        self._api_key = api_key
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.xml import partition_xml
			
 
				+        if self._api_url:
			
 
				+            from unstructured.partition.api import partition_via_api
			
 
				+
			
 
				+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				+        else:
			
 
				+            from unstructured.partition.xml import partition_xml
			
 
				+
			
 
				+            elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
			
 
				 
			
 
				-        elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				 
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
--- a/api/poetry.lock
+++ b/api/poetry.lock
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -172,11 +172,12 @@ sagemaker = "2.231.0"
 
				 scikit-learn = "~1.5.1"
			
 
				 sentry-sdk = { version = "~1.44.1", extras = ["flask"] }
			
 
				 sqlalchemy = "~2.0.29"
			
 
				+starlette = "0.41.0"
			
 
				 tencentcloud-sdk-python-hunyuan = "~3.0.1158"
			
 
				 tiktoken = "~0.8.0"
			
 
				 tokenizers = "~0.15.0"
			
 
				 transformers = "~4.35.0"
			
 
				-unstructured = { version = "~0.10.27", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] }
			
 
				+unstructured = { version = "~0.15.7", extras = ["docx", "epub", "md", "msg", "ppt", "pptx", "pdf"] }
			
 
				 validators = "0.21.0"
			
 
				 volcengine-python-sdk = {extras = ["ark"], version = "~1.0.98"}
			
 
				 websocket-client = "~1.7.0"
			
@@ -206,7 +207,7 @@ duckduckgo-search = "~6.3.0"
 
				 jsonpath-ng = "1.6.1"
			
 
				 matplotlib = "~3.8.2"
			
 
				 newspaper3k = "0.2.8"
			
 
				-nltk = "3.8.1"
			
 
				+nltk = "3.9.1"
			
 
				 numexpr = "~2.9.0"
			
 
				 pydub = "~0.25.1"
			
 
				 qrcode = "~7.4.2"
			
--- a/dev/pytest/pytest_vdb.sh
+++ b/dev/pytest/pytest_vdb.sh
@@ -6,9 +6,4 @@ pytest api/tests/integration_tests/vdb/chroma \
 
				   api/tests/integration_tests/vdb/pgvecto_rs \
			
 
				   api/tests/integration_tests/vdb/pgvector \
			
 
				   api/tests/integration_tests/vdb/qdrant \
			
 
				-  api/tests/integration_tests/vdb/weaviate \
			
 
				-  api/tests/integration_tests/vdb/elasticsearch \
			
 
				-  api/tests/integration_tests/vdb/vikingdb \
			
 
				-  api/tests/integration_tests/vdb/baidu \
			
 
				-  api/tests/integration_tests/vdb/tcvectordb \
			
 
				-  api/tests/integration_tests/vdb/upstash
			
 
				+  api/tests/integration_tests/vdb/weaviate