|
@@ -75,7 +75,7 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
|
|
|
)
|
|
|
|
|
|
|
|
|
-def _extract_text(*, file_content: bytes, mime_type: str) -> str:
|
|
|
+def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
|
|
"""Extract text from a file based on its MIME type."""
|
|
|
if mime_type.startswith("text/plain") or mime_type in {"text/html", "text/htm", "text/markdown", "text/xml"}:
|
|
|
return _extract_text_from_plain_text(file_content)
|
|
@@ -107,6 +107,33 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str:
|
|
|
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
|
|
|
|
|
|
|
|
+def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
|
|
|
+ """Extract text from a file based on its file extension."""
|
|
|
+ match file_extension:
|
|
|
+ case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml":
|
|
|
+ return _extract_text_from_plain_text(file_content)
|
|
|
+ case ".pdf":
|
|
|
+ return _extract_text_from_pdf(file_content)
|
|
|
+ case ".doc" | ".docx":
|
|
|
+ return _extract_text_from_doc(file_content)
|
|
|
+ case ".csv":
|
|
|
+ return _extract_text_from_csv(file_content)
|
|
|
+ case ".xls" | ".xlsx":
|
|
|
+ return _extract_text_from_excel(file_content)
|
|
|
+ case ".ppt":
|
|
|
+ return _extract_text_from_ppt(file_content)
|
|
|
+ case ".pptx":
|
|
|
+ return _extract_text_from_pptx(file_content)
|
|
|
+ case ".epub":
|
|
|
+ return _extract_text_from_epub(file_content)
|
|
|
+ case ".eml":
|
|
|
+ return _extract_text_from_eml(file_content)
|
|
|
+ case ".msg":
|
|
|
+ return _extract_text_from_msg(file_content)
|
|
|
+ case _:
|
|
|
+ raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
|
|
|
+
|
|
|
+
|
|
|
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
|
|
try:
|
|
|
return file_content.decode("utf-8")
|
|
@@ -159,7 +186,10 @@ def _extract_text_from_file(file: File):
|
|
|
if file.mime_type is None:
|
|
|
raise UnsupportedFileTypeError("Unable to determine file type: MIME type is missing")
|
|
|
file_content = _download_file_content(file)
|
|
|
- extracted_text = _extract_text(file_content=file_content, mime_type=file.mime_type)
|
|
|
+ if file.transfer_method == FileTransferMethod.REMOTE_URL:
|
|
|
+ extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type)
|
|
|
+ else:
|
|
|
+ extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
|
|
|
return extracted_text
|
|
|
|
|
|
|
|
@@ -172,7 +202,7 @@ def _extract_text_from_csv(file_content: bytes) -> str:
|
|
|
if not rows:
|
|
|
return ""
|
|
|
|
|
|
- # Create markdown table
|
|
|
+ # Create Markdown table
|
|
|
markdown_table = "| " + " | ".join(rows[0]) + " |\n"
|
|
|
markdown_table += "| " + " | ".join(["---"] * len(rows[0])) + " |\n"
|
|
|
for row in rows[1:]:
|
|
@@ -192,7 +222,7 @@ def _extract_text_from_excel(file_content: bytes) -> str:
|
|
|
# Drop rows where all elements are NaN
|
|
|
df.dropna(how="all", inplace=True)
|
|
|
|
|
|
- # Convert DataFrame to markdown table
|
|
|
+ # Convert DataFrame to Markdown table
|
|
|
markdown_table = df.to_markdown(index=False)
|
|
|
return markdown_table
|
|
|
except Exception as e:
|