ソースを参照

Add new tool: Firecrawl (#3819)

Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: Yeuoly <admin@srmxy.cn>
Richards Tu 11 ヶ月 前
コミット
f26ad16af7

+ 3 - 0
api/core/tools/provider/builtin/firecrawl/_assets/icon.svg

@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="111" height="111" viewBox="0 0 111 111" fill="none">
+    <text x="0" y="90" font-family="Verdana" font-size="85" fill="black">🔥</text>
+</svg>

+ 23 - 0
api/core/tools/provider/builtin/firecrawl/firecrawl.py

@@ -0,0 +1,23 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.firecrawl.tools.crawl import CrawlTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
class FirecrawlProvider(BuiltinToolProviderController):
    """Tool provider for the Firecrawl web crawling/scraping API."""

    def _validate_credentials(self, credentials: dict) -> None:
        """Validate the Firecrawl API key by performing a minimal test crawl.

        Runs the Crawl tool against https://example.com limited to a single
        page. Any failure (invalid key, network error, API error) is surfaced
        as ToolProviderCredentialValidationError.

        :param credentials: provider credentials, must contain 'firecrawl_api_key'
        :raises ToolProviderCredentialValidationError: if the test crawl fails
        """
        try:
            CrawlTool().fork_tool_runtime(
                meta={"credentials": credentials}
            ).invoke(
                user_id='',
                tool_parameters={
                    "url": "https://example.com",
                    "includes": '',
                    "excludes": '',
                    "limit": 1,
                    "onlyMainContent": True,
                },
            )
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks instead of being flattened to a bare message.
            raise ToolProviderCredentialValidationError(str(e)) from e

+ 24 - 0
api/core/tools/provider/builtin/firecrawl/firecrawl.yaml

@@ -0,0 +1,24 @@
+identity:
+  author: Richards Tu
+  name: firecrawl
+  label:
+    en_US: Firecrawl
+    zh_Hans: Firecrawl
+  description:
+    en_US: Firecrawl API integration for web crawling and scraping.
+    zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
+  icon: icon.svg
+credentials_for_provider:
+  firecrawl_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: Firecrawl API Key
+      zh_Hans: Firecrawl API 密钥
+    placeholder:
+      en_US: Please input your Firecrawl API key
+      zh_Hans: 请输入您的 Firecrawl API 密钥
+    help:
+      en_US: Get your Firecrawl API key from your Firecrawl account settings.
+      zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
+    url: https://www.firecrawl.dev/account

+ 50 - 0
api/core/tools/provider/builtin/firecrawl/tools/crawl.py

@@ -0,0 +1,50 @@
+from typing import Any, Union
+
+from firecrawl import FirecrawlApp
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
class CrawlTool(BuiltinTool):
    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Crawl a website via the Firecrawl API and return results as markdown.

        Expected tool_parameters:
            url (str, required): the site to crawl.
            includes / excludes (str): comma-separated URL patterns.
            limit (int): maximum number of pages to crawl (default 5).
            onlyMainContent (bool): strip headers/nav/footers when True.

        :return: one text message containing a markdown section per crawled page
        """
        # Initialize the client with the provider-level API key.
        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'])

        options = {
            'crawlerOptions': {
                'excludes': self._split_patterns(tool_parameters.get('excludes')),
                'includes': self._split_patterns(tool_parameters.get('includes')),
                'limit': tool_parameters.get('limit', 5),
            },
            'pageOptions': {
                'onlyMainContent': tool_parameters.get('onlyMainContent', False),
            },
        }

        # Blocking call: waits until the crawl job completes.
        crawl_result = app.crawl_url(
            url=tool_parameters['url'],
            params=options,
            wait_until_done=True,
        )

        # Guard against a None/empty payload before iterating. (The original
        # except-handler referenced the loop variable `result`, which is
        # unbound when iteration fails to start, crashing with NameError.)
        sections = ["**Crawl Result**\n\n"]
        for result in crawl_result or []:
            metadata = result.get('metadata', {}) or {}
            sections.append(f"**- Title:** {metadata.get('title', '')}\n")
            sections.append(f"**- Description:** {metadata.get('description', '')}\n")
            sections.append(f"**- URL:** {metadata.get('ogUrl', '')}\n\n")
            sections.append(f"**- Web Content:**\n{result.get('markdown', '')}\n\n")
            sections.append("---\n\n")

        return self.create_text_message(''.join(sections))

    @staticmethod
    def _split_patterns(raw: Any) -> list:
        # ''/None -> []; 'a,b' -> ['a', 'b'] (same splitting as the original).
        return raw.split(',') if raw else []

+ 78 - 0
api/core/tools/provider/builtin/firecrawl/tools/crawl.yaml

@@ -0,0 +1,78 @@
+identity:
+  name: crawl
+  author: Richards Tu
+  label:
+    en_US: Crawl
+    zh_Hans: 爬取
+description:
+  human:
+    en_US: Extract data from a website by crawling through a URL.
+    zh_Hans: 通过URL从网站中提取数据。
+  llm: This tool initiates a web crawl to extract data from a specified URL. It allows configuring crawler options such as including or excluding URL patterns, generating alt text for images using LLMs (paid plan required), limiting the maximum number of pages to crawl, and returning only the main content of the page. The tool can return either a list of crawled documents or a list of URLs based on the provided options.
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL to crawl
+      zh_Hans: 要爬取的URL
+    human_description:
+      en_US: The URL of the website to crawl and extract data from.
+      zh_Hans: 要爬取并提取数据的网站URL。
+    llm_description: The URL of the website that needs to be crawled. This is a required parameter.
+    form: llm
+  - name: includes
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to include
+      zh_Hans: 要包含的URL模式
+    human_description:
+      en_US: Specify URL patterns to include during the crawl. Only pages matching these patterns will be crawled, you can use ',' to separate multiple patterns.
+      zh_Hans: 指定爬取过程中要包含的URL模式。只有与这些模式匹配的页面才会被爬取,可使用 ',' 分隔多个模式。
+    form: form
+    default: ''
+  - name: excludes
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    human_description:
+      en_US: Specify URL patterns to exclude during the crawl. Pages matching these patterns will be skipped, you can use ',' to separate multiple patterns.
+      zh_Hans: 指定爬取过程中要排除的URL模式。匹配这些模式的页面将被跳过,可使用 ',' 分隔多个模式。
+    form: form
+    default: 'blog/*'
+  - name: limit
+    type: number
+    required: false
+    label:
+      en_US: Maximum number of pages to crawl
+      zh_Hans: 最大爬取页面数
+    human_description:
+      en_US: Specify the maximum number of pages to crawl. The crawler will stop after reaching this limit.
+      zh_Hans: 指定要爬取的最大页面数。爬虫将在达到此限制后停止。
+    form: form
+    min: 1
+    max: 20
+    default: 5
+  - name: onlyMainContent
+    type: boolean
+    required: false
+    label:
+      en_US: Only return the main content of the page
+      zh_Hans: 仅返回页面的主要内容
+    human_description:
+      en_US: If enabled, the crawler will only return the main content of the page, excluding headers, navigation, footers, etc.
+      zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
+    form: form
+    options:
+      - value: true
+        label:
+          en_US: "Yes"
+          zh_Hans: 是
+      - value: false
+        label:
+          en_US: "No"
+          zh_Hans: 否
+    default: false

+ 2 - 1
api/requirements.txt

@@ -81,4 +81,5 @@ lxml==5.1.0
 xlrd~=2.0.1
 pydantic~=1.10.0
 pgvecto-rs==0.1.4
-oss2==2.15.0
+firecrawl-py==0.0.5
+oss2==2.15.0