@@ -1,16 +1,187 @@
+import json
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
 from typing import Any
 
-from langchain.tools import PubmedQueryRun
 from pydantic import BaseModel, Field
 
 from core.tools.entities.tool_entities import ToolInvokeMessage
 from core.tools.tool.builtin_tool import BuiltinTool
 
 
+class PubMedAPIWrapper(BaseModel):
+    """
+    Wrapper around the PubMed API.
+
+    This wrapper uses the PubMed API to conduct searches and fetch
+    document summaries. By default, it returns the document summaries
+    of the top-k results of an input search.
+
+    Parameters:
+        top_k_results: number of top-scored documents used by the PubMed tool
+        load_max_docs: a limit on the number of loaded documents
+        load_all_available_meta:
+          if True: the `metadata` of the loaded Documents gets all available meta info
+            (see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch)
+          if False: the `metadata` gets only the most informative fields.
+    """
+
+    base_url_esearch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
+    base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+    max_retry: int = 5
+    sleep_time: float = 0.2
+
+    # Default values for the parameters
+    top_k_results: int = 3
+    load_max_docs: int = 25
+    MAX_QUERY_LENGTH: int = 300
+    doc_content_chars_max: int = 2000
+    load_all_available_meta: bool = False
+    email: str = "your_email@example.com"
+
+    def run(self, query: str) -> str:
+        """
+        Run a PubMed search and get the article meta information.
+        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
+        It uses only the most informative fields of the article meta information.
+        """
+
+        try:
+            # Retrieve the top-k results for the query
+            docs = [
+                f"Published: {result['pub_date']}\nTitle: {result['title']}\n"
+                f"Summary: {result['summary']}"
+                for result in self.load(query[: self.MAX_QUERY_LENGTH])
+            ]
+
+            # Join the results and limit the character count
+            return (
+                "\n\n".join(docs)[: self.doc_content_chars_max]
+                if docs
+                else "No good PubMed Result was found"
+            )
+        except Exception as ex:
+            return f"PubMed exception: {ex}"
+
+    def load(self, query: str) -> list[dict]:
+        """
+        Search PubMed for documents matching the query.
+        Return a list of dictionaries containing the document metadata.
+        """
+
+        url = (
+            self.base_url_esearch
+            + "db=pubmed&term="
+            + urllib.parse.quote(query)
+            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
+        )
+        result = urllib.request.urlopen(url)
+        text = result.read().decode("utf-8")
+        json_text = json.loads(text)
+
+        articles = []
+        webenv = json_text["esearchresult"]["webenv"]
+        for uid in json_text["esearchresult"]["idlist"]:
+            article = self.retrieve_article(uid, webenv)
+            articles.append(article)
+
+        # Return the list of article metadata dictionaries
+        return articles
+
+    def retrieve_article(self, uid: str, webenv: str) -> dict:
+        url = (
+            self.base_url_efetch
+            + "db=pubmed&retmode=xml&id="
+            + uid
+            + "&webenv="
+            + webenv
+        )
+
+        retry = 0
+        while True:
+            try:
+                result = urllib.request.urlopen(url)
+                break
+            except urllib.error.HTTPError as e:
+                if e.code == 429 and retry < self.max_retry:
+                    # Too Many Requests error:
+                    # wait for an exponentially increasing amount of time
+                    print(
+                        "Too Many Requests, "
+                        f"waiting for {self.sleep_time:.2f} seconds..."
+                    )
+                    time.sleep(self.sleep_time)
+                    self.sleep_time *= 2
+                    retry += 1
+                else:
+                    raise
+
+        xml_text = result.read().decode("utf-8")
+
+        # Get the title
+        title = ""
+        if "<ArticleTitle>" in xml_text and "</ArticleTitle>" in xml_text:
+            start_tag = "<ArticleTitle>"
+            end_tag = "</ArticleTitle>"
+            title = xml_text[
+                xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
+            ]
+
+        # Get the abstract
+        abstract = ""
+        if "<AbstractText>" in xml_text and "</AbstractText>" in xml_text:
+            start_tag = "<AbstractText>"
+            end_tag = "</AbstractText>"
+            abstract = xml_text[
+                xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
+            ]
+
+        # Get the publication date
+        pub_date = ""
+        if "<PubDate>" in xml_text and "</PubDate>" in xml_text:
+            start_tag = "<PubDate>"
+            end_tag = "</PubDate>"
+            pub_date = xml_text[
+                xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
+            ]
+
+        # Return the article as a dictionary
+        article = {
+            "uid": uid,
+            "title": title,
+            "summary": abstract,
+            "pub_date": pub_date,
+        }
+        return article
+
+
+class PubmedQueryRun(BaseModel):
+    """Tool that searches the PubMed API."""
+
+    name: str = "PubMed"
+    description: str = (
+        "A wrapper around PubMed.org. "
+        "Useful for when you need to answer questions about medicine, health, "
+        "biology, genetics, and other biomedical topics "
+        "from scientific articles on PubMed.org. "
+        "Input should be a search query."
+    )
+    api_wrapper: PubMedAPIWrapper = Field(default_factory=PubMedAPIWrapper)
+
+    def _run(
+        self,
+        query: str,
+    ) -> str:
+        """Use the PubMed tool."""
+        return self.api_wrapper.run(query)
+
+
 class PubMedInput(BaseModel):
     query: str = Field(..., description="Search query.")
 
-
 class PubMedSearchTool(BuiltinTool):
     """
     Tool for performing a search using PubMed search engine.
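For context: the wrapper added above is a thin client over two NCBI E-utilities endpoints, ESearch (PMID lookup with server-side history) and EFetch (record retrieval). A minimal standalone sketch of the same round trip, for anyone verifying the URLs by hand; the search term and retmax value here are illustrative, not part of the patch:

    import json
    import urllib.parse
    import urllib.request

    # ESearch: find PMIDs for a query, keeping server-side history (WebEnv)
    term = urllib.parse.quote("aspirin myocardial infarction")  # illustrative query
    esearch_url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
        f"db=pubmed&term={term}&retmode=json&retmax=3&usehistory=y"
    )
    payload = json.loads(urllib.request.urlopen(esearch_url).read().decode("utf-8"))
    uids = payload["esearchresult"]["idlist"]
    webenv = payload["esearchresult"]["webenv"]

    # EFetch: pull the XML record for each PMID via the same WebEnv
    for uid in uids:
        efetch_url = (
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
            f"db=pubmed&retmode=xml&id={uid}&webenv={webenv}"
        )
        xml_text = urllib.request.urlopen(efetch_url).read().decode("utf-8")
        print(xml_text[:200])  # first part of the record

Two design points worth noting: retrieve_article extracts the title, abstract, and date by substring slicing rather than an XML parser, so structured abstracts with several <AbstractText> sections are only partially captured; and the 429 backoff doubles self.sleep_time without ever resetting it, so one rate-limit burst permanently slows later calls on the same wrapper instance.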
@@ -34,7 +205,7 @@ class PubMedSearchTool(BuiltinTool):
 
         tool = PubmedQueryRun(args_schema=PubMedInput)
 
-        result = tool.run(query)
+        result = tool._run(query)
 
         return self.create_text_message(self.summary(user_id=user_id, content=result))
     
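To exercise the new path end to end, a quick sketch (assumes the classes above are importable from this module; the query string is illustrative):

    wrapper = PubMedAPIWrapper(top_k_results=2, doc_content_chars_max=1000)
    print(wrapper.run("statin adverse effects"))  # illustrative query

    tool = PubmedQueryRun(api_wrapper=wrapper)
    print(tool._run("statin adverse effects"))  # the call path PubMedSearchTool now uses

The switch from tool.run(query) to tool._run(query) follows from the import change at the top of the diff: the inlined PubmedQueryRun is a plain pydantic BaseModel that defines only _run, so the public run() entry point inherited from LangChain's tool class no longer exists.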