dataset_tool_builder.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. from typing import Optional
  2. from langchain.callbacks import CallbackManager
  3. from llama_index.langchain_helpers.agents import IndexToolConfig
  4. from core.callback_handler.dataset_tool_callback_handler import DatasetToolCallbackHandler
  5. from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
  6. from core.callback_handler.std_out_callback_handler import DifyStdOutCallbackHandler
  7. from core.index.keyword_table_index import KeywordTableIndex
  8. from core.index.vector_index import VectorIndex
  9. from core.prompt.prompts import QUERY_KEYWORD_EXTRACT_TEMPLATE
  10. from core.tool.llama_index_tool import EnhanceLlamaIndexTool
  11. from models.dataset import Dataset
  class DatasetToolBuilder:
      """Factory that wraps a dataset's query index in an ``EnhanceLlamaIndexTool``."""

      @classmethod
      def build_dataset_tool(cls, dataset: Dataset,
                             response_mode: str = "no_synthesizer",
                             callback_handler: Optional[DatasetToolCallbackHandler] = None):
          """Build a query tool for ``dataset``.

          The index is chosen from ``dataset.indexing_technique``: ``"economy"``
          uses ``KeywordTableIndex``; any other value uses ``VectorIndex``.

          Args:
              dataset: Dataset whose ``indexing_technique``, ``id``, ``name`` and
                  ``description`` are read here.
              response_mode: Forwarded as ``"response_mode"`` in the index query
                  kwargs.
              callback_handler: Optional handler registered ahead of the
                  ``DifyStdOutCallbackHandler`` in the tool's callback manager.
                  When ``None`` it is left out rather than registered.

          Returns:
              The value of ``EnhanceLlamaIndexTool.from_tool_config(...)``, or
              ``None`` when the selected index exposes no ``query_index``.
          """
          if dataset.indexing_technique == "economy":
              # use keyword table query
              index = KeywordTableIndex(dataset=dataset).query_index
              if not index:
                  return None

              query_kwargs = {
                  "mode": "default",
                  "response_mode": response_mode,
                  "query_keyword_extract_template": QUERY_KEYWORD_EXTRACT_TEMPLATE,
                  "max_keywords_per_query": 5,
                  # If num_chunks_per_query is too large,
                  # it will slow down the synthesis process due to multiple iterations of refinement.
                  "num_chunks_per_query": 2,
              }
          else:
              index = VectorIndex(dataset=dataset).query_index
              if not index:
                  return None

              query_kwargs = {
                  "mode": "default",
                  "response_mode": response_mode,
                  # If top_k is too large,
                  # it will slow down the synthesis process due to multiple iterations of refinement.
                  "similarity_top_k": 2,
              }

          # fulfill description when it is empty
          description = dataset.description
          if not description:
              description = 'useful for when you want to answer queries about the ' + dataset.name

          # callback_handler defaults to None; a None entry in the handler list
          # would break the callback manager when it dispatches events, so only
          # real handlers are registered (original ordering preserved).
          handlers = []
          if callback_handler is not None:
              handlers.append(callback_handler)
          handlers.append(DifyStdOutCallbackHandler())

          index_tool_config = IndexToolConfig(
              index=index,
              name=f"dataset-{dataset.id}",
              description=description,
              index_query_kwargs=query_kwargs,
              # NOTE: "return_direct" is not passed in tool_kwargs. Per the
              # original author's note it selects whether to return LLM results
              # directly or process the output data with an Output Parser.
              tool_kwargs={
                  "callback_manager": CallbackManager(handlers)
              },
          )

          index_callback_handler = DatasetIndexToolCallbackHandler(dataset_id=dataset.id)

          return EnhanceLlamaIndexTool.from_tool_config(
              tool_config=index_tool_config,
              callback_handler=index_callback_handler
          )