# keyword_table_index.py
import json
from typing import List, Optional

from llama_index import ServiceContext, LLMPredictor, OpenAIEmbedding
from llama_index.data_structs import KeywordTable, Node
from llama_index.indices.keyword_table.base import BaseGPTKeywordTableIndex
from llama_index.indices.registry import load_index_struct_from_dict

from core.docstore.dataset_docstore import DatesetDocumentStore
from core.docstore.empty_docstore import EmptyDocumentStore
from core.index.index_builder import IndexBuilder
from core.index.keyword_table.jieba_keyword_table import GPTJIEBAKeywordTableIndex
from core.llm.llm_builder import LLMBuilder
from extensions.ext_database import db
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment
  14. class KeywordTableIndex:
  15. def __init__(self, dataset: Dataset):
  16. self._dataset = dataset
  17. def add_nodes(self, nodes: List[Node]):
  18. llm = LLMBuilder.to_llm(
  19. tenant_id=self._dataset.tenant_id,
  20. model_name='fake'
  21. )
  22. service_context = ServiceContext.from_defaults(
  23. llm_predictor=LLMPredictor(llm=llm),
  24. embed_model=OpenAIEmbedding()
  25. )
  26. dataset_keyword_table = self.get_keyword_table()
  27. if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
  28. index_struct = KeywordTable()
  29. else:
  30. index_struct_dict = dataset_keyword_table.keyword_table_dict
  31. index_struct: KeywordTable = load_index_struct_from_dict(index_struct_dict)
  32. # create index
  33. index = GPTJIEBAKeywordTableIndex(
  34. index_struct=index_struct,
  35. docstore=EmptyDocumentStore(),
  36. service_context=service_context
  37. )
  38. for node in nodes:
  39. keywords = index._extract_keywords(node.get_text())
  40. self.update_segment_keywords(node.doc_id, list(keywords))
  41. index._index_struct.add_node(list(keywords), node)
  42. index_struct_dict = index.index_struct.to_dict()
  43. if not dataset_keyword_table:
  44. dataset_keyword_table = DatasetKeywordTable(
  45. dataset_id=self._dataset.id,
  46. keyword_table=json.dumps(index_struct_dict)
  47. )
  48. db.session.add(dataset_keyword_table)
  49. else:
  50. dataset_keyword_table.keyword_table = json.dumps(index_struct_dict)
  51. db.session.commit()
  52. def del_nodes(self, node_ids: List[str]):
  53. llm = LLMBuilder.to_llm(
  54. tenant_id=self._dataset.tenant_id,
  55. model_name='fake'
  56. )
  57. service_context = ServiceContext.from_defaults(
  58. llm_predictor=LLMPredictor(llm=llm),
  59. embed_model=OpenAIEmbedding()
  60. )
  61. dataset_keyword_table = self.get_keyword_table()
  62. if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
  63. return
  64. else:
  65. index_struct_dict = dataset_keyword_table.keyword_table_dict
  66. index_struct: KeywordTable = load_index_struct_from_dict(index_struct_dict)
  67. # create index
  68. index = GPTJIEBAKeywordTableIndex(
  69. index_struct=index_struct,
  70. docstore=EmptyDocumentStore(),
  71. service_context=service_context
  72. )
  73. for node_id in node_ids:
  74. index.delete(node_id)
  75. index_struct_dict = index.index_struct.to_dict()
  76. if not dataset_keyword_table:
  77. dataset_keyword_table = DatasetKeywordTable(
  78. dataset_id=self._dataset.id,
  79. keyword_table=json.dumps(index_struct_dict)
  80. )
  81. db.session.add(dataset_keyword_table)
  82. else:
  83. dataset_keyword_table.keyword_table = json.dumps(index_struct_dict)
  84. db.session.commit()
  85. @property
  86. def query_index(self) -> Optional[BaseGPTKeywordTableIndex]:
  87. docstore = DatesetDocumentStore(
  88. dataset=self._dataset,
  89. user_id=self._dataset.created_by,
  90. embedding_model_name="text-embedding-ada-002"
  91. )
  92. service_context = IndexBuilder.get_default_service_context(tenant_id=self._dataset.tenant_id)
  93. dataset_keyword_table = self.get_keyword_table()
  94. if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
  95. return None
  96. index_struct: KeywordTable = load_index_struct_from_dict(dataset_keyword_table.keyword_table_dict)
  97. return GPTJIEBAKeywordTableIndex(index_struct=index_struct, docstore=docstore, service_context=service_context)
  98. def get_keyword_table(self):
  99. dataset_keyword_table = self._dataset.dataset_keyword_table
  100. if dataset_keyword_table:
  101. return dataset_keyword_table
  102. return None
  103. def update_segment_keywords(self, node_id: str, keywords: List[str]):
  104. document_segment = db.session.query(DocumentSegment).filter(DocumentSegment.index_node_id == node_id).first()
  105. if document_segment:
  106. document_segment.keywords = keywords
  107. db.session.commit()