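"""Keyword-table index for datasets.

Wraps a jieba-based GPT keyword-table index (llama_index) whose serialized
structure is persisted as a DatasetKeywordTable row, and keeps per-segment
keywords in sync via DocumentSegment.
"""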
import json
from typing import List, Optional

from llama_index import ServiceContext, LLMPredictor, OpenAIEmbedding
from llama_index.data_structs import KeywordTable, Node
from llama_index.indices.keyword_table.base import BaseGPTKeywordTableIndex
from llama_index.indices.registry import load_index_struct_from_dict

from core.docstore.dataset_docstore import DatesetDocumentStore
from core.docstore.empty_docstore import EmptyDocumentStore
from core.index.index_builder import IndexBuilder
from core.index.keyword_table.jieba_keyword_table import GPTJIEBAKeywordTableIndex
from core.llm.llm_builder import LLMBuilder
from extensions.ext_database import db
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment


class KeywordTableIndex:
    """Manages the keyword-table index for a single dataset."""

    def __init__(self, dataset: Dataset):
        self._dataset = dataset

    def add_nodes(self, nodes: List[Node]):
        """Extract keywords for each node and merge them into the stored table."""
        # Keyword extraction is handled locally by jieba, so a placeholder
        # ('fake') LLM is sufficient here; no real model calls are made.
        llm = LLMBuilder.to_llm(
            tenant_id=self._dataset.tenant_id,
            model_name='fake'
        )

        service_context = ServiceContext.from_defaults(
            llm_predictor=LLMPredictor(llm=llm),
            embed_model=OpenAIEmbedding()
        )

        # Load the existing keyword table if one has been persisted;
        # otherwise start from an empty structure.
        dataset_keyword_table = self.get_keyword_table()
        if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
            index_struct = KeywordTable()
        else:
            index_struct_dict = dataset_keyword_table.keyword_table_dict
            index_struct: KeywordTable = load_index_struct_from_dict(index_struct_dict)

        # Create the index around the (possibly empty) structure. An empty
        # docstore suffices because only the keyword mapping is updated here.
        index = GPTJIEBAKeywordTableIndex(
            index_struct=index_struct,
            docstore=EmptyDocumentStore(),
            service_context=service_context
        )

        for node in nodes:
            keywords = index._extract_keywords(node.get_text())
            self.update_segment_keywords(node.doc_id, list(keywords))
            index._index_struct.add_node(list(keywords), node)

        # Persist the updated table, creating the row on first write.
        index_struct_dict = index.index_struct.to_dict()

        if not dataset_keyword_table:
            dataset_keyword_table = DatasetKeywordTable(
                dataset_id=self._dataset.id,
                keyword_table=json.dumps(index_struct_dict)
            )
            db.session.add(dataset_keyword_table)
        else:
            dataset_keyword_table.keyword_table = json.dumps(index_struct_dict)

        db.session.commit()

    def del_nodes(self, node_ids: List[str]):
        """Remove the given nodes from the stored keyword table."""
        llm = LLMBuilder.to_llm(
            tenant_id=self._dataset.tenant_id,
            model_name='fake'
        )

        service_context = ServiceContext.from_defaults(
            llm_predictor=LLMPredictor(llm=llm),
            embed_model=OpenAIEmbedding()
        )

        # Nothing to delete if no keyword table has been persisted yet.
        dataset_keyword_table = self.get_keyword_table()
        if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
            return

        index_struct: KeywordTable = load_index_struct_from_dict(
            dataset_keyword_table.keyword_table_dict
        )

        index = GPTJIEBAKeywordTableIndex(
            index_struct=index_struct,
            docstore=EmptyDocumentStore(),
            service_context=service_context
        )

        for node_id in node_ids:
            index.delete(node_id)

        # Persist the updated table; the early return above guarantees the
        # row already exists, so no creation branch is needed here.
        dataset_keyword_table.keyword_table = json.dumps(index.index_struct.to_dict())

        db.session.commit()

    @property
    def query_index(self) -> Optional[BaseGPTKeywordTableIndex]:
        """Rebuild a queryable index from the persisted keyword table.

        Returns None when no keyword table has been stored for the dataset.
        """
        docstore = DatesetDocumentStore(
            dataset=self._dataset,
            user_id=self._dataset.created_by,
            embedding_model_name="text-embedding-ada-002"
        )

        service_context = IndexBuilder.get_default_service_context(tenant_id=self._dataset.tenant_id)

        dataset_keyword_table = self.get_keyword_table()
        if not dataset_keyword_table or not dataset_keyword_table.keyword_table_dict:
            return None

        index_struct: KeywordTable = load_index_struct_from_dict(dataset_keyword_table.keyword_table_dict)

        return GPTJIEBAKeywordTableIndex(index_struct=index_struct, docstore=docstore, service_context=service_context)

    def get_keyword_table(self) -> Optional[DatasetKeywordTable]:
        """Return the dataset's persisted keyword-table row, if any."""
        return self._dataset.dataset_keyword_table

    def update_segment_keywords(self, node_id: str, keywords: List[str]):
        """Mirror the extracted keywords onto the matching document segment."""
        document_segment = db.session.query(DocumentSegment).filter(
            DocumentSegment.index_node_id == node_id
        ).first()
        if document_segment:
            document_segment.keywords = keywords
            db.session.commit()
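

# Usage sketch (hypothetical names; assumes a persisted Dataset row and
# llama_index Nodes produced elsewhere by the ingestion pipeline):
#
#   keyword_index = KeywordTableIndex(dataset)
#   keyword_index.add_nodes(nodes)         # extract keywords, persist the table
#   index = keyword_index.query_index      # rebuild a queryable index, or None
#   keyword_index.del_nodes([n.doc_id for n in nodes])  # drop stale nodes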