vector_service.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. from typing import Optional
  2. from core.rag.datasource.keyword.keyword_factory import Keyword
  3. from core.rag.datasource.vdb.vector_factory import Vector
  4. from core.rag.models.document import Document
  5. from models.dataset import Dataset, DocumentSegment
  6. class VectorService:
  7. @classmethod
  8. def create_segments_vector(cls, keywords_list: Optional[list[list[str]]],
  9. segments: list[DocumentSegment], dataset: Dataset):
  10. documents = []
  11. for segment in segments:
  12. document = Document(
  13. page_content=segment.content,
  14. metadata={
  15. "doc_id": segment.index_node_id,
  16. "doc_hash": segment.index_node_hash,
  17. "document_id": segment.document_id,
  18. "dataset_id": segment.dataset_id,
  19. }
  20. )
  21. documents.append(document)
  22. if dataset.indexing_technique == 'high_quality':
  23. # save vector index
  24. vector = Vector(
  25. dataset=dataset
  26. )
  27. vector.add_texts(documents, duplicate_check=True)
  28. # save keyword index
  29. keyword = Keyword(dataset)
  30. if keywords_list and len(keywords_list) > 0:
  31. keyword.add_texts(documents, keyword_list=keywords_list)
  32. else:
  33. keyword.add_texts(documents)
  34. @classmethod
  35. def update_segment_vector(cls, keywords: Optional[list[str]], segment: DocumentSegment, dataset: Dataset):
  36. # update segment index task
  37. # format new index
  38. document = Document(
  39. page_content=segment.content,
  40. metadata={
  41. "doc_id": segment.index_node_id,
  42. "doc_hash": segment.index_node_hash,
  43. "document_id": segment.document_id,
  44. "dataset_id": segment.dataset_id,
  45. }
  46. )
  47. if dataset.indexing_technique == 'high_quality':
  48. # update vector index
  49. vector = Vector(
  50. dataset=dataset
  51. )
  52. vector.delete_by_ids([segment.index_node_id])
  53. vector.add_texts([document], duplicate_check=True)
  54. # update keyword index
  55. keyword = Keyword(dataset)
  56. keyword.delete_by_ids([segment.index_node_id])
  57. # save keyword index
  58. if keywords and len(keywords) > 0:
  59. keyword.add_texts([document], keywords_list=[keywords])
  60. else:
  61. keyword.add_texts([document])