dataset.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. import json
  2. import pickle
  3. from json import JSONDecodeError
  4. from sqlalchemy import func
  5. from sqlalchemy.dialects.postgresql import JSONB, UUID
  6. from extensions.ext_database import db
  7. from models.account import Account
  8. from models.model import App, UploadFile
  9. class Dataset(db.Model):
  10. __tablename__ = 'datasets'
  11. __table_args__ = (
  12. db.PrimaryKeyConstraint('id', name='dataset_pkey'),
  13. db.Index('dataset_tenant_idx', 'tenant_id'),
  14. db.Index('retrieval_model_idx', "retrieval_model", postgresql_using='gin')
  15. )
  16. INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy', None]
  17. id = db.Column(UUID, server_default=db.text('uuid_generate_v4()'))
  18. tenant_id = db.Column(UUID, nullable=False)
  19. name = db.Column(db.String(255), nullable=False)
  20. description = db.Column(db.Text, nullable=True)
  21. provider = db.Column(db.String(255), nullable=False,
  22. server_default=db.text("'vendor'::character varying"))
  23. permission = db.Column(db.String(255), nullable=False,
  24. server_default=db.text("'only_me'::character varying"))
  25. data_source_type = db.Column(db.String(255))
  26. indexing_technique = db.Column(db.String(255), nullable=True)
  27. index_struct = db.Column(db.Text, nullable=True)
  28. created_by = db.Column(UUID, nullable=False)
  29. created_at = db.Column(db.DateTime, nullable=False,
  30. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  31. updated_by = db.Column(UUID, nullable=True)
  32. updated_at = db.Column(db.DateTime, nullable=False,
  33. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  34. embedding_model = db.Column(db.String(255), nullable=True)
  35. embedding_model_provider = db.Column(db.String(255), nullable=True)
  36. collection_binding_id = db.Column(UUID, nullable=True)
  37. retrieval_model = db.Column(JSONB, nullable=True)
  38. @property
  39. def dataset_keyword_table(self):
  40. dataset_keyword_table = db.session.query(DatasetKeywordTable).filter(
  41. DatasetKeywordTable.dataset_id == self.id).first()
  42. if dataset_keyword_table:
  43. return dataset_keyword_table
  44. return None
  45. @property
  46. def index_struct_dict(self):
  47. return json.loads(self.index_struct) if self.index_struct else None
  48. @property
  49. def created_by_account(self):
  50. return Account.query.get(self.created_by)
  51. @property
  52. def latest_process_rule(self):
  53. return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \
  54. .order_by(DatasetProcessRule.created_at.desc()).first()
  55. @property
  56. def app_count(self):
  57. return db.session.query(func.count(AppDatasetJoin.id)).filter(AppDatasetJoin.dataset_id == self.id).scalar()
  58. @property
  59. def document_count(self):
  60. return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()
  61. @property
  62. def available_document_count(self):
  63. return db.session.query(func.count(Document.id)).filter(
  64. Document.dataset_id == self.id,
  65. Document.indexing_status == 'completed',
  66. Document.enabled == True,
  67. Document.archived == False
  68. ).scalar()
  69. @property
  70. def available_segment_count(self):
  71. return db.session.query(func.count(DocumentSegment.id)).filter(
  72. DocumentSegment.dataset_id == self.id,
  73. DocumentSegment.status == 'completed',
  74. DocumentSegment.enabled == True
  75. ).scalar()
  76. @property
  77. def word_count(self):
  78. return Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) \
  79. .filter(Document.dataset_id == self.id).scalar()
  80. @property
  81. def doc_form(self):
  82. document = db.session.query(Document).filter(
  83. Document.dataset_id == self.id).first()
  84. if document:
  85. return document.doc_form
  86. return None
  87. @property
  88. def retrieval_model_dict(self):
  89. default_retrieval_model = {
  90. 'search_method': 'semantic_search',
  91. 'reranking_enable': False,
  92. 'reranking_model': {
  93. 'reranking_provider_name': '',
  94. 'reranking_model_name': ''
  95. },
  96. 'top_k': 2,
  97. 'score_threshold_enabled': False
  98. }
  99. return self.retrieval_model if self.retrieval_model else default_retrieval_model
  100. @staticmethod
  101. def gen_collection_name_by_id(dataset_id: str) -> str:
  102. normalized_dataset_id = dataset_id.replace("-", "_")
  103. return f'Vector_index_{normalized_dataset_id}_Node'
  104. class DatasetProcessRule(db.Model):
  105. __tablename__ = 'dataset_process_rules'
  106. __table_args__ = (
  107. db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'),
  108. db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'),
  109. )
  110. id = db.Column(UUID, nullable=False,
  111. server_default=db.text('uuid_generate_v4()'))
  112. dataset_id = db.Column(UUID, nullable=False)
  113. mode = db.Column(db.String(255), nullable=False,
  114. server_default=db.text("'automatic'::character varying"))
  115. rules = db.Column(db.Text, nullable=True)
  116. created_by = db.Column(UUID, nullable=False)
  117. created_at = db.Column(db.DateTime, nullable=False,
  118. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  119. MODES = ['automatic', 'custom']
  120. PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails']
  121. AUTOMATIC_RULES = {
  122. 'pre_processing_rules': [
  123. {'id': 'remove_extra_spaces', 'enabled': True},
  124. {'id': 'remove_urls_emails', 'enabled': False}
  125. ],
  126. 'segmentation': {
  127. 'delimiter': '\n',
  128. 'max_tokens': 500,
  129. 'chunk_overlap': 50
  130. }
  131. }
  132. def to_dict(self):
  133. return {
  134. 'id': self.id,
  135. 'dataset_id': self.dataset_id,
  136. 'mode': self.mode,
  137. 'rules': self.rules_dict,
  138. 'created_by': self.created_by,
  139. 'created_at': self.created_at,
  140. }
  141. @property
  142. def rules_dict(self):
  143. try:
  144. return json.loads(self.rules) if self.rules else None
  145. except JSONDecodeError:
  146. return None
  147. class Document(db.Model):
  148. __tablename__ = 'documents'
  149. __table_args__ = (
  150. db.PrimaryKeyConstraint('id', name='document_pkey'),
  151. db.Index('document_dataset_id_idx', 'dataset_id'),
  152. db.Index('document_is_paused_idx', 'is_paused'),
  153. db.Index('document_tenant_idx', 'tenant_id'),
  154. )
  155. # initial fields
  156. id = db.Column(UUID, nullable=False,
  157. server_default=db.text('uuid_generate_v4()'))
  158. tenant_id = db.Column(UUID, nullable=False)
  159. dataset_id = db.Column(UUID, nullable=False)
  160. position = db.Column(db.Integer, nullable=False)
  161. data_source_type = db.Column(db.String(255), nullable=False)
  162. data_source_info = db.Column(db.Text, nullable=True)
  163. dataset_process_rule_id = db.Column(UUID, nullable=True)
  164. batch = db.Column(db.String(255), nullable=False)
  165. name = db.Column(db.String(255), nullable=False)
  166. created_from = db.Column(db.String(255), nullable=False)
  167. created_by = db.Column(UUID, nullable=False)
  168. created_api_request_id = db.Column(UUID, nullable=True)
  169. created_at = db.Column(db.DateTime, nullable=False,
  170. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  171. # start processing
  172. processing_started_at = db.Column(db.DateTime, nullable=True)
  173. # parsing
  174. file_id = db.Column(db.Text, nullable=True)
  175. word_count = db.Column(db.Integer, nullable=True)
  176. parsing_completed_at = db.Column(db.DateTime, nullable=True)
  177. # cleaning
  178. cleaning_completed_at = db.Column(db.DateTime, nullable=True)
  179. # split
  180. splitting_completed_at = db.Column(db.DateTime, nullable=True)
  181. # indexing
  182. tokens = db.Column(db.Integer, nullable=True)
  183. indexing_latency = db.Column(db.Float, nullable=True)
  184. completed_at = db.Column(db.DateTime, nullable=True)
  185. # pause
  186. is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))
  187. paused_by = db.Column(UUID, nullable=True)
  188. paused_at = db.Column(db.DateTime, nullable=True)
  189. # error
  190. error = db.Column(db.Text, nullable=True)
  191. stopped_at = db.Column(db.DateTime, nullable=True)
  192. # basic fields
  193. indexing_status = db.Column(db.String(
  194. 255), nullable=False, server_default=db.text("'waiting'::character varying"))
  195. enabled = db.Column(db.Boolean, nullable=False,
  196. server_default=db.text('true'))
  197. disabled_at = db.Column(db.DateTime, nullable=True)
  198. disabled_by = db.Column(UUID, nullable=True)
  199. archived = db.Column(db.Boolean, nullable=False,
  200. server_default=db.text('false'))
  201. archived_reason = db.Column(db.String(255), nullable=True)
  202. archived_by = db.Column(UUID, nullable=True)
  203. archived_at = db.Column(db.DateTime, nullable=True)
  204. updated_at = db.Column(db.DateTime, nullable=False,
  205. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  206. doc_type = db.Column(db.String(40), nullable=True)
  207. doc_metadata = db.Column(db.JSON, nullable=True)
  208. doc_form = db.Column(db.String(
  209. 255), nullable=False, server_default=db.text("'text_model'::character varying"))
  210. doc_language = db.Column(db.String(255), nullable=True)
  211. DATA_SOURCES = ['upload_file', 'notion_import']
  212. @property
  213. def display_status(self):
  214. status = None
  215. if self.indexing_status == 'waiting':
  216. status = 'queuing'
  217. elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused:
  218. status = 'paused'
  219. elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']:
  220. status = 'indexing'
  221. elif self.indexing_status == 'error':
  222. status = 'error'
  223. elif self.indexing_status == 'completed' and not self.archived and self.enabled:
  224. status = 'available'
  225. elif self.indexing_status == 'completed' and not self.archived and not self.enabled:
  226. status = 'disabled'
  227. elif self.indexing_status == 'completed' and self.archived:
  228. status = 'archived'
  229. return status
  230. @property
  231. def data_source_info_dict(self):
  232. if self.data_source_info:
  233. try:
  234. data_source_info_dict = json.loads(self.data_source_info)
  235. except JSONDecodeError:
  236. data_source_info_dict = {}
  237. return data_source_info_dict
  238. return None
  239. @property
  240. def data_source_detail_dict(self):
  241. if self.data_source_info:
  242. if self.data_source_type == 'upload_file':
  243. data_source_info_dict = json.loads(self.data_source_info)
  244. file_detail = db.session.query(UploadFile). \
  245. filter(UploadFile.id == data_source_info_dict['upload_file_id']). \
  246. one_or_none()
  247. if file_detail:
  248. return {
  249. 'upload_file': {
  250. 'id': file_detail.id,
  251. 'name': file_detail.name,
  252. 'size': file_detail.size,
  253. 'extension': file_detail.extension,
  254. 'mime_type': file_detail.mime_type,
  255. 'created_by': file_detail.created_by,
  256. 'created_at': file_detail.created_at.timestamp()
  257. }
  258. }
  259. elif self.data_source_type == 'notion_import':
  260. return json.loads(self.data_source_info)
  261. return {}
  262. @property
  263. def average_segment_length(self):
  264. if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
  265. return self.word_count // self.segment_count
  266. return 0
  267. @property
  268. def dataset_process_rule(self):
  269. if self.dataset_process_rule_id:
  270. return DatasetProcessRule.query.get(self.dataset_process_rule_id)
  271. return None
  272. @property
  273. def dataset(self):
  274. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
  275. @property
  276. def segment_count(self):
  277. return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()
  278. @property
  279. def hit_count(self):
  280. return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) \
  281. .filter(DocumentSegment.document_id == self.id).scalar()
  282. class DocumentSegment(db.Model):
  283. __tablename__ = 'document_segments'
  284. __table_args__ = (
  285. db.PrimaryKeyConstraint('id', name='document_segment_pkey'),
  286. db.Index('document_segment_dataset_id_idx', 'dataset_id'),
  287. db.Index('document_segment_document_id_idx', 'document_id'),
  288. db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'),
  289. db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'),
  290. db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'),
  291. db.Index('document_segment_tenant_idx', 'tenant_id'),
  292. )
  293. # initial fields
  294. id = db.Column(UUID, nullable=False,
  295. server_default=db.text('uuid_generate_v4()'))
  296. tenant_id = db.Column(UUID, nullable=False)
  297. dataset_id = db.Column(UUID, nullable=False)
  298. document_id = db.Column(UUID, nullable=False)
  299. position = db.Column(db.Integer, nullable=False)
  300. content = db.Column(db.Text, nullable=False)
  301. answer = db.Column(db.Text, nullable=True)
  302. word_count = db.Column(db.Integer, nullable=False)
  303. tokens = db.Column(db.Integer, nullable=False)
  304. # indexing fields
  305. keywords = db.Column(db.JSON, nullable=True)
  306. index_node_id = db.Column(db.String(255), nullable=True)
  307. index_node_hash = db.Column(db.String(255), nullable=True)
  308. # basic fields
  309. hit_count = db.Column(db.Integer, nullable=False, default=0)
  310. enabled = db.Column(db.Boolean, nullable=False,
  311. server_default=db.text('true'))
  312. disabled_at = db.Column(db.DateTime, nullable=True)
  313. disabled_by = db.Column(UUID, nullable=True)
  314. status = db.Column(db.String(255), nullable=False,
  315. server_default=db.text("'waiting'::character varying"))
  316. created_by = db.Column(UUID, nullable=False)
  317. created_at = db.Column(db.DateTime, nullable=False,
  318. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  319. updated_by = db.Column(UUID, nullable=True)
  320. updated_at = db.Column(db.DateTime, nullable=False,
  321. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  322. indexing_at = db.Column(db.DateTime, nullable=True)
  323. completed_at = db.Column(db.DateTime, nullable=True)
  324. error = db.Column(db.Text, nullable=True)
  325. stopped_at = db.Column(db.DateTime, nullable=True)
  326. @property
  327. def dataset(self):
  328. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
  329. @property
  330. def document(self):
  331. return db.session.query(Document).filter(Document.id == self.document_id).first()
  332. @property
  333. def previous_segment(self):
  334. return db.session.query(DocumentSegment).filter(
  335. DocumentSegment.document_id == self.document_id,
  336. DocumentSegment.position == self.position - 1
  337. ).first()
  338. @property
  339. def next_segment(self):
  340. return db.session.query(DocumentSegment).filter(
  341. DocumentSegment.document_id == self.document_id,
  342. DocumentSegment.position == self.position + 1
  343. ).first()
  344. class AppDatasetJoin(db.Model):
  345. __tablename__ = 'app_dataset_joins'
  346. __table_args__ = (
  347. db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'),
  348. db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'),
  349. )
  350. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  351. app_id = db.Column(UUID, nullable=False)
  352. dataset_id = db.Column(UUID, nullable=False)
  353. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  354. @property
  355. def app(self):
  356. return App.query.get(self.app_id)
  357. class DatasetQuery(db.Model):
  358. __tablename__ = 'dataset_queries'
  359. __table_args__ = (
  360. db.PrimaryKeyConstraint('id', name='dataset_query_pkey'),
  361. db.Index('dataset_query_dataset_id_idx', 'dataset_id'),
  362. )
  363. id = db.Column(UUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  364. dataset_id = db.Column(UUID, nullable=False)
  365. content = db.Column(db.Text, nullable=False)
  366. source = db.Column(db.String(255), nullable=False)
  367. source_app_id = db.Column(UUID, nullable=True)
  368. created_by_role = db.Column(db.String, nullable=False)
  369. created_by = db.Column(UUID, nullable=False)
  370. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  371. class DatasetKeywordTable(db.Model):
  372. __tablename__ = 'dataset_keyword_tables'
  373. __table_args__ = (
  374. db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'),
  375. db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'),
  376. )
  377. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  378. dataset_id = db.Column(UUID, nullable=False, unique=True)
  379. keyword_table = db.Column(db.Text, nullable=False)
  380. @property
  381. def keyword_table_dict(self):
  382. class SetDecoder(json.JSONDecoder):
  383. def __init__(self, *args, **kwargs):
  384. super().__init__(object_hook=self.object_hook, *args, **kwargs)
  385. def object_hook(self, dct):
  386. if isinstance(dct, dict):
  387. for keyword, node_idxs in dct.items():
  388. if isinstance(node_idxs, list):
  389. dct[keyword] = set(node_idxs)
  390. return dct
  391. return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
  392. class Embedding(db.Model):
  393. __tablename__ = 'embeddings'
  394. __table_args__ = (
  395. db.PrimaryKeyConstraint('id', name='embedding_pkey'),
  396. db.UniqueConstraint('model_name', 'hash', name='embedding_hash_idx')
  397. )
  398. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  399. model_name = db.Column(db.String(40), nullable=False,
  400. server_default=db.text("'text-embedding-ada-002'::character varying"))
  401. hash = db.Column(db.String(64), nullable=False)
  402. embedding = db.Column(db.LargeBinary, nullable=False)
  403. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
  404. def set_embedding(self, embedding_data: list[float]):
  405. self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
  406. def get_embedding(self) -> list[float]:
  407. return pickle.loads(self.embedding)
  408. class DatasetCollectionBinding(db.Model):
  409. __tablename__ = 'dataset_collection_bindings'
  410. __table_args__ = (
  411. db.PrimaryKeyConstraint('id', name='dataset_collection_bindings_pkey'),
  412. db.Index('provider_model_name_idx', 'provider_name', 'model_name')
  413. )
  414. id = db.Column(UUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  415. provider_name = db.Column(db.String(40), nullable=False)
  416. model_name = db.Column(db.String(40), nullable=False)
  417. type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
  418. collection_name = db.Column(db.String(64), nullable=False)
  419. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))