datasets_document.py

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc
from transformers.hf_argparser import string_to_bool
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models.dataset import Dataset, DatasetProcessRule, Document, DocumentSegment
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task

class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound('Document not found.')

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound('Documents not found.')

        return documents
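
# Note: DocumentResource (above) centralizes the dataset lookup, permission
# check, and tenant check; the endpoint classes below that extend it reuse
# get_document / get_batch_documents instead of repeating those checks.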

class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args
        document_id = req_data.get('document_id')

        # get default rules
        mode = DocumentService.DEFAULT_RULES['mode']
        rules = DocumentService.DEFAULT_RULES['rules']

        if document_id:
            document = Document.query.get_or_404(document_id)
            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound('Dataset not found.')

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = db.session.query(DatasetProcessRule). \
                filter(DatasetProcessRule.dataset_id == document.dataset_id). \
                order_by(DatasetProcessRule.created_at.desc()). \
                limit(1). \
                one_or_none()
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {
            'mode': mode,
            'rules': rules
        }
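
# Illustrative request for GetProcessRuleApi (path relative to the console API
# blueprint mount; the route is registered at the bottom of this file):
#   GET /datasets/process-rule?document_id=<uuid>
# Returns {"mode": ..., "rules": ...}, falling back to
# DocumentService.DEFAULT_RULES when no document_id is given.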

class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get('page', default=1, type=int)
        limit = request.args.get('limit', default=20, type=int)
        search = request.args.get('keyword', default=None, type=str)
        sort = request.args.get('sort', default='-created_at', type=str)
  121. # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
  122. try:
  123. fetch = string_to_bool(request.args.get('fetch', default='false'))
  124. except (ArgumentTypeError, ValueError, Exception) as e:
  125. fetch = False
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(
            dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f'%{search}%'
            query = query.filter(Document.name.like(search))

        if sort.startswith('-'):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == 'hit_count':
            sub_query = db.select(DocumentSegment.document_id,
                                  db.func.sum(DocumentSegment.hit_count).label("total_hit_count")) \
                .group_by(DocumentSegment.document_id) \
                .subquery()

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id) \
                .order_by(sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)))
        elif sort == 'created_at':
            query = query.order_by(sort_logic(Document.created_at))
        else:
            query = query.order_by(desc(Document.created_at))

        paginated_documents = query.paginate(
            page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items

        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                                  DocumentSegment.document_id == str(document.id),
                                                                  DocumentSegment.status != 're_segment').count()
                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)

        response = {
            'data': data,
            'has_more': len(documents) == limit,
            'limit': limit,
            'total': paginated_documents.total,
            'page': page
        }

        return response
    documents_and_batch_fields = {
        'documents': fields.List(fields.Nested(document_fields)),
        'batch': fields.String
    }

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        # The current user's role in the tenant must be admin, owner, or editor.
        if not current_user.is_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False,
                            location='json')
        parser.add_argument('data_source', type=dict, required=False, location='json')
        parser.add_argument('process_rule', type=dict, required=False, location='json')
        parser.add_argument('duplicate', type=bool, default=True, nullable=False, location='json')
        parser.add_argument('original_document_id', type=str, required=False, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if not dataset.indexing_technique and not args['indexing_technique']:
            raise ValueError('indexing_technique is required.')

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {
            'documents': documents,
            'batch': batch
        }
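
# Illustrative usage of DatasetDocumentListApi (paths relative to the console
# API blueprint mount):
#   GET  /datasets/<uuid:dataset_id>/documents?page=1&limit=20&sort=-created_at&fetch=true
#   POST /datasets/<uuid:dataset_id>/documents
#        {"indexing_technique": "high_quality", "data_source": {...},
#         "process_rule": {...}, "duplicate": true}
# The data_source and process_rule shapes are validated server-side by
# DocumentService.document_create_args_validate.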

class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self):
        # The current user's role in the tenant must be admin, owner, or editor.
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, required=True,
                            nullable=False, location='json')
        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if args['indexing_technique'] == 'high_quality':
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id,
                document_data=args,
                account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {
            'dataset': dataset,
            'documents': documents,
            'batch': batch
        }

        return response
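
# Illustrative request for DatasetInitApi (creates a dataset and its first
# documents in one call):
#   POST /datasets/init
#   {"indexing_technique": "high_quality", "data_source": {...},
#    "process_rule": {...}, "doc_form": "text_model", "doc_language": "English"}
# With 'high_quality', a default text-embedding model must be configured first.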

class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in ['completed', 'error']:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }

        if document.data_source_type == 'upload_file':
            data_source_info = document.data_source_info_dict
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']

                file = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == document.tenant_id,
                    UploadFile.id == file_id
                ).first()

                # raise error if file not found
                if not file:
                    raise NotFound('File not found.')

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file,
                    document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(current_user.current_tenant_id, [extract_setting],
                                                                 data_process_rule_dict, document.doc_form,
                                                                 'English', dataset_id)
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in Settings -> Model Provider.")
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)

        return response

class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }
        if not documents:
            return response

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in ['completed', 'error']:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']
                info_list.append(file_id)
            # format document notion info
            elif data_source_info and 'notion_workspace_id' in data_source_info \
                    and 'notion_page_id' in data_source_info:
                pages = []
                page = {
                    'page_id': data_source_info['notion_page_id'],
                    'type': data_source_info['type']
                }
                pages.append(page)
                notion_info = {
                    'workspace_id': data_source_info['notion_workspace_id'],
                    'pages': pages
                }
                info_list.append(notion_info)

            if document.data_source_type == 'upload_file':
                file_id = data_source_info['upload_file_id']
                file_detail = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == current_user.current_tenant_id,
                    UploadFile.id == file_id
                ).first()

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file_detail,
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == 'notion_import':
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info['notion_workspace_id'],
                        "notion_obj_id": data_source_info['notion_page_id'],
                        "notion_page_type": data_source_info['type'],
                        "tenant_id": current_user.current_tenant_id
                    },
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError('Data source type is not supported.')

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(current_user.current_tenant_id, extract_settings,
                                                         data_process_rule_dict, document.doc_form,
                                                         'English', dataset_id)
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in Settings -> Model Provider.")
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)

        return response
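
# Both estimate endpoints return the same shape: tokens, total_price, currency,
# total_segments, and a segment preview. Illustrative request:
#   GET /datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate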

class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                              DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                          DocumentSegment.status != 're_segment').count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = 'paused'
            documents_status.append(marshal(document, document_status_fields))
        data = {
            'data': documents_status
        }
        return data

class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query \
            .filter(DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()
        total_segments = DocumentSegment.query \
            .filter(DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = 'paused'
        return marshal(document, document_status_fields)
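
# Illustrative polling flow: after creating documents, poll
#   GET /datasets/<dataset_id>/batch/<batch>/indexing-status
# or
#   GET /datasets/<dataset_id>/documents/<document_id>/indexing-status
# until indexing_status reaches 'completed' (or 'error'); completed_segments
# and total_segments give coarse progress.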

class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {'all', 'only', 'without'}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get('metadata', 'all')
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f'Invalid metadata value: {metadata}')

        if metadata == 'only':
            response = {
                'id': document.id,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata
            }
        elif metadata == 'without':
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }

        return response, 200
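
# Illustrative request:
#   GET /datasets/<dataset_id>/documents/<document_id>?metadata=only
# The 'metadata' query parameter accepts 'all' (default), 'only', or 'without'.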

class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The current user's role in the tenant must be admin, owner, or editor.
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError('Document not in indexing state.')

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in ["paused", "error"]:
                raise InvalidActionError('Document not in paused or error state.')

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {'result': 'success'}, 200
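
# Illustrative requests:
#   PATCH /datasets/<dataset_id>/documents/<document_id>/processing/pause
#   PATCH /datasets/<dataset_id>/documents/<document_id>/processing/resume
# 'pause' only applies while indexing_status == 'indexing'; 'resume' only while
# it is 'paused' or 'error'.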

class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot delete document during indexing.')

        return {'result': 'success'}, 204

class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()
        doc_type = req_data.get('doc_type')
        doc_metadata = req_data.get('doc_metadata')

        # The current user's role in the tenant must be admin, owner, or editor.
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError('Both doc_type and doc_metadata must be provided.')

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError('Invalid doc_type.')

        if not isinstance(doc_metadata, dict):
            raise ValueError('doc_metadata must be a dictionary.')

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == 'others':
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {'result': 'success', 'message': 'Document metadata updated.'}, 200
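
# Illustrative request (the "author" key is a hypothetical example):
#   PUT /datasets/<dataset_id>/documents/<document_id>/metadata
#   {"doc_type": "others", "doc_metadata": {"author": "..."}}
# doc_type must be a key of DocumentService.DOCUMENT_METADATA_SCHEMA; for any
# type other than 'others', only keys declared in that schema are kept.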

class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check('vector_space')
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        # The current user's role in the tenant must be admin, owner, or editor.
        if not current_user.is_editor:
            raise Forbidden()

        indexing_cache_key = 'document_{}_indexing'.format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError('Document already enabled.')

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "disable":
            if not document.completed_at or document.indexing_status != 'completed':
                raise InvalidActionError('Document is not completed.')
            if not document.enabled:
                raise InvalidActionError('Document already disabled.')

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "archive":
            if document.archived:
                raise InvalidActionError('Document already archived.')

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError('Document is not archived.')

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        else:
            raise InvalidActionError()
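
# Illustrative requests:
#   PATCH /datasets/<dataset_id>/documents/<document_id>/status/enable
#   PATCH /datasets/<dataset_id>/documents/<document_id>/status/disable
#   PATCH /datasets/<dataset_id>/documents/<document_id>/status/archive
#   PATCH /datasets/<dataset_id>/documents/<document_id>/status/un_archive
# Each action sets a 600-second Redis key ('document_<id>_indexing') so the
# async (re)indexing task is not queued twice for the same document.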

class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot pause completed document.')

        return {'result': 'success'}, 204

class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Document is not in paused status.')

        return {'result': 'success'}, 204

class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument('document_ids', type=list, required=True, nullable=False,
                            location='json')
        args = parser.parse_args()

        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound('Dataset not found.')

        for document_id in args['document_ids']:
            try:
                document_id = str(document_id)
                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == 'completed':
                    raise DocumentAlreadyFinishedError()

                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue

        # retry documents
        DocumentService.retry_document(dataset_id, retry_documents)

        return {'result': 'success'}, 204
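
# Illustrative request:
#   POST /datasets/<dataset_id>/retry
#   {"document_ids": ["<uuid>", "<uuid>"]}
# Documents that are missing, archived, or already completed are skipped (and
# logged); the remaining ones are re-queued for indexing.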

class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The current user's role in the tenant must be admin or owner.
        if not current_user.is_admin_or_owner:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('name', type=str, required=True, nullable=False, location='json')
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args['name'])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot rename document during indexing.')

        return document

api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
api.add_resource(DatasetDocumentListApi,
                 '/datasets/<uuid:dataset_id>/documents')
api.add_resource(DatasetInitApi,
                 '/datasets/init')
api.add_resource(DocumentIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
api.add_resource(DocumentBatchIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
api.add_resource(DocumentBatchIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
api.add_resource(DocumentIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
api.add_resource(DocumentDetailApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentProcessingApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>')
api.add_resource(DocumentDeleteApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentMetadataApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata')
api.add_resource(DocumentStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>')
api.add_resource(DocumentPauseApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause')
api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume')
api.add_resource(DocumentRetryApi, '/datasets/<uuid:dataset_id>/retry')
api.add_resource(DocumentRenameApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename')
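
# Illustrative end-to-end flow (a hypothetical client-side sketch, assuming the
# console API is mounted at /console/api and `headers` carries a valid session;
# neither assumption is defined in this file):
#
#   import requests
#   base = 'https://your-host/console/api'
#   r = requests.post(f'{base}/datasets/init', headers=headers, json={
#       'indexing_technique': 'high_quality',
#       'data_source': {...},            # shape validated server-side
#       'process_rule': {...},
#   })
#   dataset_id = r.json()['dataset']['id']
#   batch = r.json()['batch']
#   status = requests.get(
#       f'{base}/datasets/{dataset_id}/batch/{batch}/indexing-status',
#       headers=headers,
#   ).json()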