create_document_index.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import datetime
  2. import logging
  3. import time
  4. import click
  5. from werkzeug.exceptions import NotFound
  6. from core.indexing_runner import DocumentIsPausedException, IndexingRunner
  7. from events.event_handlers.document_index_event import document_index_created
  8. from extensions.ext_database import db
  9. from models.dataset import Document
  10. @document_index_created.connect
  11. def handle(sender, **kwargs):
  12. dataset_id = sender
  13. document_ids = kwargs.get('document_ids', None)
  14. documents = []
  15. start_at = time.perf_counter()
  16. for document_id in document_ids:
  17. logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))
  18. document = db.session.query(Document).filter(
  19. Document.id == document_id,
  20. Document.dataset_id == dataset_id
  21. ).first()
  22. if not document:
  23. raise NotFound('Document not found')
  24. document.indexing_status = 'parsing'
  25. document.processing_started_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
  26. documents.append(document)
  27. db.session.add(document)
  28. db.session.commit()
  29. try:
  30. indexing_runner = IndexingRunner()
  31. indexing_runner.run(documents)
  32. end_at = time.perf_counter()
  33. logging.info(click.style('Processed dataset: {} latency: {}'.format(dataset_id, end_at - start_at), fg='green'))
  34. except DocumentIsPausedException as ex:
  35. logging.info(click.style(str(ex), fg='yellow'))
  36. except Exception:
  37. pass