file.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import datetime
  2. import hashlib
  3. import tempfile
  4. import chardet
  5. import time
  6. import uuid
  7. from pathlib import Path
  8. from cachetools import TTLCache
  9. from flask import request, current_app
  10. from flask_login import login_required, current_user
  11. from flask_restful import Resource, marshal_with, fields
  12. from werkzeug.exceptions import NotFound
  13. from controllers.console import api
  14. from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError, FileTooLargeError, \
  15. UnsupportedFileTypeError
  16. from controllers.console.setup import setup_required
  17. from controllers.console.wraps import account_initialization_required
  18. from core.data_loader.file_extractor import FileExtractor
  19. from extensions.ext_storage import storage
  20. from libs.helper import TimestampField
  21. from extensions.ext_database import db
  22. from models.model import UploadFile
  23. cache = TTLCache(maxsize=None, ttl=30)
  24. ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx']
  25. PREVIEW_WORDS_LIMIT = 3000
  26. class FileApi(Resource):
  27. upload_config_fields = {
  28. 'file_size_limit': fields.Integer,
  29. 'batch_count_limit': fields.Integer
  30. }
  31. @setup_required
  32. @login_required
  33. @account_initialization_required
  34. @marshal_with(upload_config_fields)
  35. def get(self):
  36. file_size_limit = current_app.config.get("UPLOAD_FILE_SIZE_LIMIT")
  37. batch_count_limit = current_app.config.get("UPLOAD_FILE_BATCH_LIMIT")
  38. return {
  39. 'file_size_limit': file_size_limit,
  40. 'batch_count_limit': batch_count_limit
  41. }, 200
  42. file_fields = {
  43. 'id': fields.String,
  44. 'name': fields.String,
  45. 'size': fields.Integer,
  46. 'extension': fields.String,
  47. 'mime_type': fields.String,
  48. 'created_by': fields.String,
  49. 'created_at': TimestampField,
  50. }
  51. @setup_required
  52. @login_required
  53. @account_initialization_required
  54. @marshal_with(file_fields)
  55. def post(self):
  56. # get file from request
  57. file = request.files['file']
  58. # check file
  59. if 'file' not in request.files:
  60. raise NoFileUploadedError()
  61. if len(request.files) > 1:
  62. raise TooManyFilesError()
  63. file_content = file.read()
  64. file_size = len(file_content)
  65. file_size_limit = current_app.config.get("UPLOAD_FILE_SIZE_LIMIT") * 1024 * 1024
  66. if file_size > file_size_limit:
  67. message = "({file_size} > {file_size_limit})"
  68. raise FileTooLargeError(message)
  69. extension = file.filename.split('.')[-1]
  70. if extension not in ALLOWED_EXTENSIONS:
  71. raise UnsupportedFileTypeError()
  72. # user uuid as file name
  73. file_uuid = str(uuid.uuid4())
  74. file_key = 'upload_files/' + current_user.current_tenant_id + '/' + file_uuid + '.' + extension
  75. # save file to storage
  76. storage.save(file_key, file_content)
  77. # save file to db
  78. config = current_app.config
  79. upload_file = UploadFile(
  80. tenant_id=current_user.current_tenant_id,
  81. storage_type=config['STORAGE_TYPE'],
  82. key=file_key,
  83. name=file.filename,
  84. size=file_size,
  85. extension=extension,
  86. mime_type=file.mimetype,
  87. created_by=current_user.id,
  88. created_at=datetime.datetime.utcnow(),
  89. used=False,
  90. hash=hashlib.sha3_256(file_content).hexdigest()
  91. )
  92. db.session.add(upload_file)
  93. db.session.commit()
  94. return upload_file, 201
  95. class FilePreviewApi(Resource):
  96. @setup_required
  97. @login_required
  98. @account_initialization_required
  99. def get(self, file_id):
  100. file_id = str(file_id)
  101. key = file_id + request.path
  102. cached_response = cache.get(key)
  103. if cached_response and time.time() - cached_response['timestamp'] < cache.ttl:
  104. return cached_response['response']
  105. upload_file = db.session.query(UploadFile) \
  106. .filter(UploadFile.id == file_id) \
  107. .first()
  108. if not upload_file:
  109. raise NotFound("File not found")
  110. # extract text from file
  111. extension = upload_file.extension
  112. if extension not in ALLOWED_EXTENSIONS:
  113. raise UnsupportedFileTypeError()
  114. text = FileExtractor.load(upload_file, return_text=True)
  115. text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
  116. return {'content': text}
  117. api.add_resource(FileApi, '/files/upload')
  118. api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')