file.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. import datetime
  2. import hashlib
  3. import tempfile
  4. import chardet
  5. import time
  6. import uuid
  7. from pathlib import Path
  8. from cachetools import TTLCache
  9. from flask import request, current_app
  10. from flask_login import current_user
  11. from core.login.login import login_required
  12. from flask_restful import Resource, marshal_with, fields
  13. from werkzeug.exceptions import NotFound
  14. from controllers.console import api
  15. from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError, FileTooLargeError, \
  16. UnsupportedFileTypeError
  17. from controllers.console.setup import setup_required
  18. from controllers.console.wraps import account_initialization_required
  19. from core.data_loader.file_extractor import FileExtractor
  20. from extensions.ext_storage import storage
  21. from libs.helper import TimestampField
  22. from extensions.ext_database import db
  23. from models.model import UploadFile
  24. cache = TTLCache(maxsize=None, ttl=30)
  25. ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx']
  26. PREVIEW_WORDS_LIMIT = 3000
  27. class FileApi(Resource):
  28. upload_config_fields = {
  29. 'file_size_limit': fields.Integer,
  30. 'batch_count_limit': fields.Integer
  31. }
  32. @setup_required
  33. @login_required
  34. @account_initialization_required
  35. @marshal_with(upload_config_fields)
  36. def get(self):
  37. file_size_limit = current_app.config.get("UPLOAD_FILE_SIZE_LIMIT")
  38. batch_count_limit = current_app.config.get("UPLOAD_FILE_BATCH_LIMIT")
  39. return {
  40. 'file_size_limit': file_size_limit,
  41. 'batch_count_limit': batch_count_limit
  42. }, 200
  43. file_fields = {
  44. 'id': fields.String,
  45. 'name': fields.String,
  46. 'size': fields.Integer,
  47. 'extension': fields.String,
  48. 'mime_type': fields.String,
  49. 'created_by': fields.String,
  50. 'created_at': TimestampField,
  51. }
  52. @setup_required
  53. @login_required
  54. @account_initialization_required
  55. @marshal_with(file_fields)
  56. def post(self):
  57. # get file from request
  58. file = request.files['file']
  59. # check file
  60. if 'file' not in request.files:
  61. raise NoFileUploadedError()
  62. if len(request.files) > 1:
  63. raise TooManyFilesError()
  64. file_content = file.read()
  65. file_size = len(file_content)
  66. file_size_limit = current_app.config.get("UPLOAD_FILE_SIZE_LIMIT") * 1024 * 1024
  67. if file_size > file_size_limit:
  68. message = "({file_size} > {file_size_limit})"
  69. raise FileTooLargeError(message)
  70. extension = file.filename.split('.')[-1]
  71. if extension.lower() not in ALLOWED_EXTENSIONS:
  72. raise UnsupportedFileTypeError()
  73. # user uuid as file name
  74. file_uuid = str(uuid.uuid4())
  75. file_key = 'upload_files/' + current_user.current_tenant_id + '/' + file_uuid + '.' + extension
  76. # save file to storage
  77. storage.save(file_key, file_content)
  78. # save file to db
  79. config = current_app.config
  80. upload_file = UploadFile(
  81. tenant_id=current_user.current_tenant_id,
  82. storage_type=config['STORAGE_TYPE'],
  83. key=file_key,
  84. name=file.filename,
  85. size=file_size,
  86. extension=extension,
  87. mime_type=file.mimetype,
  88. created_by=current_user.id,
  89. created_at=datetime.datetime.utcnow(),
  90. used=False,
  91. hash=hashlib.sha3_256(file_content).hexdigest()
  92. )
  93. db.session.add(upload_file)
  94. db.session.commit()
  95. return upload_file, 201
  96. class FilePreviewApi(Resource):
  97. @setup_required
  98. @login_required
  99. @account_initialization_required
  100. def get(self, file_id):
  101. file_id = str(file_id)
  102. key = file_id + request.path
  103. cached_response = cache.get(key)
  104. if cached_response and time.time() - cached_response['timestamp'] < cache.ttl:
  105. return cached_response['response']
  106. upload_file = db.session.query(UploadFile) \
  107. .filter(UploadFile.id == file_id) \
  108. .first()
  109. if not upload_file:
  110. raise NotFound("File not found")
  111. # extract text from file
  112. extension = upload_file.extension
  113. if extension.lower() not in ALLOWED_EXTENSIONS:
  114. raise UnsupportedFileTypeError()
  115. text = FileExtractor.load(upload_file, return_text=True)
  116. text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
  117. return {'content': text}
  118. api.add_resource(FileApi, '/files/upload')
  119. api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')