Browse Source

Feat/chat support voice input (#532)

zxhlyh 1 year ago
parent
commit
a03a92e9db
70 changed files with 1339 additions and 25 deletions
  1. 2 0
      .gitignore
  2. 2 2
      api/controllers/console/__init__.py
  3. 3 0
      api/controllers/console/app/app.py
  4. 69 0
      api/controllers/console/app/audio.py
  5. 24 0
      api/controllers/console/app/error.py
  6. 1 0
      api/controllers/console/app/model_config.py
  7. 66 0
      api/controllers/console/explore/audio.py
  8. 2 0
      api/controllers/console/explore/parameter.py
  9. 1 1
      api/controllers/service_api/__init__.py
  10. 2 0
      api/controllers/service_api/app/app.py
  11. 61 0
      api/controllers/service_api/app/audio.py
  12. 24 0
      api/controllers/service_api/app/error.py
  13. 1 1
      api/controllers/web/__init__.py
  14. 2 0
      api/controllers/web/app.py
  15. 63 0
      api/controllers/web/audio.py
  16. 24 0
      api/controllers/web/error.py
  17. 25 0
      api/core/llm/whisper.py
  18. 32 0
      api/migrations/versions/a5b56fb053ef_app_config_add_speech_to_text.py
  19. 10 0
      api/models/model.py
  20. 22 0
      api/services/app_model_config_service.py
  21. 43 0
      api/services/audio_service.py
  22. 1 1
      api/services/errors/__init__.py
  23. 23 0
      api/services/errors/audio.py
  24. 44 0
      web/app/components/app/chat/index.tsx
  25. 1 1
      web/app/components/app/chat/style.module.css
  26. 20 0
      web/app/components/app/configuration/config/feature/choose-feature/feature-item/preview-imgs/speech-to-text.svg
  27. 4 0
      web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css
  28. 16 0
      web/app/components/app/configuration/config/feature/choose-feature/index.tsx
  29. 14 8
      web/app/components/app/configuration/config/feature/use-feature.tsx
  30. 15 1
      web/app/components/app/configuration/config/index.tsx
  31. 4 0
      web/app/components/app/configuration/debug/index.tsx
  32. 16 6
      web/app/components/app/configuration/features/chat-group/index.tsx
  33. 25 0
      web/app/components/app/configuration/features/chat-group/speech-to-text/index.tsx
  34. 15 0
      web/app/components/app/configuration/index.tsx
  35. 10 0
      web/app/components/base/icons/assets/vender/line/general/loading-02.svg
  36. 5 0
      web/app/components/base/icons/assets/vender/line/general/x-close.svg
  37. 5 0
      web/app/components/base/icons/assets/vender/line/mediaAndDevices/microphone-01.svg
  38. 3 0
      web/app/components/base/icons/assets/vender/solid/general/x-circle.svg
  39. 8 0
      web/app/components/base/icons/assets/vender/solid/mediaAndDevices/microphone-01.svg
  40. 5 0
      web/app/components/base/icons/assets/vender/solid/mediaAndDevices/stop-circle.svg
  41. 64 0
      web/app/components/base/icons/src/vender/line/general/Loading02.json
  42. 14 0
      web/app/components/base/icons/src/vender/line/general/Loading02.tsx
  43. 39 0
      web/app/components/base/icons/src/vender/line/general/XClose.json
  44. 14 0
      web/app/components/base/icons/src/vender/line/general/XClose.tsx
  45. 2 0
      web/app/components/base/icons/src/vender/line/general/index.ts
  46. 39 0
      web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.json
  47. 14 0
      web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.tsx
  48. 1 0
      web/app/components/base/icons/src/vender/line/mediaAndDevices/index.ts
  49. 29 0
      web/app/components/base/icons/src/vender/solid/general/XCircle.json
  50. 14 0
      web/app/components/base/icons/src/vender/solid/general/XCircle.tsx
  51. 1 0
      web/app/components/base/icons/src/vender/solid/general/index.ts
  52. 55 0
      web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.json
  53. 14 0
      web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.tsx
  54. 38 0
      web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.json
  55. 14 0
      web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.tsx
  56. 2 0
      web/app/components/base/icons/src/vender/solid/mediaAndDevices/index.ts
  57. 10 0
      web/app/components/base/voice-input/index.module.css
  58. 197 0
      web/app/components/base/voice-input/index.tsx
  59. 1 1
      web/app/components/explore/installed-app/index.tsx
  60. 4 1
      web/app/components/share/chat/index.tsx
  61. 7 1
      web/context/debug-configuration.ts
  62. 5 0
      web/i18n/lang/app-debug.en.ts
  63. 5 0
      web/i18n/lang/app-debug.zh.ts
  64. 5 0
      web/i18n/lang/common.en.ts
  65. 5 0
      web/i18n/lang/common.zh.ts
  66. 5 0
      web/models/debug.ts
  67. 3 0
      web/package.json
  68. 18 1
      web/service/base.ts
  69. 4 0
      web/service/share.ts
  70. 3 0
      web/types/app.ts

+ 2 - 0
.gitignore

@@ -147,3 +147,5 @@ docker/volumes/weaviate/*
 sdks/python-client/build
 sdks/python-client/dist
 sdks/python-client/dify_client.egg-info
+
+.vscode/

+ 2 - 2
api/controllers/console/__init__.py

@@ -9,7 +9,7 @@ api = ExternalApi(bp)
 from . import setup, version, apikey, admin
 
 # Import app controllers
-from .app import app, site, completion, model_config, statistic, conversation, message, generator
+from .app import app, site, completion, model_config, statistic, conversation, message, generator, audio
 
 # Import auth controllers
 from .auth import login, oauth, data_source_oauth
@@ -21,4 +21,4 @@ from .datasets import datasets, datasets_document, datasets_segments, file, hit_
 from .workspace import workspace, members, providers, account
 
 # Import explore controllers
-from .explore import installed_app, recommended_app, completion, conversation, message, parameter, saved_message
+from .explore import installed_app, recommended_app, completion, conversation, message, parameter, saved_message, audio

+ 3 - 0
api/controllers/console/app/app.py

@@ -22,6 +22,7 @@ model_config_fields = {
     'opening_statement': fields.String,
     'suggested_questions': fields.Raw(attribute='suggested_questions_list'),
     'suggested_questions_after_answer': fields.Raw(attribute='suggested_questions_after_answer_dict'),
+    'speech_to_text': fields.Raw(attribute='speech_to_text_dict'),
     'more_like_this': fields.Raw(attribute='more_like_this_dict'),
     'model': fields.Raw(attribute='model_dict'),
     'user_input_form': fields.Raw(attribute='user_input_form_list'),
@@ -144,6 +145,7 @@ class AppListApi(Resource):
                 opening_statement=model_configuration['opening_statement'],
                 suggested_questions=json.dumps(model_configuration['suggested_questions']),
                 suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']),
+                speech_to_text=json.dumps(model_configuration['speech_to_text']),
                 more_like_this=json.dumps(model_configuration['more_like_this']),
                 model=json.dumps(model_configuration['model']),
                 user_input_form=json.dumps(model_configuration['user_input_form']),
@@ -434,6 +436,7 @@ class AppCopy(Resource):
             opening_statement=app_config.opening_statement,
             suggested_questions=app_config.suggested_questions,
             suggested_questions_after_answer=app_config.suggested_questions_after_answer,
+            speech_to_text=app_config.speech_to_text,
             more_like_this=app_config.more_like_this,
             model=app_config.model,
             user_input_form=app_config.user_input_form,

+ 69 - 0
api/controllers/console/app/audio.py

@@ -0,0 +1,69 @@
+# -*- coding:utf-8 -*-
+import logging
+
+from flask import request
+from flask_login import login_required
+from werkzeug.exceptions import InternalServerError, NotFound
+
+import services
+from controllers.console import api
+from controllers.console.app import _get_app
+from controllers.console.app.error import AppUnavailableError, \
+    ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \
+    ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \
+    UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
+from controllers.console.setup import setup_required
+from controllers.console.wraps import account_initialization_required
+from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
+    LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
+from flask_restful import Resource
+from services.audio_service import AudioService
+from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
+    UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
+
+
+class ChatMessageAudioApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self, app_id):
+        app_id = str(app_id)
+        app_model = _get_app(app_id, 'chat')
+
+        file = request.files['file']
+
+        try:
+            response = AudioService.transcript(
+                tenant_id=app_model.tenant_id,
+                file=file,
+            )
+
+            return response
+        except services.errors.app_model_config.AppModelConfigBrokenError:
+            logging.exception("App model config broken.")
+            raise AppUnavailableError()
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError:
+            raise ProviderNotInitializeError()
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
+                LLMRateLimitError, LLMAuthorizationError) as e:
+            raise CompletionRequestError(str(e))
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception("internal server error.")
+            raise InternalServerError()
+        
+
+api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')

+ 24 - 0
api/controllers/console/app/error.py

@@ -49,3 +49,27 @@ class AppMoreLikeThisDisabledError(BaseHTTPException):
     error_code = 'app_more_like_this_disabled'
     description = "The 'More like this' feature is disabled. Please refresh your page."
     code = 403
+
+
+class NoAudioUploadedError(BaseHTTPException):  # HTTP-layer counterpart of NoAudioUploadedServiceError (request carried no audio file)
+    error_code = 'no_audio_uploaded'
+    description = "Please upload your audio."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException
+
+
+class AudioTooLargeError(BaseHTTPException):  # HTTP-layer counterpart of AudioTooLargeServiceError (upload exceeds the size limit)
+    error_code = 'audio_too_large'
+    description = "Audio size exceeded. {message}"  # NOTE(review): '{message}' is never formatted in the visible code; raisers pass str(e) to the constructor — confirm how BaseHTTPException uses it
+    code = 413  # matches HTTP 413 Payload Too Large — presumably surfaced as the status; confirm
+
+
+class UnsupportedAudioTypeError(BaseHTTPException):  # HTTP-layer counterpart of UnsupportedAudioTypeServiceError (mimetype not in the allowed audio list)
+    error_code = 'unsupported_audio_type'
+    description = "Audio type not allowed."
+    code = 415  # matches HTTP 415 Unsupported Media Type — presumably surfaced as the status; confirm
+
+
+class ProviderNotSupportSpeechToTextError(BaseHTTPException):  # HTTP-layer counterpart of ProviderNotSupportSpeechToTextServiceError (tenant's default provider is not OpenAI)
+    error_code = 'provider_not_support_speech_to_text'
+    description = "Provider not support speech to text."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException

+ 1 - 0
api/controllers/console/app/model_config.py

@@ -41,6 +41,7 @@ class ModelConfigResource(Resource):
             opening_statement=model_configuration['opening_statement'],
             suggested_questions=json.dumps(model_configuration['suggested_questions']),
             suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']),
+            speech_to_text=json.dumps(model_configuration['speech_to_text']),
             more_like_this=json.dumps(model_configuration['more_like_this']),
             model=json.dumps(model_configuration['model']),
             user_input_form=json.dumps(model_configuration['user_input_form']),

+ 66 - 0
api/controllers/console/explore/audio.py

@@ -0,0 +1,66 @@
+# -*- coding:utf-8 -*-
+import logging
+
+from flask import request
+from werkzeug.exceptions import InternalServerError
+
+import services
+from controllers.console import api
+from controllers.console.app.error import AppUnavailableError, ProviderNotInitializeError, \
+    ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, CompletionRequestError, \
+    NoAudioUploadedError, AudioTooLargeError, \
+    UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
+from controllers.console.explore.wraps import InstalledAppResource
+from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
+    LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
+from services.audio_service import AudioService
+from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
+    UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
+from models.model import AppModelConfig
+
+
+class ChatAudioApi(InstalledAppResource):
+    def post(self, installed_app):
+        app_model = installed_app.app
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.speech_to_text_dict['enabled']:
+            raise AppUnavailableError()
+
+        file = request.files['file']
+
+        try:
+            response = AudioService.transcript(
+                tenant_id=app_model.tenant_id,
+                file=file,
+            )
+
+            return response
+        except services.errors.app_model_config.AppModelConfigBrokenError:
+            logging.exception("App model config broken.")
+            raise AppUnavailableError()
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError:
+            raise ProviderNotInitializeError()
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
+                LLMRateLimitError, LLMAuthorizationError) as e:
+            raise CompletionRequestError(str(e))
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception("internal server error.")
+            raise InternalServerError()
+        
+
+api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')

+ 2 - 0
api/controllers/console/explore/parameter.py

@@ -21,6 +21,7 @@ class AppParameterApi(InstalledAppResource):
         'opening_statement': fields.String,
         'suggested_questions': fields.Raw,
         'suggested_questions_after_answer': fields.Raw,
+        'speech_to_text': fields.Raw,
         'more_like_this': fields.Raw,
         'user_input_form': fields.Raw,
     }
@@ -35,6 +36,7 @@ class AppParameterApi(InstalledAppResource):
             'opening_statement': app_model_config.opening_statement,
             'suggested_questions': app_model_config.suggested_questions_list,
             'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
+            'speech_to_text': app_model_config.speech_to_text_dict,
             'more_like_this': app_model_config.more_like_this_dict,
             'user_input_form': app_model_config.user_input_form_list
         }

+ 1 - 1
api/controllers/service_api/__init__.py

@@ -7,6 +7,6 @@ bp = Blueprint('service_api', __name__, url_prefix='/v1')
 api = ExternalApi(bp)
 
 
-from .app import completion, app, conversation, message
+from .app import completion, app, conversation, message, audio
 
 from .dataset import document

+ 2 - 0
api/controllers/service_api/app/app.py

@@ -22,6 +22,7 @@ class AppParameterApi(AppApiResource):
         'opening_statement': fields.String,
         'suggested_questions': fields.Raw,
         'suggested_questions_after_answer': fields.Raw,
+        'speech_to_text': fields.Raw,
         'more_like_this': fields.Raw,
         'user_input_form': fields.Raw,
     }
@@ -35,6 +36,7 @@ class AppParameterApi(AppApiResource):
             'opening_statement': app_model_config.opening_statement,
             'suggested_questions': app_model_config.suggested_questions_list,
             'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
+            'speech_to_text': app_model_config.speech_to_text_dict,
             'more_like_this': app_model_config.more_like_this_dict,
             'user_input_form': app_model_config.user_input_form_list
         }

+ 61 - 0
api/controllers/service_api/app/audio.py

@@ -0,0 +1,61 @@
+import logging
+
+from flask import request
+from werkzeug.exceptions import InternalServerError
+
+import services
+from controllers.service_api import api
+from controllers.service_api.app.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \
+    ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, UnsupportedAudioTypeError, \
+    ProviderNotSupportSpeechToTextError
+from controllers.service_api.wraps import AppApiResource
+from core.llm.error import LLMBadRequestError, LLMAuthorizationError, LLMAPIUnavailableError, LLMAPIConnectionError, \
+    LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
+from models.model import App, AppModelConfig
+from services.audio_service import AudioService
+from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
+    UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
+
+class AudioApi(AppApiResource):
+    def post(self, app_model: App, end_user):
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.speech_to_text_dict['enabled']:
+            raise AppUnavailableError() 
+
+        file = request.files['file']
+
+        try:
+            response = AudioService.transcript(
+                tenant_id=app_model.tenant_id,
+                file=file,
+            )
+
+            return response
+        except services.errors.app_model_config.AppModelConfigBrokenError:
+            logging.exception("App model config broken.")
+            raise AppUnavailableError()
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError:
+            raise ProviderNotInitializeError()
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
+                LLMRateLimitError, LLMAuthorizationError) as e:
+            raise CompletionRequestError(str(e))
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception("internal server error.")
+            raise InternalServerError()
+        
+api.add_resource(AudioApi, '/audio-to-text')

+ 24 - 0
api/controllers/service_api/app/error.py

@@ -51,3 +51,27 @@ class CompletionRequestError(BaseHTTPException):
     description = "Completion request failed."
     code = 400
 
+
+class NoAudioUploadedError(BaseHTTPException):  # HTTP-layer counterpart of NoAudioUploadedServiceError (request carried no audio file)
+    error_code = 'no_audio_uploaded'
+    description = "Please upload your audio."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException
+
+
+class AudioTooLargeError(BaseHTTPException):  # HTTP-layer counterpart of AudioTooLargeServiceError (upload exceeds the size limit)
+    error_code = 'audio_too_large'
+    description = "Audio size exceeded. {message}"  # NOTE(review): '{message}' is never formatted in the visible code; raisers pass str(e) to the constructor — confirm how BaseHTTPException uses it
+    code = 413  # matches HTTP 413 Payload Too Large — presumably surfaced as the status; confirm
+
+
+class UnsupportedAudioTypeError(BaseHTTPException):  # HTTP-layer counterpart of UnsupportedAudioTypeServiceError (mimetype not in the allowed audio list)
+    error_code = 'unsupported_audio_type'
+    description = "Audio type not allowed."
+    code = 415  # matches HTTP 415 Unsupported Media Type — presumably surfaced as the status; confirm
+
+
+class ProviderNotSupportSpeechToTextError(BaseHTTPException):  # HTTP-layer counterpart of ProviderNotSupportSpeechToTextServiceError (tenant's default provider is not OpenAI)
+    error_code = 'provider_not_support_speech_to_text'
+    description = "Provider not support speech to text."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException
+

+ 1 - 1
api/controllers/web/__init__.py

@@ -7,4 +7,4 @@ bp = Blueprint('web', __name__, url_prefix='/api')
 api = ExternalApi(bp)
 
 
-from . import completion, app, conversation, message, site, saved_message
+from . import completion, app, conversation, message, site, saved_message, audio

+ 2 - 0
api/controllers/web/app.py

@@ -21,6 +21,7 @@ class AppParameterApi(WebApiResource):
         'opening_statement': fields.String,
         'suggested_questions': fields.Raw,
         'suggested_questions_after_answer': fields.Raw,
+        'speech_to_text': fields.Raw,
         'more_like_this': fields.Raw,
         'user_input_form': fields.Raw,
     }
@@ -34,6 +35,7 @@ class AppParameterApi(WebApiResource):
             'opening_statement': app_model_config.opening_statement,
             'suggested_questions': app_model_config.suggested_questions_list,
             'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
+            'speech_to_text': app_model_config.speech_to_text_dict,
             'more_like_this': app_model_config.more_like_this_dict,
             'user_input_form': app_model_config.user_input_form_list
         }

+ 63 - 0
api/controllers/web/audio.py

@@ -0,0 +1,63 @@
+# -*- coding:utf-8 -*-
+import logging
+
+from flask import request
+from werkzeug.exceptions import InternalServerError
+
+import services
+from controllers.web import api
+from controllers.web.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, \
+    ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \
+    UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
+from controllers.web.wraps import WebApiResource
+from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
+    LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
+from services.audio_service import AudioService
+from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
+    UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
+from models.model import App, AppModelConfig
+
+
+class AudioApi(WebApiResource):
+    def post(self, app_model: App, end_user):
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.speech_to_text_dict['enabled']:
+            raise AppUnavailableError()
+
+        file = request.files['file']
+
+        try:
+            response = AudioService.transcript(
+                tenant_id=app_model.tenant_id,
+                file=file,
+            )
+
+            return response
+        except services.errors.app_model_config.AppModelConfigBrokenError:
+            logging.exception("App model config broken.")
+            raise AppUnavailableError()
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError:
+            raise ProviderNotInitializeError()
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
+                LLMRateLimitError, LLMAuthorizationError) as e:
+            raise CompletionRequestError(str(e))
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception("internal server error.")
+            raise InternalServerError()
+
+api.add_resource(AudioApi, '/audio-to-text')

+ 24 - 0
api/controllers/web/error.py

@@ -62,3 +62,27 @@ class AppSuggestedQuestionsAfterAnswerDisabledError(BaseHTTPException):
     error_code = 'app_suggested_questions_after_answer_disabled'
     description = "The 'Suggested Questions After Answer' feature is disabled. Please refresh your page."
     code = 403
+
+
+class NoAudioUploadedError(BaseHTTPException):  # HTTP-layer counterpart of NoAudioUploadedServiceError (request carried no audio file)
+    error_code = 'no_audio_uploaded'
+    description = "Please upload your audio."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException
+
+
+class AudioTooLargeError(BaseHTTPException):  # HTTP-layer counterpart of AudioTooLargeServiceError (upload exceeds the size limit)
+    error_code = 'audio_too_large'
+    description = "Audio size exceeded. {message}"  # NOTE(review): '{message}' is never formatted in the visible code; raisers pass str(e) to the constructor — confirm how BaseHTTPException uses it
+    code = 413  # matches HTTP 413 Payload Too Large — presumably surfaced as the status; confirm
+
+
+class UnsupportedAudioTypeError(BaseHTTPException):  # HTTP-layer counterpart of UnsupportedAudioTypeServiceError (mimetype not in the allowed audio list)
+    error_code = 'unsupported_audio_type'
+    description = "Audio type not allowed."
+    code = 415  # matches HTTP 415 Unsupported Media Type — presumably surfaced as the status; confirm
+
+
+class ProviderNotSupportSpeechToTextError(BaseHTTPException):  # HTTP-layer counterpart of ProviderNotSupportSpeechToTextServiceError (tenant's default provider is not OpenAI)
+    error_code = 'provider_not_support_speech_to_text'
+    description = "Provider not support speech to text."
+    code = 400  # presumably surfaced as the HTTP status — confirm in BaseHTTPException

+ 25 - 0
api/core/llm/whisper.py

@@ -0,0 +1,25 @@
+import openai
+from models.provider import ProviderName
+from core.llm.error_handle_wraps import handle_llm_exceptions
+from core.llm.provider.base import BaseProvider
+
+
+class Whisper:
+
+    def __init__(self, provider: BaseProvider):
+        self.provider = provider
+
+        if self.provider.get_provider_name() == ProviderName.OPENAI:
+            self.client = openai.Audio
+            self.credentials = provider.get_credentials()
+
+    @handle_llm_exceptions
+    def transcribe(self, file):
+        return self.client.transcribe(
+            model='whisper-1', 
+            file=file,
+            api_key=self.credentials.get('openai_api_key'),
+            api_base=self.credentials.get('openai_api_base'),
+            api_type=self.credentials.get('openai_api_type'),
+            api_version=self.credentials.get('openai_api_version'),
+        )

+ 32 - 0
api/migrations/versions/a5b56fb053ef_app_config_add_speech_to_text.py

@@ -0,0 +1,32 @@
+"""app config add speech_to_text
+
+Revision ID: a5b56fb053ef
+Revises: d3d503a3471c
+Create Date: 2023-07-06 17:55:20.894149
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'a5b56fb053ef'
+down_revision = 'd3d503a3471c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():  # adds the `speech_to_text` Text column to `app_model_configs`
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('speech_to_text', sa.Text(), nullable=True))  # nullable, so no default/backfill is supplied for existing rows
+
+    # ### end Alembic commands ###
+
+
+def downgrade():  # reverses upgrade(): drops `speech_to_text` from `app_model_configs`
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
+        batch_op.drop_column('speech_to_text')  # any stored speech-to-text settings are discarded with the column
+
+    # ### end Alembic commands ###

+ 10 - 0
api/models/model.py

@@ -81,6 +81,7 @@ class AppModelConfig(db.Model):
     opening_statement = db.Column(db.Text)
     suggested_questions = db.Column(db.Text)
     suggested_questions_after_answer = db.Column(db.Text)
+    speech_to_text = db.Column(db.Text)
     more_like_this = db.Column(db.Text)
     model = db.Column(db.Text)
     user_input_form = db.Column(db.Text)
@@ -104,6 +105,11 @@ class AppModelConfig(db.Model):
     def suggested_questions_after_answer_dict(self) -> dict:
         return json.loads(self.suggested_questions_after_answer) if self.suggested_questions_after_answer \
             else {"enabled": False}
+    
+    @property
+    def speech_to_text_dict(self) -> dict:
+        return json.loads(self.speech_to_text) if self.speech_to_text \
+            else {"enabled": False}
 
     @property
     def more_like_this_dict(self) -> dict:
@@ -223,6 +229,9 @@ class Conversation(db.Model):
                 model_config['suggested_questions_after_answer'] = override_model_configs[
                     'suggested_questions_after_answer'] \
                     if 'suggested_questions_after_answer' in override_model_configs else {"enabled": False}
+                model_config['speech_to_text'] = override_model_configs[
+                    'speech_to_text'] \
+                    if 'speech_to_text' in override_model_configs else {"enabled": False}
                 model_config['more_like_this'] = override_model_configs['more_like_this'] \
                     if 'more_like_this' in override_model_configs else {"enabled": False}
                 model_config['user_input_form'] = override_model_configs['user_input_form']
@@ -239,6 +248,7 @@ class Conversation(db.Model):
             model_config['opening_statement'] = app_model_config.opening_statement
             model_config['suggested_questions'] = app_model_config.suggested_questions_list
             model_config['suggested_questions_after_answer'] = app_model_config.suggested_questions_after_answer_dict
+            model_config['speech_to_text'] = app_model_config.speech_to_text_dict
             model_config['more_like_this'] = app_model_config.more_like_this_dict
             model_config['user_input_form'] = app_model_config.user_input_form_list
 

+ 22 - 0
api/services/app_model_config_service.py

@@ -4,6 +4,7 @@ import uuid
 from core.constant import llm_constant
 from models.account import Account
 from services.dataset_service import DatasetService
+from core.llm.llm_builder import LLMBuilder
 
 
 class AppModelConfigService:
@@ -109,6 +110,26 @@ class AppModelConfigService:
         if not isinstance(config["suggested_questions_after_answer"]["enabled"], bool):
             raise ValueError("enabled in suggested_questions_after_answer must be of boolean type")
 
+        # speech_to_text
+        if 'speech_to_text' not in config or not config["speech_to_text"]:
+            config["speech_to_text"] = {
+                "enabled": False
+            }
+
+        if not isinstance(config["speech_to_text"], dict):
+            raise ValueError("speech_to_text must be of dict type")
+
+        if "enabled" not in config["speech_to_text"] or not config["speech_to_text"]["enabled"]:
+            config["speech_to_text"]["enabled"] = False
+
+        if not isinstance(config["speech_to_text"]["enabled"], bool):
+            raise ValueError("enabled in speech_to_text must be of boolean type")
+        
+        provider_name = LLMBuilder.get_default_provider(account.current_tenant_id)
+
+        if config["speech_to_text"]["enabled"] and provider_name != 'openai':
+            raise ValueError("provider not support speech to text")
+
         # more_like_this
         if 'more_like_this' not in config or not config["more_like_this"]:
             config["more_like_this"] = {
@@ -277,6 +298,7 @@ class AppModelConfigService:
             "opening_statement": config["opening_statement"],
             "suggested_questions": config["suggested_questions"],
             "suggested_questions_after_answer": config["suggested_questions_after_answer"],
+            "speech_to_text": config["speech_to_text"],
             "more_like_this": config["more_like_this"],
             "model": {
                 "provider": config["model"]["provider"],

+ 43 - 0
api/services/audio_service.py

@@ -0,0 +1,43 @@
+import io
+from werkzeug.datastructures import FileStorage
+from core.llm.llm_builder import LLMBuilder
+from core.llm.provider.llm_provider_service import LLMProviderService
+from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
+from core.llm.whisper import Whisper
+from models.provider import ProviderName
+
+FILE_SIZE_LIMIT = 1 * 1024 * 1024
+ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
+
+class AudioService:
+    @classmethod
+    def transcript(cls, tenant_id: str, file: FileStorage):
+        if file is None:
+            raise NoAudioUploadedServiceError()
+        
+        extension = file.mimetype
+        if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]:
+            raise UnsupportedAudioTypeServiceError()
+
+        file_content = file.read()
+        file_size = len(file_content)
+
+        if file_size > FILE_SIZE_LIMIT:
+            message = f"({file_size} > {FILE_SIZE_LIMIT})"
+            raise AudioTooLargeServiceError(message)
+        
+        provider_name = LLMBuilder.get_default_provider(tenant_id)
+        if provider_name != ProviderName.OPENAI.value:
+            raise ProviderNotSupportSpeechToTextServiceError('haha')
+
+        provider_service = LLMProviderService(tenant_id, provider_name)
+
+        buffer = io.BytesIO(file_content)
+        buffer.name = 'temp.wav'
+
+        return Whisper(provider_service.provider).transcribe(buffer)
+
+
+
+        
+        

+ 1 - 1
api/services/errors/__init__.py

@@ -1,7 +1,7 @@
 # -*- coding:utf-8 -*-
 __all__ = [
     'base', 'conversation', 'message', 'index', 'app_model_config', 'account', 'document', 'dataset',
-    'app', 'completion'
+    'app', 'completion', 'audio'
 ]
 
 from . import *

+ 23 - 0
api/services/errors/audio.py

@@ -0,0 +1,23 @@
+from services.errors.base import BaseServiceError
+
+class NoAudioUploadedServiceError(BaseServiceError):
+    """Raised by ``AudioService.transcript`` when no audio file was provided.
+
+    NOTE(review): ``code`` looks like the HTTP status used for the response (400) - confirm
+    against ``BaseServiceError``.
+    """
+    error_code = 'no_audio_uploaded'
+    description = "Please upload your audio."
+    code = 400
+
+
+class AudioTooLargeServiceError(BaseServiceError):
+    """Raised by ``AudioService.transcript`` when the uploaded audio exceeds the size limit.
+
+    The caller passes a message of the form ``(<actual size> > <limit>)``; presumably
+    ``BaseServiceError`` substitutes it for ``{message}`` in ``description`` - confirm.
+    """
+    error_code = 'audio_too_large'
+    description = "Audio size exceeded. {message}"
+    code = 413
+
+
+class UnsupportedAudioTypeServiceError(BaseServiceError):
+    """Raised by ``AudioService.transcript`` when the upload's MIME type is not an allowed audio type."""
+    error_code = 'unsupported_audio_type'
+    description = "Audio type not allowed."
+    code = 415
+
+class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
+    """Raised by ``AudioService.transcript`` when the tenant's default provider is not OpenAI.
+
+    The constructor argument is presumably substituted for ``{message}`` in ``description`` -
+    confirm against ``BaseServiceError``.
+    """
+    error_code = 'provider_not_support_speech_to_text'
+    description = "Provider not support speech to text. {message}"
+    code = 400

+ 44 - 0
web/app/components/app/chat/index.tsx

@@ -3,6 +3,7 @@ import type { FC } from 'react'
 import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
 import { useContext } from 'use-context-selector'
 import cn from 'classnames'
+import Recorder from 'js-audio-recorder'
 import { HandThumbDownIcon, HandThumbUpIcon } from '@heroicons/react/24/outline'
 import { UserCircleIcon } from '@heroicons/react/24/solid'
 import { useTranslation } from 'react-i18next'
@@ -19,6 +20,10 @@ import AppContext from '@/context/app-context'
 import { Markdown } from '@/app/components/base/markdown'
 import { formatNumber } from '@/utils/format'
 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
+import VoiceInput from '@/app/components/base/voice-input'
+import { Microphone01 } from '@/app/components/base/icons/src/vender/line/mediaAndDevices'
+import { Microphone01 as Microphone01Solid } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
+import { XCircle } from '@/app/components/base/icons/src/vender/solid/general'
 
 const stopIcon = (
   <svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg">
@@ -59,6 +64,7 @@ export type IChatProps = {
   controlFocus?: number
   isShowSuggestion?: boolean
   suggestionList?: string[]
+  isShowSpeechToText?: boolean
 }
 
 export type MessageMore = {
@@ -421,6 +427,7 @@ const Chat: FC<IChatProps> = ({
   controlFocus,
   isShowSuggestion,
   suggestionList,
+  isShowSpeechToText,
 }) => {
   const { t } = useTranslation()
   const { notify } = useContext(ToastContext)
@@ -488,6 +495,15 @@ const Chat: FC<IChatProps> = ({
     }
   }, [suggestionList])
 
+  const [voiceInputShow, setVoiceInputShow] = useState(false)
+  // Opens the voice-input overlay, but only after `Recorder.getPermission()` resolves
+  // (presumably the microphone permission prompt of js-audio-recorder - confirm).
+  // When the promise rejects, the overlay stays hidden and a translated
+  // "not allowed" message is reported through `logError`.
+  const handleVoiceInputShow = () => {
+    (Recorder as any).getPermission().then(() => {
+      setVoiceInputShow(true)
+    }, () => {
+      logError(t('common.voiceInput.notAllow'))
+    })
+  }
+
   return (
     <div className={cn('px-3.5', 'h-full')}>
       {/* Chat List */}
@@ -565,6 +581,26 @@ const Chat: FC<IChatProps> = ({
               />
               <div className="absolute top-0 right-2 flex items-center h-[48px]">
                 <div className={`${s.count} mr-4 h-5 leading-5 text-sm bg-gray-50 text-gray-500`}>{query.trim().length}</div>
+                {
+                  query
+                    ? (
+                      <div className='flex justify-center items-center w-8 h-8 cursor-pointer hover:bg-gray-100 rounded-lg' onClick={() => setQuery('')}>
+                        <XCircle className='w-4 h-4 text-[#98A2B3]' />
+                      </div>
+                    )
+                    : isShowSpeechToText
+                      ? (
+                        <div
+                          className='group flex justify-center items-center w-8 h-8 hover:bg-primary-50 rounded-lg cursor-pointer'
+                          onClick={handleVoiceInputShow}
+                        >
+                          <Microphone01 className='block w-4 h-4 text-gray-500 group-hover:hidden' />
+                          <Microphone01Solid className='hidden w-4 h-4 text-primary-600 group-hover:block' />
+                        </div>
+                      )
+                      : null
+                }
+                <div className='mx-2 w-[1px] h-4 bg-black opacity-5' />
                 {isMobile
                   ? sendBtn
                   : (
@@ -581,6 +617,14 @@ const Chat: FC<IChatProps> = ({
                     </Tooltip>
                   )}
               </div>
+              {
+                voiceInputShow && (
+                  <VoiceInput
+                    onCancel={() => setVoiceInputShow(false)}
+                    onConverted={text => setQuery(text)}
+                  />
+                )
+              }
             </div>
           </div>
         )

+ 1 - 1
web/app/components/app/chat/style.module.css

@@ -79,7 +79,7 @@
 .textArea {
   padding-top: 13px;
   padding-bottom: 13px;
-  padding-right: 90px;
+  padding-right: 130px;
   border-radius: 12px;
   line-height: 20px;
   background-color: #fff;

File diff suppressed because it is too large
+ 20 - 0
web/app/components/app/configuration/config/feature/choose-feature/feature-item/preview-imgs/speech-to-text.svg


+ 4 - 0
web/app/components/app/configuration/config/feature/choose-feature/feature-item/style.module.css

@@ -22,4 +22,8 @@
 
 .moreLikeThisPreview {
   background-image: url(./preview-imgs/more-like-this.svg);
+}
+
+.speechToTextPreview {
+  background-image: url(./preview-imgs/speech-to-text.svg);
 }

+ 16 - 0
web/app/components/app/configuration/config/feature/choose-feature/index.tsx

@@ -7,10 +7,12 @@ import MoreLikeThisIcon from '../../../base/icons/more-like-this-icon'
 import FeatureItem from './feature-item'
 import Modal from '@/app/components/base/modal'
 import SuggestedQuestionsAfterAnswerIcon from '@/app/components/app/configuration/base/icons/suggested-questions-after-answer-icon'
+import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
 type IConfig = {
   openingStatement: boolean
   moreLikeThis: boolean
   suggestedQuestionsAfterAnswer: boolean
+  speechToText: boolean
 }
 
 export type IChooseFeatureProps = {
@@ -19,6 +21,7 @@ export type IChooseFeatureProps = {
   config: IConfig
   isChatApp: boolean
   onChange: (key: string, value: boolean) => void
+  showSpeechToTextItem?: boolean
 }
 
 const OpeningStatementIcon = (
@@ -33,6 +36,7 @@ const ChooseFeature: FC<IChooseFeatureProps> = ({
   isChatApp,
   config,
   onChange,
+  showSpeechToTextItem,
 }) => {
   const { t } = useTranslation()
 
@@ -69,6 +73,18 @@ const ChooseFeature: FC<IChooseFeatureProps> = ({
                 value={config.suggestedQuestionsAfterAnswer}
                 onChange={value => onChange('suggestedQuestionsAfterAnswer', value)}
               />
+              {
+                showSpeechToTextItem && (
+                  <FeatureItem
+                    icon={<Microphone01 className='w-4 h-4 text-[#7839EE]' />}
+                    previewImgClassName='speechToTextPreview'
+                    title={t('appDebug.feature.speechToText.title')}
+                    description={t('appDebug.feature.speechToText.description')}
+                    value={config.speechToText}
+                    onChange={value => onChange('speechToText', value)}
+                  />
+                )
+              }
             </>
           </FeatureGroup>
         )}

+ 14 - 8
web/app/components/app/configuration/config/feature/use-feature.tsx

@@ -7,6 +7,8 @@ function useFeature({
   setMoreLikeThis,
   suggestedQuestionsAfterAnswer,
   setSuggestedQuestionsAfterAnswer,
+  speechToText,
+  setSpeechToText,
 }: {
   introduction: string
   setIntroduction: (introduction: string) => void
@@ -14,13 +16,14 @@ function useFeature({
   setMoreLikeThis: (moreLikeThis: boolean) => void
   suggestedQuestionsAfterAnswer: boolean
   setSuggestedQuestionsAfterAnswer: (suggestedQuestionsAfterAnswer: boolean) => void
+  speechToText: boolean
+  setSpeechToText: (speechToText: boolean) => void
 }) {
   const [tempshowOpeningStatement, setTempShowOpeningStatement] = React.useState(!!introduction)
   useEffect(() => {
     // wait to api data back
-    if (!!introduction) {
+    if (introduction)
       setTempShowOpeningStatement(true)
-    }
   }, [introduction])
 
   // const [tempMoreLikeThis, setTempMoreLikeThis] = React.useState(moreLikeThis)
@@ -30,15 +33,16 @@ function useFeature({
 
   const featureConfig = {
     openingStatement: tempshowOpeningStatement,
-    moreLikeThis: moreLikeThis,
-    suggestedQuestionsAfterAnswer: suggestedQuestionsAfterAnswer
+    moreLikeThis,
+    suggestedQuestionsAfterAnswer,
+    speechToText,
   }
   const handleFeatureChange = (key: string, value: boolean) => {
     switch (key) {
       case 'openingStatement':
-        if (!value) {
+        if (!value)
           setIntroduction('')
-        }
+
         setTempShowOpeningStatement(value)
         break
       case 'moreLikeThis':
@@ -47,12 +51,14 @@ function useFeature({
       case 'suggestedQuestionsAfterAnswer':
         setSuggestedQuestionsAfterAnswer(value)
         break
+      case 'speechToText':
+        setSpeechToText(value)
     }
   }
   return {
     featureConfig,
-    handleFeatureChange
+    handleFeatureChange,
   }
 }
 
-export default useFeature
+export default useFeature

+ 15 - 1
web/app/components/app/configuration/config/index.tsx

@@ -4,6 +4,7 @@ import React from 'react'
 import { useContext } from 'use-context-selector'
 import produce from 'immer'
 import { useBoolean } from 'ahooks'
+import useSWR from 'swr'
 import DatasetConfig from '../dataset-config'
 import ChatGroup from '../features/chat-group'
 import ExperienceEnchanceGroup from '../features/experience-enchance-group'
@@ -19,6 +20,7 @@ import ConfigPrompt from '@/app/components/app/configuration/config-prompt'
 import ConfigVar from '@/app/components/app/configuration/config-var'
 import type { PromptVariable } from '@/models/debug'
 import { AppType } from '@/types/app'
+import { fetchTenantInfo } from '@/service/common'
 
 const Config: FC = () => {
   const {
@@ -33,8 +35,12 @@ const Config: FC = () => {
     setMoreLikeThisConfig,
     suggestedQuestionsAfterAnswerConfig,
     setSuggestedQuestionsAfterAnswerConfig,
+    speechToTextConfig,
+    setSpeechToTextConfig,
   } = useContext(ConfigContext)
   const isChatApp = mode === AppType.chat
+  const { data: userInfo } = useSWR({ url: '/info' }, fetchTenantInfo)
+  const targetProvider = userInfo?.providers?.find(({ token_is_set, is_valid }) => token_is_set && is_valid)
 
   const promptTemplate = modelConfig.configs.prompt_template
   const promptVariables = modelConfig.configs.prompt_variables
@@ -78,9 +84,15 @@ const Config: FC = () => {
         draft.enabled = value
       }))
     },
+    speechToText: speechToTextConfig.enabled,
+    setSpeechToText: (value) => {
+      setSpeechToTextConfig(produce(speechToTextConfig, (draft) => {
+        draft.enabled = value
+      }))
+    },
   })
 
-  const hasChatConfig = isChatApp && (featureConfig.openingStatement || featureConfig.suggestedQuestionsAfterAnswer)
+  const hasChatConfig = isChatApp && (featureConfig.openingStatement || featureConfig.suggestedQuestionsAfterAnswer || (featureConfig.speechToText && targetProvider?.provider_name === 'openai'))
   const hasToolbox = false
 
   const [showAutomatic, { setTrue: showAutomaticTrue, setFalse: showAutomaticFalse }] = useBoolean(false)
@@ -110,6 +122,7 @@ const Config: FC = () => {
             isChatApp={isChatApp}
             config={featureConfig}
             onChange={handleFeatureChange}
+            showSpeechToTextItem={targetProvider?.provider_name === 'openai'}
           />
         )}
         {showAutomatic && (
@@ -149,6 +162,7 @@ const Config: FC = () => {
                 }
               }
               isShowSuggestedQuestionsAfterAnswer={featureConfig.suggestedQuestionsAfterAnswer}
+              isShowSpeechText={featureConfig.speechToText}
             />
           )
         }

+ 4 - 0
web/app/components/app/configuration/debug/index.tsx

@@ -38,6 +38,7 @@ const Debug: FC<IDebug> = ({
     mode,
     introduction,
     suggestedQuestionsAfterAnswerConfig,
+    speechToTextConfig,
     moreLikeThisConfig,
     inputs,
     // setInputs,
@@ -159,6 +160,7 @@ const Debug: FC<IDebug> = ({
         enabled: false,
       },
       suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
+      speech_to_text: speechToTextConfig,
       agent_mode: {
         enabled: true,
         tools: [...postDatasets],
@@ -308,6 +310,7 @@ const Debug: FC<IDebug> = ({
       user_input_form: promptVariablesToUserInputsForm(modelConfig.configs.prompt_variables),
       opening_statement: introduction,
       suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
+      speech_to_text: speechToTextConfig,
       more_like_this: moreLikeThisConfig,
       agent_mode: {
         enabled: true,
@@ -386,6 +389,7 @@ const Debug: FC<IDebug> = ({
                   }}
                   isShowSuggestion={doShowSuggestion}
                   suggestionList={suggestQuestions}
+                  isShowSpeechToText={speechToTextConfig.enabled}
                 />
               </div>
             </div>

+ 16 - 6
web/app/components/app/configuration/features/chat-group/index.tsx

@@ -1,25 +1,30 @@
 'use client'
-import React, { FC } from 'react'
+import type { FC } from 'react'
+import React from 'react'
+import { useTranslation } from 'react-i18next'
 import GroupName from '../../base/group-name'
-import OpeningStatement, { IOpeningStatementProps } from './opening-statement'
+import type { IOpeningStatementProps } from './opening-statement'
+import OpeningStatement from './opening-statement'
 import SuggestedQuestionsAfterAnswer from './suggested-questions-after-answer'
-import { useTranslation } from 'react-i18next'
+import SpeechToText from './speech-to-text'
 
 /*
-* Include 
+* Include
 * 1. Conversation Opener
 * 2. Opening Suggestion
 * 3. Next question suggestion
+* 4. Speech to text
 */
-interface ChatGroupProps {
+type ChatGroupProps = {
   isShowOpeningStatement: boolean
   openingStatementConfig: IOpeningStatementProps
   isShowSuggestedQuestionsAfterAnswer: boolean
+  isShowSpeechText: boolean
 }
 const ChatGroup: FC<ChatGroupProps> = ({
   isShowOpeningStatement,
   openingStatementConfig,
-  isShowSuggestedQuestionsAfterAnswer
+  isShowSuggestedQuestionsAfterAnswer,
+  isShowSpeechText,
 }) => {
   const { t } = useTranslation()
 
@@ -33,6 +38,11 @@ const ChatGroup: FC<ChatGroupProps> = ({
         {isShowSuggestedQuestionsAfterAnswer && (
           <SuggestedQuestionsAfterAnswer />
         )}
+        {
+          isShowSpeechText && (
+            <SpeechToText />
+          )
+        }
       </div>
     </div>
   )

+ 25 - 0
web/app/components/app/configuration/features/chat-group/speech-to-text/index.tsx

@@ -0,0 +1,25 @@
+'use client'
+import React, { type FC } from 'react'
+import { useTranslation } from 'react-i18next'
+import Panel from '@/app/components/app/configuration/base/feature-panel'
+import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
+
+/**
+ * Read-only feature panel shown in the chat configuration group when
+ * speech-to-text is enabled. It only renders a title, a microphone icon and a
+ * short description on the right; it takes no props and has no body content.
+ */
+const SpeechToText: FC = () => {
+  const { t } = useTranslation()
+
+  return (
+    <Panel
+      title={
+        <div className='flex items-center gap-2'>
+          <div>{t('appDebug.feature.speechToText.title')}</div>
+        </div>
+      }
+      headerIcon={<Microphone01 className='w-4 h-4 text-[#7839EE]' />}
+      headerRight={
+        <div className='text-xs text-gray-500'>{t('appDebug.feature.speechToText.resDes')}</div>
+      }
+      noBodySpacing
+    />
+  )
+}
+export default React.memo(SpeechToText)

+ 15 - 0
web/app/components/app/configuration/index.tsx

@@ -53,6 +53,9 @@ const Configuration: FC = () => {
   const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState<MoreLikeThisConfig>({
     enabled: false,
   })
+  const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({
+    enabled: false,
+  })
   const [formattingChanged, setFormattingChanged] = useState(false)
   const [inputs, setInputs] = useState<Inputs>({})
   const [query, setQuery] = useState('')
@@ -73,6 +76,7 @@ const Configuration: FC = () => {
     opening_statement: '',
     more_like_this: null,
     suggested_questions_after_answer: null,
+    speech_to_text: null,
     dataSets: [],
   })
 
@@ -102,6 +106,9 @@ const Configuration: FC = () => {
     setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer || {
       enabled: false,
     })
+    setSpeechToTextConfig(modelConfig.speech_to_text || {
+      enabled: false,
+    })
   }
 
   const [hasSetCustomAPIKEY, setHasSetCustomerAPIKEY] = useState(true)
@@ -146,6 +153,9 @@ const Configuration: FC = () => {
       if (modelConfig.suggested_questions_after_answer)
         setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer)
 
+      if (modelConfig.speech_to_text)
+        setSpeechToTextConfig(modelConfig.speech_to_text)
+
       const config = {
         modelConfig: {
           provider: model.provider,
@@ -157,6 +167,7 @@ const Configuration: FC = () => {
           opening_statement: modelConfig.opening_statement,
           more_like_this: modelConfig.more_like_this,
           suggested_questions_after_answer: modelConfig.suggested_questions_after_answer,
+          speech_to_text: modelConfig.speech_to_text,
           dataSets: datasets || [],
         },
         completionParams: model.completion_params,
@@ -187,6 +198,7 @@ const Configuration: FC = () => {
       opening_statement: introduction || '',
       more_like_this: moreLikeThisConfig,
       suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
+      speech_to_text: speechToTextConfig,
       agent_mode: {
         enabled: true,
         tools: [...postDatasets],
@@ -203,6 +215,7 @@ const Configuration: FC = () => {
       draft.opening_statement = introduction
       draft.more_like_this = moreLikeThisConfig
       draft.suggested_questions_after_answer = suggestedQuestionsAfterAnswerConfig
+      draft.speech_to_text = speechToTextConfig
       draft.dataSets = dataSets
     })
     setPublishedConfig({
@@ -245,6 +258,8 @@ const Configuration: FC = () => {
       setMoreLikeThisConfig,
       suggestedQuestionsAfterAnswerConfig,
       setSuggestedQuestionsAfterAnswerConfig,
+      speechToTextConfig,
+      setSpeechToTextConfig,
       formattingChanged,
       setFormattingChanged,
       inputs,

+ 10 - 0
web/app/components/base/icons/assets/vender/line/general/loading-02.svg

@@ -0,0 +1,10 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g clip-path="url(#clip0_6037_51601)">
+<path d="M7.99992 1.33398V4.00065M7.99992 12.0007V14.6673M3.99992 8.00065H1.33325M14.6666 8.00065H11.9999M12.7189 12.7196L10.8333 10.834M12.7189 3.33395L10.8333 5.21956M3.28097 12.7196L5.16659 10.834M3.28097 3.33395L5.16659 5.21956" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
+</g>
+<defs>
+<clipPath id="clip0_6037_51601">
+<rect width="16" height="16" fill="white"/>
+</clipPath>
+</defs>
+</svg>

+ 5 - 0
web/app/components/base/icons/assets/vender/line/general/x-close.svg

@@ -0,0 +1,5 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g id="x-close">
+<path id="Icon" d="M12 4L4 12M4 4L12 12" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
+</g>
+</svg>

+ 5 - 0
web/app/components/base/icons/assets/vender/line/mediaAndDevices/microphone-01.svg

@@ -0,0 +1,5 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g id="microphone-01">
+<path id="Icon" d="M12.6666 6.66732V8.00065C12.6666 10.578 10.5772 12.6673 7.99992 12.6673M3.33325 6.66732V8.00065C3.33325 10.578 5.42259 12.6673 7.99992 12.6673M7.99992 12.6673V14.6673M5.33325 14.6673H10.6666M7.99992 10.0007C6.89535 10.0007 5.99992 9.10522 5.99992 8.00065V3.33398C5.99992 2.22941 6.89535 1.33398 7.99992 1.33398C9.10449 1.33398 9.99992 2.22941 9.99992 3.33398V8.00065C9.99992 9.10522 9.10449 10.0007 7.99992 10.0007Z" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
+</g>
+</svg>

+ 3 - 0
web/app/components/base/icons/assets/vender/solid/general/x-circle.svg

@@ -0,0 +1,3 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path id="Solid" fill-rule="evenodd" clip-rule="evenodd" d="M8.00008 0.666016C3.94999 0.666016 0.666748 3.94926 0.666748 7.99935C0.666748 12.0494 3.94999 15.3327 8.00008 15.3327C12.0502 15.3327 15.3334 12.0494 15.3334 7.99935C15.3334 3.94926 12.0502 0.666016 8.00008 0.666016ZM10.4715 5.52794C10.7318 5.78829 10.7318 6.2104 10.4715 6.47075L8.94289 7.99935L10.4715 9.52794C10.7318 9.78829 10.7318 10.2104 10.4715 10.4708C10.2111 10.7311 9.78903 10.7311 9.52868 10.4708L8.00008 8.94216L6.47149 10.4708C6.21114 10.7311 5.78903 10.7311 5.52868 10.4708C5.26833 10.2104 5.26833 9.78829 5.52868 9.52794L7.05727 7.99935L5.52868 6.47075C5.26833 6.2104 5.26833 5.78829 5.52868 5.52794C5.78903 5.26759 6.21114 5.26759 6.47149 5.52794L8.00008 7.05654L9.52868 5.52794C9.78903 5.26759 10.2111 5.26759 10.4715 5.52794Z" fill="#98A2B3"/>
+</svg>

+ 8 - 0
web/app/components/base/icons/assets/vender/solid/mediaAndDevices/microphone-01.svg

@@ -0,0 +1,8 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g id="microphone-01">
+<g id="Solid">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M8.00008 0.666016C6.52732 0.666016 5.33341 1.85992 5.33341 3.33268V7.99935C5.33341 9.47211 6.52732 10.666 8.00008 10.666C9.47284 10.666 10.6667 9.47211 10.6667 7.99935V3.33268C10.6667 1.85992 9.47284 0.666016 8.00008 0.666016Z" fill="#155EEF"/>
+<path d="M4.00008 6.66602C4.00008 6.29783 3.7016 5.99935 3.33341 5.99935C2.96522 5.99935 2.66675 6.29783 2.66675 6.66602V7.99935C2.66675 10.7195 4.70319 12.9641 7.33466 13.2916C7.33384 13.3052 7.33341 13.3189 7.33341 13.3327V13.9993H5.33341C4.96522 13.9993 4.66675 14.2978 4.66675 14.666C4.66675 15.0342 4.96522 15.3327 5.33341 15.3327H10.6667C11.0349 15.3327 11.3334 15.0342 11.3334 14.666C11.3334 14.2978 11.0349 13.9993 10.6667 13.9993H8.66675V13.3327C8.66675 13.3189 8.66633 13.3052 8.6655 13.2916C11.297 12.9641 13.3334 10.7195 13.3334 7.99935V6.66602C13.3334 6.29783 13.0349 5.99935 12.6667 5.99935C12.2986 5.99935 12.0001 6.29783 12.0001 6.66602V7.99935C12.0001 10.2085 10.2092 11.9993 8.00008 11.9993C5.79094 11.9993 4.00008 10.2085 4.00008 7.99935V6.66602Z" fill="#155EEF"/>
+</g>
+</g>
+</svg>

+ 5 - 0
web/app/components/base/icons/assets/vender/solid/mediaAndDevices/stop-circle.svg

@@ -0,0 +1,5 @@
+<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
+<g id="stop-circle">
+<path id="Solid" fill-rule="evenodd" clip-rule="evenodd" d="M9.99992 0.833984C4.93731 0.833984 0.833252 4.93804 0.833252 10.0007C0.833252 15.0633 4.93731 19.1673 9.99992 19.1673C15.0625 19.1673 19.1666 15.0633 19.1666 10.0007C19.1666 4.93804 15.0625 0.833984 9.99992 0.833984ZM6.75741 7.12232C6.66658 7.30058 6.66658 7.53394 6.66658 8.00065V12.0006C6.66658 12.4674 6.66658 12.7007 6.75741 12.879C6.83731 13.0358 6.96479 13.1633 7.12159 13.2432C7.29985 13.334 7.53321 13.334 7.99992 13.334H11.9999C12.4666 13.334 12.7 13.334 12.8782 13.2432C13.035 13.1633 13.1625 13.0358 13.2424 12.879C13.3333 12.7007 13.3333 12.4674 13.3333 12.0006V8.00065C13.3333 7.53394 13.3333 7.30058 13.2424 7.12232C13.1625 6.96552 13.035 6.83804 12.8782 6.75814C12.7 6.66732 12.4666 6.66732 11.9999 6.66732H7.99992C7.53321 6.66732 7.29985 6.66732 7.12159 6.75814C6.96479 6.83804 6.83731 6.96552 6.75741 7.12232Z" fill="#155EEF"/>
+</g>
+</svg>

+ 64 - 0
web/app/components/base/icons/src/vender/line/general/Loading02.json

@@ -0,0 +1,64 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "g",
+				"attributes": {
+					"clip-path": "url(#clip0_6037_51601)"
+				},
+				"children": [
+					{
+						"type": "element",
+						"name": "path",
+						"attributes": {
+							"d": "M7.99992 1.33398V4.00065M7.99992 12.0007V14.6673M3.99992 8.00065H1.33325M14.6666 8.00065H11.9999M12.7189 12.7196L10.8333 10.834M12.7189 3.33395L10.8333 5.21956M3.28097 12.7196L5.16659 10.834M3.28097 3.33395L5.16659 5.21956",
+							"stroke": "currentColor",
+							"stroke-width": "1.25",
+							"stroke-linecap": "round",
+							"stroke-linejoin": "round"
+						},
+						"children": []
+					}
+				]
+			},
+			{
+				"type": "element",
+				"name": "defs",
+				"attributes": {},
+				"children": [
+					{
+						"type": "element",
+						"name": "clipPath",
+						"attributes": {
+							"id": "clip0_6037_51601"
+						},
+						"children": [
+							{
+								"type": "element",
+								"name": "rect",
+								"attributes": {
+									"width": "16",
+									"height": "16",
+									"fill": "white"
+								},
+								"children": []
+							}
+						]
+					}
+				]
+			}
+		]
+	},
+	"name": "Loading02"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/line/general/Loading02.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './Loading02.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 39 - 0
web/app/components/base/icons/src/vender/line/general/XClose.json

@@ -0,0 +1,39 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "g",
+				"attributes": {
+					"id": "x-close"
+				},
+				"children": [
+					{
+						"type": "element",
+						"name": "path",
+						"attributes": {
+							"id": "Icon",
+							"d": "M12 4L4 12M4 4L12 12",
+							"stroke": "currentColor",
+							"stroke-width": "1.25",
+							"stroke-linecap": "round",
+							"stroke-linejoin": "round"
+						},
+						"children": []
+					}
+				]
+			}
+		]
+	},
+	"name": "XClose"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/line/general/XClose.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './XClose.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 2 - 0
web/app/components/base/icons/src/vender/line/general/index.ts

@@ -1,2 +1,4 @@
+export { default as Loading02 } from './Loading02'
 export { default as Trash03 } from './Trash03'
+export { default as XClose } from './XClose'
 export { default as X } from './X'

+ 39 - 0
web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.json

@@ -0,0 +1,39 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "g",
+				"attributes": {
+					"id": "microphone-01"
+				},
+				"children": [
+					{
+						"type": "element",
+						"name": "path",
+						"attributes": {
+							"id": "Icon",
+							"d": "M12.6666 6.66732V8.00065C12.6666 10.578 10.5772 12.6673 7.99992 12.6673M3.33325 6.66732V8.00065C3.33325 10.578 5.42259 12.6673 7.99992 12.6673M7.99992 12.6673V14.6673M5.33325 14.6673H10.6666M7.99992 10.0007C6.89535 10.0007 5.99992 9.10522 5.99992 8.00065V3.33398C5.99992 2.22941 6.89535 1.33398 7.99992 1.33398C9.10449 1.33398 9.99992 2.22941 9.99992 3.33398V8.00065C9.99992 9.10522 9.10449 10.0007 7.99992 10.0007Z",
+							"stroke": "currentColor",
+							"stroke-width": "1.25",
+							"stroke-linecap": "round",
+							"stroke-linejoin": "round"
+						},
+						"children": []
+					}
+				]
+			}
+		]
+	},
+	"name": "Microphone01"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/line/mediaAndDevices/Microphone01.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './Microphone01.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 1 - 0
web/app/components/base/icons/src/vender/line/mediaAndDevices/index.ts

@@ -0,0 +1 @@
+export { default as Microphone01 } from './Microphone01'

+ 29 - 0
web/app/components/base/icons/src/vender/solid/general/XCircle.json

@@ -0,0 +1,29 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "path",
+				"attributes": {
+					"id": "Solid",
+					"fill-rule": "evenodd",
+					"clip-rule": "evenodd",
+					"d": "M8.00008 0.666016C3.94999 0.666016 0.666748 3.94926 0.666748 7.99935C0.666748 12.0494 3.94999 15.3327 8.00008 15.3327C12.0502 15.3327 15.3334 12.0494 15.3334 7.99935C15.3334 3.94926 12.0502 0.666016 8.00008 0.666016ZM10.4715 5.52794C10.7318 5.78829 10.7318 6.2104 10.4715 6.47075L8.94289 7.99935L10.4715 9.52794C10.7318 9.78829 10.7318 10.2104 10.4715 10.4708C10.2111 10.7311 9.78903 10.7311 9.52868 10.4708L8.00008 8.94216L6.47149 10.4708C6.21114 10.7311 5.78903 10.7311 5.52868 10.4708C5.26833 10.2104 5.26833 9.78829 5.52868 9.52794L7.05727 7.99935L5.52868 6.47075C5.26833 6.2104 5.26833 5.78829 5.52868 5.52794C5.78903 5.26759 6.21114 5.26759 6.47149 5.52794L8.00008 7.05654L9.52868 5.52794C9.78903 5.26759 10.2111 5.26759 10.4715 5.52794Z",
+					"fill": "currentColor"
+				},
+				"children": []
+			}
+		]
+	},
+	"name": "XCircle"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/solid/general/XCircle.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './XCircle.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 1 - 0
web/app/components/base/icons/src/vender/solid/general/index.ts

@@ -1 +1,2 @@
 export { default as Download02 } from './Download02'
+export { default as XCircle } from './XCircle'

+ 55 - 0
web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.json

@@ -0,0 +1,55 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "16",
+			"height": "16",
+			"viewBox": "0 0 16 16",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "g",
+				"attributes": {
+					"id": "microphone-01"
+				},
+				"children": [
+					{
+						"type": "element",
+						"name": "g",
+						"attributes": {
+							"id": "Solid"
+						},
+						"children": [
+							{
+								"type": "element",
+								"name": "path",
+								"attributes": {
+									"fill-rule": "evenodd",
+									"clip-rule": "evenodd",
+									"d": "M8.00008 0.666016C6.52732 0.666016 5.33341 1.85992 5.33341 3.33268V7.99935C5.33341 9.47211 6.52732 10.666 8.00008 10.666C9.47284 10.666 10.6667 9.47211 10.6667 7.99935V3.33268C10.6667 1.85992 9.47284 0.666016 8.00008 0.666016Z",
+									"fill": "currentColor"
+								},
+								"children": []
+							},
+							{
+								"type": "element",
+								"name": "path",
+								"attributes": {
+									"d": "M4.00008 6.66602C4.00008 6.29783 3.7016 5.99935 3.33341 5.99935C2.96522 5.99935 2.66675 6.29783 2.66675 6.66602V7.99935C2.66675 10.7195 4.70319 12.9641 7.33466 13.2916C7.33384 13.3052 7.33341 13.3189 7.33341 13.3327V13.9993H5.33341C4.96522 13.9993 4.66675 14.2978 4.66675 14.666C4.66675 15.0342 4.96522 15.3327 5.33341 15.3327H10.6667C11.0349 15.3327 11.3334 15.0342 11.3334 14.666C11.3334 14.2978 11.0349 13.9993 10.6667 13.9993H8.66675V13.3327C8.66675 13.3189 8.66633 13.3052 8.6655 13.2916C11.297 12.9641 13.3334 10.7195 13.3334 7.99935V6.66602C13.3334 6.29783 13.0349 5.99935 12.6667 5.99935C12.2986 5.99935 12.0001 6.29783 12.0001 6.66602V7.99935C12.0001 10.2085 10.2092 11.9993 8.00008 11.9993C5.79094 11.9993 4.00008 10.2085 4.00008 7.99935V6.66602Z",
+									"fill": "currentColor"
+								},
+								"children": []
+							}
+						]
+					}
+				]
+			}
+		]
+	},
+	"name": "Microphone01"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/solid/mediaAndDevices/Microphone01.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './Microphone01.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 38 - 0
web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.json

@@ -0,0 +1,38 @@
+{
+	"icon": {
+		"type": "element",
+		"isRootNode": true,
+		"name": "svg",
+		"attributes": {
+			"width": "20",
+			"height": "20",
+			"viewBox": "0 0 20 20",
+			"fill": "none",
+			"xmlns": "http://www.w3.org/2000/svg"
+		},
+		"children": [
+			{
+				"type": "element",
+				"name": "g",
+				"attributes": {
+					"id": "stop-circle"
+				},
+				"children": [
+					{
+						"type": "element",
+						"name": "path",
+						"attributes": {
+							"id": "Solid",
+							"fill-rule": "evenodd",
+							"clip-rule": "evenodd",
+							"d": "M9.99992 0.833984C4.93731 0.833984 0.833252 4.93804 0.833252 10.0007C0.833252 15.0633 4.93731 19.1673 9.99992 19.1673C15.0625 19.1673 19.1666 15.0633 19.1666 10.0007C19.1666 4.93804 15.0625 0.833984 9.99992 0.833984ZM6.75741 7.12232C6.66658 7.30058 6.66658 7.53394 6.66658 8.00065V12.0006C6.66658 12.4674 6.66658 12.7007 6.75741 12.879C6.83731 13.0358 6.96479 13.1633 7.12159 13.2432C7.29985 13.334 7.53321 13.334 7.99992 13.334H11.9999C12.4666 13.334 12.7 13.334 12.8782 13.2432C13.035 13.1633 13.1625 13.0358 13.2424 12.879C13.3333 12.7007 13.3333 12.4674 13.3333 12.0006V8.00065C13.3333 7.53394 13.3333 7.30058 13.2424 7.12232C13.1625 6.96552 13.035 6.83804 12.8782 6.75814C12.7 6.66732 12.4666 6.66732 11.9999 6.66732H7.99992C7.53321 6.66732 7.29985 6.66732 7.12159 6.75814C6.96479 6.83804 6.83731 6.96552 6.75741 7.12232Z",
+							"fill": "currentColor"
+						},
+						"children": []
+					}
+				]
+			}
+		]
+	},
+	"name": "StopCircle"
+}

+ 14 - 0
web/app/components/base/icons/src/vender/solid/mediaAndDevices/StopCircle.tsx

@@ -0,0 +1,14 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './StopCircle.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+export default Icon

+ 2 - 0
web/app/components/base/icons/src/vender/solid/mediaAndDevices/index.ts

@@ -0,0 +1,2 @@
+export { default as Microphone01 } from './Microphone01'
+export { default as StopCircle } from './StopCircle'

+ 10 - 0
web/app/components/base/voice-input/index.module.css

@@ -0,0 +1,10 @@
+/* Gradient backdrop of the voice-input overlay. The component lays an inset
+   panel on top of it, so only a thin gradient frame remains visible. */
+.wrapper {
+  background: linear-gradient(131deg, #2250F2 0%, #0EBCF3 100%);
+  box-shadow: 0px 4px 6px -2px rgba(16, 24, 40, 0.03), 0px 12px 16px -4px rgba(16, 24, 40, 0.08);
+}
+
+/* Gradient-filled text for the "converting" label: the background is clipped
+   to the glyphs and the text color is made transparent so it shows through. */
+.convert {
+  background: linear-gradient(91.92deg, #104AE1 -1.74%, #0098EE 75.74%);
+  /* Prefixed form first: engines that only understand the -webkit- variant
+     would otherwise drop the clip and render transparent (invisible) text. */
+  -webkit-background-clip: text;
+  background-clip: text;
+  color: transparent;
+}

+ 197 - 0
web/app/components/base/voice-input/index.tsx

@@ -0,0 +1,197 @@
+import { useCallback, useEffect, useRef, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import { useParams, usePathname } from 'next/navigation'
+import cn from 'classnames'
+import Recorder from 'js-audio-recorder'
+import { useRafInterval } from 'ahooks'
+import s from './index.module.css'
+import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
+import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
+import { audioToText } from '@/service/share'
+
+type VoiceInputTypes = {
+  onConverted: (text: string) => void
+  onCancel: () => void
+}
+
+// Recording is stopped automatically once this many seconds have elapsed.
+const MAX_RECORD_SECONDS = 120
+// The timer label turns red once the elapsed time exceeds this many seconds.
+const WARNING_SECONDS = 110
+// Horizontal layout of the level meter, in CSS pixels.
+const BAR_WIDTH = 2
+const BAR_STEP = 3
+// Number of analyser samples spread across the meter.
+// NOTE(review): assumes getRecordAnalyseData() yields 1024 byte samples centred on 128 — confirm against js-audio-recorder.
+const ANALYSE_SAMPLE_COUNT = 1024
+
+/**
+ * Voice input overlay.
+ *
+ * Starts recording from the microphone as soon as it mounts, draws a live
+ * level meter on a canvas, and — when the user presses stop or the maximum
+ * duration is reached — uploads the recording as a WAV file to the
+ * audio-to-text endpoint.
+ *
+ * @param onConverted Called with the recognised text ('' when the request fails).
+ * @param onCancel    Called when the overlay should be dismissed: after a
+ *                    conversion finishes, when recording cannot be started,
+ *                    or when the user cancels a running conversion.
+ */
+const VoiceInput = ({
+  onCancel,
+  onConverted,
+}: VoiceInputTypes) => {
+  const { t } = useTranslation()
+  // Lazy initializer so the Recorder is constructed once, not on every render.
+  const [recorder] = useState(() => new Recorder())
+  const canvasRef = useRef<HTMLCanvasElement | null>(null)
+  const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
+  // Canvas size in CSS pixels; the 2D context is scaled by the device pixel ratio,
+  // so all drawing coordinates must be expressed in CSS pixels.
+  const canvasSizeRef = useRef({ width: 0, height: 0 })
+  const drawRecordId = useRef<number | null>(null)
+  // True between a successful start() and the matching stop(); makes stopping idempotent.
+  const isRecordingRef = useRef(false)
+  // False once the component is unmounted or the user cancelled; late async results are then dropped.
+  const isActiveRef = useRef(true)
+  const [originDuration, setOriginDuration] = useState(0)
+  const [startRecord, setStartRecord] = useState(false)
+  const [startConvert, setStartConvert] = useState(false)
+  const pathname = usePathname()
+  const params = useParams()
+  const clearDurationTimer = useRafInterval(() => {
+    setOriginDuration(duration => duration + 1)
+  }, 1000)
+
+  // Cancels the pending animation frame of the level meter, if any.
+  const stopDrawing = useCallback(() => {
+    if (drawRecordId.current !== null)
+      cancelAnimationFrame(drawRecordId.current)
+    drawRecordId.current = null
+  }, [])
+
+  // Draws one frame of the level meter and schedules the next one.
+  const drawRecord = useCallback(() => {
+    const canvas = canvasRef.current
+    const ctx = ctxRef.current
+    if (!canvas || !ctx)
+      return
+
+    drawRecordId.current = requestAnimationFrame(drawRecord)
+
+    const { width, height } = canvasSizeRef.current
+    const dataArray: number[] = Array.from(recorder.getRecordAnalyseData())
+    const lineLength = Math.floor(width / BAR_STEP)
+    // At least one sample per bar: a zero gap would turn every average into NaN.
+    const gap = Math.max(1, Math.floor(ANALYSE_SAMPLE_COUNT / Math.max(1, lineLength)))
+
+    ctx.clearRect(0, 0, canvas.width, canvas.height)
+    ctx.beginPath()
+    for (let i = 0; i < lineLength; i++) {
+      const from = i * gap
+      const sum = dataArray.slice(from, from + gap).reduce((prev, next) => prev + next, 0)
+      // Clamp the averaged sample to [128, 178] and map that range onto the canvas height.
+      const level = Math.min(178, Math.max(128, sum / gap))
+      const barHeight = (level - 128) / 50 * height
+      const x = i * BAR_STEP
+
+      // roundRect is missing in older browsers; fall back to square bars instead of throwing every frame.
+      if (typeof ctx.roundRect === 'function')
+        ctx.roundRect(x, height - barHeight, BAR_WIDTH, barHeight, [1, 1, 0, 0])
+      else
+        ctx.rect(x, height - barHeight, BAR_WIDTH, barHeight)
+    }
+    // Fill once for the whole path instead of refilling the growing path per bar.
+    ctx.fill()
+  }, [recorder])
+
+  // Stops recording, uploads the WAV file and reports the recognised text.
+  const handleStopRecorder = useCallback(async () => {
+    if (!isRecordingRef.current)
+      return
+    isRecordingRef.current = false
+
+    clearDurationTimer()
+    setStartRecord(false)
+    setStartConvert(true)
+    recorder.stop()
+    stopDrawing()
+
+    const canvas = canvasRef.current
+    const ctx = ctxRef.current
+    if (canvas && ctx)
+      ctx.clearRect(0, 0, canvas.width, canvas.height)
+
+    const wavBlob = recorder.getWAVBlob()
+    const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
+    const formData = new FormData()
+    formData.append('file', wavFile)
+
+    let url = ''
+    let isPublic = false
+
+    if (params.token) {
+      url = '/audio-to-text'
+      isPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.includes('explore/installed'))
+        url = `/installed-apps/${params.appId}/audio-to-text`
+      else
+        url = `/apps/${params.appId}/audio-to-text`
+    }
+
+    let text = ''
+    try {
+      const audioResponse = await audioToText(url, isPublic, formData)
+      text = audioResponse.text
+    }
+    catch (e) {
+      // Best effort: a failed conversion is reported as empty text.
+      text = ''
+    }
+
+    // The overlay was cancelled or unmounted while the request was in flight: drop the result.
+    if (!isActiveRef.current)
+      return
+
+    onConverted(text)
+    onCancel()
+  }, [clearDurationTimer, recorder, stopDrawing, params, pathname, onConverted, onCancel])
+
+  // Requests the microphone and starts recording plus the level meter.
+  const handleStartRecord = async () => {
+    try {
+      await recorder.start()
+
+      // Unmounted while waiting for the recorder to start: stop right away.
+      if (!isActiveRef.current) {
+        recorder.stop()
+        return
+      }
+
+      isRecordingRef.current = true
+      setStartRecord(true)
+      setStartConvert(false)
+
+      if (canvasRef.current && ctxRef.current)
+        drawRecord()
+    }
+    catch (e) {
+      if (isActiveRef.current)
+        onCancel()
+    }
+  }
+
+  // Sizes the canvas backing store for the device pixel ratio and prepares the 2D context.
+  const initCanvas = () => {
+    const canvas = canvasRef.current
+    if (!canvas)
+      return
+
+    const dpr = window.devicePixelRatio || 1
+    const { width: cssWidth, height: cssHeight } = canvas.getBoundingClientRect()
+
+    canvas.width = dpr * cssWidth
+    canvas.height = dpr * cssHeight
+    canvasSizeRef.current = { width: cssWidth, height: cssHeight }
+
+    const ctx = canvas.getContext('2d')
+    if (ctx) {
+      ctx.scale(dpr, dpr)
+      ctx.fillStyle = 'rgba(209, 224, 255, 1)'
+      ctxRef.current = ctx
+    }
+  }
+
+  // Cancel button shown while converting: mark the overlay inactive so a late response is ignored.
+  const handleCancel = () => {
+    isActiveRef.current = false
+    onCancel()
+  }
+
+  // Enforce the maximum recording duration from an effect rather than during render.
+  useEffect(() => {
+    if (originDuration >= MAX_RECORD_SECONDS && startRecord)
+      handleStopRecorder()
+  }, [originDuration, startRecord, handleStopRecorder])
+
+  // Mount: set up the canvas and start recording. Unmount: stop the meter and the recorder.
+  useEffect(() => {
+    isActiveRef.current = true
+    initCanvas()
+    handleStartRecord()
+
+    return () => {
+      isActiveRef.current = false
+      stopDrawing()
+      if (isRecordingRef.current) {
+        isRecordingRef.current = false
+        recorder.stop()
+      }
+    }
+  }, [])
+
+  const minutes = Math.floor(originDuration / 60)
+  const seconds = originDuration % 60
+
+  return (
+    <div className={cn(s.wrapper, 'absolute inset-0 rounded-xl')}>
+      <div className='absolute inset-[1.5px] flex items-center pl-[14.5px] pr-[6.5px] py-[14px] bg-primary-25 rounded-[10.5px] overflow-hidden'>
+        <canvas ref={canvasRef} id='voice-input-record' className='absolute left-0 bottom-0 w-full h-4' />
+        {
+          startConvert && <Loading02 className='animate-spin mr-2 w-4 h-4 text-primary-700' />
+        }
+        <div className='grow'>
+          {
+            startRecord && (
+              <div className='text-sm text-gray-500'>
+                {t('common.voiceInput.speaking')}
+              </div>
+            )
+          }
+          {
+            startConvert && (
+              <div className={cn(s.convert, 'text-sm')}>
+                {t('common.voiceInput.converting')}
+              </div>
+            )
+          }
+        </div>
+        {
+          startRecord && (
+            <div
+              className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-primary-100 rounded-lg  cursor-pointer'
+              onClick={handleStopRecorder}
+            >
+              <StopCircle className='w-5 h-5 text-primary-600' />
+            </div>
+          )
+        }
+        {
+          startConvert && (
+            <div
+              className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-gray-200 rounded-lg  cursor-pointer'
+              onClick={handleCancel}
+            >
+              <XClose className='w-4 h-4 text-gray-500' />
+            </div>
+          )
+        }
+        <div className={`w-[45px] pl-1 text-xs font-medium ${originDuration > WARNING_SECONDS ? 'text-[#F04438]' : 'text-gray-700'}`}>{`0${minutes}:${String(seconds).padStart(2, '0')}`}</div>
+      </div>
+    </div>
+  )
+}
+
+export default VoiceInput

+ 1 - 1
web/app/components/explore/installed-app/index.tsx

@@ -29,7 +29,7 @@ const InstalledApp: FC<IInstalledAppProps> = ({
     <div className='h-full p-2'>
       {installedApp?.app.mode === 'chat'
         ? (
-          <ChatApp isInstalledApp installedAppInfo={installedApp}/>
+          <ChatApp isInstalledApp installedAppInfo={installedApp} />
         )
         : (
           <TextGenerationApp isInstalledApp installedAppInfo={installedApp}/>

+ 4 - 1
web/app/components/share/chat/index.tsx

@@ -149,6 +149,7 @@ const Main: FC<IMainProps> = ({
   }
 
   const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState<SuggestedQuestionsAfterAnswerConfig | null>(null)
+  const [speechToTextConfig, setSpeechToTextConfig] = useState<SuggestedQuestionsAfterAnswerConfig | null>(null)
 
   const [conversationIdChangeBecauseOfNew, setConversationIdChangeBecauseOfNew, getConversationIdChangeBecauseOfNew] = useGetState(false)
   const [isChatStarted, { setTrue: setChatStarted, setFalse: setChatNotStarted }] = useBoolean(false)
@@ -326,7 +327,7 @@ const Main: FC<IMainProps> = ({
         const isNotNewConversation = allConversations.some(item => item.id === _conversationId)
         setAllConversationList(allConversations)
         // fetch new conversation info
-        const { user_input_form, opening_statement: introduction, suggested_questions_after_answer }: any = appParams
+        const { user_input_form, opening_statement: introduction, suggested_questions_after_answer, speech_to_text }: any = appParams
         const prompt_variables = userInputsFormToPromptVariables(user_input_form)
         if (siteInfo.default_language)
           changeLanguage(siteInfo.default_language)
@@ -341,6 +342,7 @@ const Main: FC<IMainProps> = ({
           prompt_variables,
         } as PromptConfig)
         setSuggestedQuestionsAfterAnswerConfig(suggested_questions_after_answer)
+        setSpeechToTextConfig(speech_to_text)
 
         // setConversationList(conversations as ConversationItem[])
 
@@ -620,6 +622,7 @@ const Main: FC<IMainProps> = ({
                     controlFocus={controlFocus}
                     isShowSuggestion={doShowSuggestion}
                     suggestionList={suggestQuestions}
+                    isShowSpeechToText={speechToTextConfig?.enabled}
                   />
                 </div>
               </div>)

+ 7 - 1
web/context/debug-configuration.ts

@@ -1,5 +1,5 @@
 import { createContext } from 'use-context-selector'
-import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug'
+import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SpeechToTextConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug'
 import type { DataSet } from '@/models/datasets'
 
 type IDebugConfiguration = {
@@ -19,6 +19,8 @@ type IDebugConfiguration = {
   setMoreLikeThisConfig: (moreLikeThisConfig: MoreLikeThisConfig) => void
   suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig
   setSuggestedQuestionsAfterAnswerConfig: (suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig) => void
+  speechToTextConfig: SpeechToTextConfig
+  setSpeechToTextConfig: (speechToTextConfig: SpeechToTextConfig) => void
   formattingChanged: boolean
   setFormattingChanged: (formattingChanged: boolean) => void
   inputs: Inputs
@@ -59,6 +61,10 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({
     enabled: false,
   },
   setSuggestedQuestionsAfterAnswerConfig: () => { },
+  speechToTextConfig: {
+    enabled: false,
+  },
+  setSpeechToTextConfig: () => { },
   formattingChanged: false,
   setFormattingChanged: () => { },
   inputs: {},

+ 5 - 0
web/i18n/lang/app-debug.en.ts

@@ -46,6 +46,11 @@ const translation = {
       generateNumTip: 'Number of each generated times',
       tip: 'Using this feature will incur additional tokens overhead',
     },
+    speechToText: {
+      title: 'Speech to Text',
+      description: 'Once enabled, you can use voice input.',
+      resDes: 'Voice input is enabled',
+    },
     dataSet: {
       title: 'Context',
       noData: 'You can import datasets as context',

+ 5 - 0
web/i18n/lang/app-debug.zh.ts

@@ -46,6 +46,11 @@ const translation = {
       generateNumTip: '每次生成数',
       tip: '使用此功能将会额外消耗 tokens',
     },
+    speechToText: {
+      title: '语音转文字',
+      description: '启用后,您可以使用语音输入。',
+      resDes: '语音输入已启用',
+    },
     dataSet: {
       title: '上下文',
       noData: '您可以导入数据集作为上下文',

+ 5 - 0
web/i18n/lang/common.en.ts

@@ -225,6 +225,11 @@ const translation = {
     viewDoc: 'View documentation',
     relatedApp: 'linked apps',
   },
+  voiceInput: {
+    speaking: 'Speak now...',
+    converting: 'Converting to text...',
+    notAllow: 'microphone not authorized',
+  },
 }
 
 export default translation

+ 5 - 0
web/i18n/lang/common.zh.ts

@@ -226,6 +226,11 @@ const translation = {
     viewDoc: '查看文档',
     relatedApp: '个关联应用',
   },
+  voiceInput: {
+    speaking: '现在讲...',
+    converting: '正在转换为文本...',
+    notAllow: '麦克风未授权',
+  },
 }
 
 export default translation

+ 5 - 0
web/models/debug.ts

@@ -31,6 +31,8 @@ export type MoreLikeThisConfig = {
 
 export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig
 
+export type SpeechToTextConfig = MoreLikeThisConfig
+
 // frontend use. Not the same as backend
 export type ModelConfig = {
   provider: string // LLM Provider: for example "OPENAI"
@@ -43,6 +45,9 @@ export type ModelConfig = {
   suggested_questions_after_answer: {
     enabled: boolean
   } | null
+  speech_to_text: {
+    enabled: boolean
+  } | null
   dataSets: any[]
 }
 

+ 3 - 0
web/package.json

@@ -48,6 +48,7 @@
     "i18next": "^22.4.13",
     "i18next-resources-to-backend": "^1.1.3",
     "immer": "^9.0.19",
+    "js-audio-recorder": "^1.0.7",
     "js-cookie": "^3.0.1",
     "katex": "^0.16.7",
     "lodash-es": "^4.17.21",
@@ -68,6 +69,7 @@
     "react-tooltip": "5.8.3",
     "react-window": "^1.8.9",
     "react-window-infinite-loader": "^1.0.9",
+    "recordrtc": "^5.6.2",
     "rehype-katex": "^6.0.2",
     "remark-breaks": "^3.0.2",
     "remark-gfm": "^3.0.1",
@@ -88,6 +90,7 @@
     "@types/js-cookie": "^3.0.3",
     "@types/negotiator": "^0.6.1",
     "@types/qs": "^6.9.7",
+    "@types/recordrtc": "^5.6.11",
     "@types/sortablejs": "^1.15.1",
     "eslint-config-next": "^13.4.7",
     "eslint-plugin-react-hooks": "^4.6.0",

+ 18 - 1
web/service/base.ts

@@ -35,7 +35,9 @@ export type IOnError = (msg: string) => void
 
 type IOtherOptions = {
   isPublicAPI?: boolean
+  bodyStringify?: boolean
   needAllResponseContent?: boolean
+  deleteContentType?: boolean
   onData?: IOnData // for stream
   onError?: IOnError
   onCompleted?: IOnCompleted // for stream
@@ -132,7 +134,9 @@ const baseFetch = (
   fetchOptions: any,
   {
     isPublicAPI = false,
+    bodyStringify = true,
     needAllResponseContent,
+    deleteContentType,
   }: IOtherOptions,
 ) => {
   const options = Object.assign({}, baseOptions, fetchOptions)
@@ -141,6 +145,15 @@ const baseFetch = (
     options.headers.set('Authorization', `bearer ${sharedToken}`)
   }
 
+  if (deleteContentType) {
+    options.headers.delete('Content-Type')
+  }
+  else {
+    const contentType = options.headers.get('Content-Type')
+    if (!contentType)
+      options.headers.set('Content-Type', ContentType.json)
+  }
+
   const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX
   let urlWithPrefix = `${urlPrefix}${url.startsWith('/') ? url : `/${url}`}`
 
@@ -160,7 +173,7 @@ const baseFetch = (
     delete options.params
   }
 
-  if (body)
+  if (body && bodyStringify)
     options.body = JSON.stringify(body)
 
   // Handle timeout
@@ -285,6 +298,10 @@ export const ssePost = (url: string, fetchOptions: any, { isPublicAPI = false, o
     signal: abortController.signal,
   }, fetchOptions)
 
+  const contentType = options.headers.get('Content-Type')
+  if (!contentType)
+    options.headers.set('Content-Type', ContentType.json)
+
   getAbortController?.(abortController)
 
   const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX

+ 4 - 0
web/service/share.ts

@@ -114,3 +114,7 @@ export const removeMessage = (messageId: string, isInstalledApp: boolean, instal
 export const fetchSuggestedQuestions = (messageId: string, isInstalledApp: boolean, installedAppId = '') => {
   return (getAction('get', isInstalledApp))(getUrl(`/messages/${messageId}/suggested-questions`, isInstalledApp, installedAppId))
 }
+
+/**
+ * Uploads recorded audio for transcription.
+ *
+ * @param url          Endpoint path of the audio-to-text API.
+ * @param isPublicAPI  Whether the request is made on behalf of a shared (public) app;
+ *                     its negation is passed to `getAction` as the installed-app flag.
+ * @param body         Form data carrying the audio file.
+ * @returns A promise resolving to the recognised text.
+ */
+export const audioToText = (url: string, isPublicAPI: boolean, body: FormData) => {
+  const post = getAction('post', !isPublicAPI)
+  // The form data must go out untouched: skip JSON serialisation of the body and
+  // drop the default Content-Type header.
+  const otherOptions = { bodyStringify: false, deleteContentType: true }
+
+  return post(url, { body }, otherOptions) as Promise<{ text: string }>
+}

+ 3 - 0
web/types/app.ts

@@ -85,6 +85,9 @@ export type ModelConfig = {
   suggested_questions_after_answer: {
     enabled: boolean
   }
+  speech_to_text: {
+    enabled: boolean
+  }
   agent_mode: {
     enabled: boolean
     tools: ToolItem[]

Some files were not shown because too many files changed in this diff