|  | @@ -1,11 +1,7 @@
 | 
	
		
			
				|  |  |  import concurrent.futures
 | 
	
		
			
				|  |  | -from functools import reduce
 | 
	
		
			
				|  |  | -from io import BytesIO
 | 
	
		
			
				|  |  |  from typing import Optional
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -from flask import Response
 | 
	
		
			
				|  |  |  from openai import OpenAI
 | 
	
		
			
				|  |  | -from pydub import AudioSegment
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  from core.model_runtime.errors.invoke import InvokeBadRequestError
 | 
	
		
			
				|  |  |  from core.model_runtime.errors.validate import CredentialsValidateFailedError
 | 
	
	
		
			
				|  | @@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 | 
	
		
			
				|  |  |          :return: text translated to audio file
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
 | 
	
		
			
				|  |  | +        if not voice or voice not in [d['value'] for d in
 | 
	
		
			
				|  |  | +                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
 | 
	
		
			
				|  |  |              voice = self._get_model_default_voice(model, credentials)
 | 
	
		
			
				|  |  |          # if streaming:
 | 
	
		
			
				|  |  |          return self._tts_invoke_streaming(model=model,
 | 
	
	
		
			
				|  | @@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 | 
	
		
			
				|  |  |          :return: text translated to audio file
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          try:
 | 
	
		
			
				|  |  | -            self._tts_invoke(
 | 
	
		
			
				|  |  | +            self._tts_invoke_streaming(
 | 
	
		
			
				|  |  |                  model=model,
 | 
	
		
			
				|  |  |                  credentials=credentials,
 | 
	
		
			
				|  |  |                  content_text='Hello Dify!',
 | 
	
	
		
			
				|  | @@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 | 
	
		
			
				|  |  |          except Exception as ex:
 | 
	
		
			
				|  |  |              raise CredentialsValidateFailedError(str(ex))
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
 | 
	
		
			
				|  |  | -        """
 | 
	
		
			
				|  |  | -        _tts_invoke text2speech model
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -        :param model: model name
 | 
	
		
			
				|  |  | -        :param credentials: model credentials
 | 
	
		
			
				|  |  | -        :param content_text: text content to be translated
 | 
	
		
			
				|  |  | -        :param voice: model timbre
 | 
	
		
			
				|  |  | -        :return: text translated to audio file
 | 
	
		
			
				|  |  | -        """
 | 
	
		
			
				|  |  | -        audio_type = self._get_model_audio_type(model, credentials)
 | 
	
		
			
				|  |  | -        word_limit = self._get_model_word_limit(model, credentials)
 | 
	
		
			
				|  |  | -        max_workers = self._get_model_workers_limit(model, credentials)
 | 
	
		
			
				|  |  | -        try:
 | 
	
		
			
				|  |  | -            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
 | 
	
		
			
				|  |  | -            audio_bytes_list = []
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            # Create a thread pool and map the function to the list of sentences
 | 
	
		
			
				|  |  | -            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 | 
	
		
			
				|  |  | -                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
 | 
	
		
			
				|  |  | -                                           credentials=credentials) for sentence in sentences]
 | 
	
		
			
				|  |  | -                for future in futures:
 | 
	
		
			
				|  |  | -                    try:
 | 
	
		
			
				|  |  | -                        if future.result():
 | 
	
		
			
				|  |  | -                            audio_bytes_list.append(future.result())
 | 
	
		
			
				|  |  | -                    except Exception as ex:
 | 
	
		
			
				|  |  | -                        raise InvokeBadRequestError(str(ex))
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            if len(audio_bytes_list) > 0:
 | 
	
		
			
				|  |  | -                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
 | 
	
		
			
				|  |  | -                                  audio_bytes_list if audio_bytes]
 | 
	
		
			
				|  |  | -                combined_segment = reduce(lambda x, y: x + y, audio_segments)
 | 
	
		
			
				|  |  | -                buffer: BytesIO = BytesIO()
 | 
	
		
			
				|  |  | -                combined_segment.export(buffer, format=audio_type)
 | 
	
		
			
				|  |  | -                buffer.seek(0)
 | 
	
		
			
				|  |  | -                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
 | 
	
		
			
				|  |  | -        except Exception as ex:
 | 
	
		
			
				|  |  | -            raise InvokeBadRequestError(str(ex))
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  |      def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
 | 
	
		
			
				|  |  |                                voice: str) -> any:
 | 
	
		
			
				|  |  |          """
 | 
	
	
		
			
				|  | @@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 | 
	
		
			
				|  |  |              # doc: https://platform.openai.com/docs/guides/text-to-speech
 | 
	
		
			
				|  |  |              credentials_kwargs = self._to_credential_kwargs(credentials)
 | 
	
		
			
				|  |  |              client = OpenAI(**credentials_kwargs)
 | 
	
		
			
				|  |  | -            model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
 | 
	
		
			
				|  |  | +            model_support_voice = [x.get("value") for x in
 | 
	
		
			
				|  |  | +                                   self.get_tts_model_voices(model=model, credentials=credentials)]
 | 
	
		
			
				|  |  |              if not voice or voice not in model_support_voice:
 | 
	
		
			
				|  |  |                  voice = self._get_model_default_voice(model, credentials)
 | 
	
		
			
				|  |  |              word_limit = self._get_model_word_limit(model, credentials)
 |