@@ -322,8 +322,11 @@ class AzureOpenAILargeLanguageModel(_CommonAzureOpenAI, LargeLanguageModel):
                                                response: Stream[ChatCompletionChunk],
                                                prompt_messages: list[PromptMessage],
                                                tools: Optional[list[PromptMessageTool]] = None) -> Generator:
-
+        index = 0
         full_assistant_content = ''
+        real_model = model
+        system_fingerprint = None
+        completion = ''
         for chunk in response:
             if len(chunk.choices) == 0:
                 continue
@@ -349,40 +352,44 @@ class AzureOpenAILargeLanguageModel(_CommonAzureOpenAI, LargeLanguageModel):
 
             full_assistant_content += delta.delta.content if delta.delta.content else ''
 
-            if delta.finish_reason is not None:
-                # calculate num tokens
-                prompt_tokens = self._num_tokens_from_messages(credentials, prompt_messages, tools)
+            real_model = chunk.model
+            system_fingerprint = chunk.system_fingerprint
+            completion += delta.delta.content if delta.delta.content else ''
 
-                full_assistant_prompt_message = AssistantPromptMessage(
-                    content=full_assistant_content,
-                    tool_calls=tool_calls
+            yield LLMResultChunk(
+                model=real_model,
+                prompt_messages=prompt_messages,
+                system_fingerprint=system_fingerprint,
+                delta=LLMResultChunkDelta(
+                    index=index,
+                    message=assistant_prompt_message,
                 )
-                completion_tokens = self._num_tokens_from_messages(credentials, [full_assistant_prompt_message])
+            )
 
-                # transform usage
-                usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
+            index += 1
 
-                yield LLMResultChunk(
-                    model=chunk.model,
-                    prompt_messages=prompt_messages,
-                    system_fingerprint=chunk.system_fingerprint,
-                    delta=LLMResultChunkDelta(
-                        index=delta.index,
-                        message=assistant_prompt_message,
-                        finish_reason=delta.finish_reason,
-                        usage=usage
-                    )
-                )
-            else:
-                yield LLMResultChunk(
-                    model=chunk.model,
-                    prompt_messages=prompt_messages,
-                    system_fingerprint=chunk.system_fingerprint,
-                    delta=LLMResultChunkDelta(
-                        index=delta.index,
-                        message=assistant_prompt_message,
-                    )
-                )
+        # calculate num tokens
+        prompt_tokens = self._num_tokens_from_messages(credentials, prompt_messages, tools)
+
+        full_assistant_prompt_message = AssistantPromptMessage(
+            content=completion
+        )
+        completion_tokens = self._num_tokens_from_messages(credentials, [full_assistant_prompt_message])
+
+        # transform usage
+        usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
+
+        yield LLMResultChunk(
+            model=real_model,
+            prompt_messages=prompt_messages,
+            system_fingerprint=system_fingerprint,
+            delta=LLMResultChunkDelta(
+                index=index,
+                message=AssistantPromptMessage(content=''),
+                finish_reason='stop',
+                usage=usage
+            )
+        )
 
     @staticmethod
     def _extract_response_tool_calls(response_tool_calls: list[ChatCompletionMessageToolCall | ChoiceDeltaToolCall]) \
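
For context, a minimal sketch of how a caller might drain the refactored generator. It assumes only the LLMResultChunk fields used in the diff above (delta.message.content, delta.finish_reason, delta.usage); the collect_stream helper itself is hypothetical and not part of this change:

def collect_stream(chunks):
    # Accumulate streamed text; with this change, usage arrives only on
    # the final 'stop' chunk yielded after the loop, not on every delta.
    full_text, usage = '', None
    for chunk in chunks:
        full_text += chunk.delta.message.content or ''
        if chunk.delta.finish_reason is not None:
            usage = chunk.delta.usage  # populated on the final chunk only
    return full_text, usage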