Add input transcription support for live/streaming.

Copybara import of the project:

--
d481e0604a79470e2c1308827b3ecb78bfb5327e by Alan B <alan@nerds.ai>:

feat: 🚧 catch user transcription

--
bba436bb76d1d2f9d5ba969fce38ff8b8a443254 by Alan B <alan@nerds.ai>:

feat: send user transcription event as llm_response

--
ad2abf540c60895b79c50f9051a6289ce394b98d by Alan B <death1027@outlook.com>:

style: 💄 update lint problems

--
744703c06716300c0f9f41633d3bafdf4cb180a1 by Hangfei Lin <hangfeilin@gmail.com>:

fix: set right order for input transcription

--
31a5d42d6155b0e5caad0c73c8df43255322016f by Hangfei Lin <hangfeilin@gmail.com>:

remove print

--
59e5d9c72060f97d124883150989315401a4c1b5 by Hangfei Lin <hangfeilin@gmail.com>:

remove api version

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/adk-python/pull/495 from BloodBoy21:main ea29015af041f9785abaa8583e2c767f9d8c8bc8
PiperOrigin-RevId: 755401615
Alan 2025-05-06 09:26:00 -07:00 committed by Copybara-Service
parent 905c20dad6
commit fcca0afdac
5 changed files with 32 additions and 3 deletions

View File

@@ -33,7 +33,7 @@ dependencies = [
   "google-cloud-secret-manager>=2.22.0", # Fetching secrets in RestAPI Tool
   "google-cloud-speech>=2.30.0", # For Audio Transcription
   "google-cloud-storage>=2.18.0, <3.0.0", # For GCS Artifact service
-  "google-genai>=1.11.0", # Google GenAI SDK
+  "google-genai>=1.12.1", # Google GenAI SDK
   "graphviz>=0.20.2", # Graphviz for graph rendering
   "mcp>=1.5.0;python_version>='3.10'", # For MCP Toolset
   "opentelemetry-api>=1.31.0", # OpenTelemetry

View File

@@ -65,6 +65,9 @@ class RunConfig(BaseModel):
   output_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
   """Output transcription for live agents with audio response."""
 
+  input_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
+  """Input transcription for live agents with audio input from user."""
+
   max_llm_calls: int = 500
   """
   A limit on the total number of llm calls for a given run.
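
For context, a minimal sketch of turning the new field on from user code. The RunConfig import path is an assumption based on the class name above; AudioTranscriptionConfig is the google-genai type the diff references:

    # Sketch: enable input audio transcription for a live/streaming run.
    # The RunConfig module path below is an assumption, not confirmed by the diff.
    from google.adk.agents.run_config import RunConfig
    from google.genai import types

    run_config = RunConfig(
        # New in this change: transcribe the user's audio input.
        input_audio_transcription=types.AudioTranscriptionConfig(),
        # Already supported: transcribe the agent's audio output.
        output_audio_transcription=types.AudioTranscriptionConfig(),
    )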

View File

@@ -190,6 +190,16 @@ class BaseLlmFlow(ABC):
       llm_request: LlmRequest,
   ) -> AsyncGenerator[Event, None]:
     """Receive data from model and process events using BaseLlmConnection."""
+
+    def get_author(llm_response):
+      """Get the author of the event.
+
+      When the model returns transcription, the author is "user". Otherwise, the author is the agent.
+      """
+      if llm_response and llm_response.content and llm_response.content.role == "user":
+        return "user"
+      else:
+        return invocation_context.agent.name
     assert invocation_context.live_request_queue
     try:
       while True:
@@ -197,7 +207,7 @@
           model_response_event = Event(
               id=Event.new_id(),
               invocation_id=invocation_context.invocation_id,
-              author=invocation_context.agent.name,
+              author=get_author(llm_response),
           )
           async for event in self._postprocess_live(
               invocation_context,
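
The rule get_author encodes is small enough to check in isolation: an LlmResponse whose content role is "user" (an input transcription) is attributed to the user; everything else is attributed to the agent. A runnable sketch with stand-in objects (SimpleNamespace replaces the real LlmResponse/Content types):

    # Stand-in objects; the real code receives ADK LlmResponse instances.
    from types import SimpleNamespace

    AGENT_NAME = "my_agent"  # stands in for invocation_context.agent.name

    def get_author(llm_response):
        # "user" only when the model echoed back a user transcription.
        if llm_response and llm_response.content and llm_response.content.role == "user":
            return "user"
        return AGENT_NAME

    transcription = SimpleNamespace(content=SimpleNamespace(role="user"))
    model_reply = SimpleNamespace(content=SimpleNamespace(role="model"))
    assert get_author(transcription) == "user"
    assert get_author(model_reply) == AGENT_NAME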

View File

@@ -62,6 +62,9 @@ class _BasicLlmRequestProcessor(BaseLlmRequestProcessor):
     llm_request.live_connect_config.output_audio_transcription = (
         invocation_context.run_config.output_audio_transcription
     )
+    llm_request.live_connect_config.input_audio_transcription = (
+        invocation_context.run_config.input_audio_transcription
+    )
     # TODO: handle tool append here, instead of in BaseTool.process_llm_request.
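
This hunk is a straight pass-through: the new RunConfig field is copied onto the live connect config the same way the output setting already was, so the live API sees both. A sketch of the resulting wiring with placeholder objects in place of the real LlmRequest and InvocationContext:

    from types import SimpleNamespace

    run_config = SimpleNamespace(
        input_audio_transcription="<input AudioTranscriptionConfig>",
        output_audio_transcription="<output AudioTranscriptionConfig>",
    )
    live_connect_config = SimpleNamespace()

    # Mirrors the two assignments in the diff above.
    live_connect_config.output_audio_transcription = run_config.output_audio_transcription
    live_connect_config.input_audio_transcription = run_config.input_audio_transcription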

View File

@@ -145,7 +145,20 @@ class GeminiLlmConnection(BaseLlmConnection):
             yield self.__build_full_text_response(text)
             text = ''
           yield llm_response
+        if (
+            message.server_content.input_transcription
+            and message.server_content.input_transcription.text
+        ):
+          user_text = message.server_content.input_transcription.text
+          parts = [
+              types.Part.from_text(
+                  text=user_text,
+              )
+          ]
+          llm_response = LlmResponse(
+              content=types.Content(role='user', parts=parts)
+          )
+          yield llm_response
         if (
             message.server_content.output_transcription
             and message.server_content.output_transcription.text
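
Downstream consumers can now tell user speech from model output purely by content role: input transcriptions arrive as LlmResponse objects with role 'user', which is exactly what the get_author change in base_llm_flow keys on. A runnable sketch of that consumption pattern (fake_receive is a hypothetical stand-in for the live connection's receive loop):

    import asyncio
    from types import SimpleNamespace

    async def fake_receive():
        # Stand-in for the live connection; yields LlmResponse-shaped objects.
        yield SimpleNamespace(content=SimpleNamespace(role="user", parts=["hi there"]))
        yield SimpleNamespace(content=SimpleNamespace(role="model", parts=["Hello!"]))

    async def consume():
        async for llm_response in fake_receive():
            role = llm_response.content.role if llm_response.content else None
            if role == "user":
                print("user transcript:", llm_response.content.parts)
            else:
                print("model response:", llm_response.content.parts)

    asyncio.run(consume())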