From fcca0afdac3caccbf9de8ced7e1e39aa2cded949 Mon Sep 17 00:00:00 2001
From: Alan <60404530+BloodBoy21@users.noreply.github.com>
Date: Tue, 6 May 2025 09:26:00 -0700
Subject: [PATCH] add input transcription support for live/streaming.

Copybara import of the project:

--
d481e0604a79470e2c1308827b3ecb78bfb5327e by Alan B :

feat: :construction: catch user transcription

--
bba436bb76d1d2f9d5ba969fce38ff8b8a443254 by Alan B :

feat: :sparkles: send user transcription event as llm_response

--
ad2abf540c60895b79c50f9051a6289ce394b98d by Alan B :

style: :lipstick: update lint problems

--
744703c06716300c0f9f41633d3bafdf4cb180a1 by Hangfei Lin :

fix: set right order for input transcription

--
31a5d42d6155b0e5caad0c73c8df43255322016f by Hangfei Lin :

remove print

--
59e5d9c72060f97d124883150989315401a4c1b5 by Hangfei Lin :

remove api version

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/adk-python/pull/495 from BloodBoy21:main ea29015af041f9785abaa8583e2c767f9d8c8bc8
PiperOrigin-RevId: 755401615
---
 pyproject.toml                                  |  2 +-
 src/google/adk/agents/run_config.py             |  3 +++
 src/google/adk/flows/llm_flows/base_llm_flow.py | 12 +++++++++++-
 src/google/adk/flows/llm_flows/basic.py         |  3 +++
 src/google/adk/models/gemini_llm_connection.py  | 15 ++++++++++++++-
 5 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f49482d..2ed9d63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
   "google-cloud-secret-manager>=2.22.0", # Fetching secrets in RestAPI Tool
   "google-cloud-speech>=2.30.0", # For Audo Transcription
   "google-cloud-storage>=2.18.0, <3.0.0", # For GCS Artifact service
-  "google-genai>=1.11.0", # Google GenAI SDK
+  "google-genai>=1.12.1", # Google GenAI SDK
   "graphviz>=0.20.2", # Graphviz for graph rendering
   "mcp>=1.5.0;python_version>='3.10'", # For MCP Toolset
   "opentelemetry-api>=1.31.0", # OpenTelemetry
diff --git a/src/google/adk/agents/run_config.py b/src/google/adk/agents/run_config.py
index e121750..f19ae0f 100644
--- a/src/google/adk/agents/run_config.py
+++ b/src/google/adk/agents/run_config.py
@@ -65,6 +65,9 @@ class RunConfig(BaseModel):
   output_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
   """Output transcription for live agents with audio response."""
 
+  input_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
+  """Input transcription for live agents with audio input from user."""
+
   max_llm_calls: int = 500
   """
   A limit on the total number of llm calls for a given run.
diff --git a/src/google/adk/flows/llm_flows/base_llm_flow.py b/src/google/adk/flows/llm_flows/base_llm_flow.py
index 188f3a5..31904e3 100644
--- a/src/google/adk/flows/llm_flows/base_llm_flow.py
+++ b/src/google/adk/flows/llm_flows/base_llm_flow.py
@@ -190,6 +190,16 @@ class BaseLlmFlow(ABC):
       llm_request: LlmRequest,
   ) -> AsyncGenerator[Event, None]:
     """Receive data from model and process events using BaseLlmConnection."""
+    def get_author(llm_response):
+      """Get the author of the event.
+
+      When the model returns transcription, the author is "user". Otherwise, the author is the agent.
+      """
+      if llm_response and llm_response.content and llm_response.content.role == "user":
+        return "user"
+      else:
+        return invocation_context.agent.name
+
     assert invocation_context.live_request_queue
     try:
       while True:
@@ -197,7 +207,7 @@
           model_response_event = Event(
               id=Event.new_id(),
               invocation_id=invocation_context.invocation_id,
-              author=invocation_context.agent.name,
+              author=get_author(llm_response),
           )
           async for event in self._postprocess_live(
               invocation_context,
diff --git a/src/google/adk/flows/llm_flows/basic.py b/src/google/adk/flows/llm_flows/basic.py
index 278b4cf..d48c8cd 100644
--- a/src/google/adk/flows/llm_flows/basic.py
+++ b/src/google/adk/flows/llm_flows/basic.py
@@ -62,6 +62,9 @@ class _BasicLlmRequestProcessor(BaseLlmRequestProcessor):
     llm_request.live_connect_config.output_audio_transcription = (
         invocation_context.run_config.output_audio_transcription
     )
+    llm_request.live_connect_config.input_audio_transcription = (
+        invocation_context.run_config.input_audio_transcription
+    )
 
     # TODO: handle tool append here, instead of in BaseTool.process_llm_request.
 
diff --git a/src/google/adk/models/gemini_llm_connection.py b/src/google/adk/models/gemini_llm_connection.py
index 30f1fb2..4018975 100644
--- a/src/google/adk/models/gemini_llm_connection.py
+++ b/src/google/adk/models/gemini_llm_connection.py
@@ -145,7 +145,20 @@ class GeminiLlmConnection(BaseLlmConnection):
             yield self.__build_full_text_response(text)
             text = ''
           yield llm_response
-
+        if (
+            message.server_content.input_transcription
+            and message.server_content.input_transcription.text
+        ):
+          user_text = message.server_content.input_transcription.text
+          parts = [
+              types.Part.from_text(
+                  text=user_text,
+              )
+          ]
+          llm_response = LlmResponse(
+              content=types.Content(role='user', parts=parts)
+          )
+          yield llm_response
         if (
            message.server_content.output_transcription
            and message.server_content.output_transcription.text