Add input transcription support for live/streaming.

Copybara import of the project:

--
d481e0604a79470e2c1308827b3ecb78bfb5327e by Alan B <alan@nerds.ai>:

feat: 🚧 catch user transcription

--
bba436bb76d1d2f9d5ba969fce38ff8b8a443254 by Alan B <alan@nerds.ai>:

feat: send user transcription event as llm_response

--
ad2abf540c60895b79c50f9051a6289ce394b98d by Alan B <death1027@outlook.com>:

style: 💄 update lint problems

--
744703c06716300c0f9f41633d3bafdf4cb180a1 by Hangfei Lin <hangfeilin@gmail.com>:

fix: set right order for input transcription

--
31a5d42d6155b0e5caad0c73c8df43255322016f by Hangfei Lin <hangfeilin@gmail.com>:

remove print

--
59e5d9c72060f97d124883150989315401a4c1b5 by Hangfei Lin <hangfeilin@gmail.com>:

remove api version

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/adk-python/pull/495 from BloodBoy21:main ea29015af041f9785abaa8583e2c767f9d8c8bc8
PiperOrigin-RevId: 755401615
Alan 2025-05-06 09:26:00 -07:00 committed by Copybara-Service
parent 905c20dad6
commit fcca0afdac
5 changed files with 32 additions and 3 deletions

View File

@@ -33,7 +33,7 @@ dependencies = [
   "google-cloud-secret-manager>=2.22.0", # Fetching secrets in RestAPI Tool
   "google-cloud-speech>=2.30.0", # For Audio Transcription
   "google-cloud-storage>=2.18.0, <3.0.0", # For GCS Artifact service
-  "google-genai>=1.11.0", # Google GenAI SDK
+  "google-genai>=1.12.1", # Google GenAI SDK
   "graphviz>=0.20.2", # Graphviz for graph rendering
   "mcp>=1.5.0;python_version>='3.10'", # For MCP Toolset
   "opentelemetry-api>=1.31.0", # OpenTelemetry

View File

@@ -65,6 +65,9 @@ class RunConfig(BaseModel):
   output_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
   """Output transcription for live agents with audio response."""
 
+  input_audio_transcription: Optional[types.AudioTranscriptionConfig] = None
+  """Input transcription for live agents with audio input from user."""
+
   max_llm_calls: int = 500
   """
   A limit on the total number of llm calls for a given run.
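
For context, a minimal sketch of turning the new field on from user code. The RunConfig import path is an assumption based on the class name above; AudioTranscriptionConfig is the google-genai type the diff references:

    # Sketch: enable input audio transcription for a live/streaming run.
    # The RunConfig module path below is an assumption, not confirmed by the diff.
    from google.adk.agents.run_config import RunConfig
    from google.genai import types

    run_config = RunConfig(
        # New in this change: transcribe the user's audio input.
        input_audio_transcription=types.AudioTranscriptionConfig(),
        # Already supported: transcribe the agent's audio output.
        output_audio_transcription=types.AudioTranscriptionConfig(),
    )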

View File

@@ -190,6 +190,16 @@ class BaseLlmFlow(ABC):
       llm_request: LlmRequest,
   ) -> AsyncGenerator[Event, None]:
     """Receive data from model and process events using BaseLlmConnection."""
+
+    def get_author(llm_response):
+      """Get the author of the event.
+
+      When the model returns transcription, the author is "user". Otherwise, the author is the agent.
+      """
+      if llm_response and llm_response.content and llm_response.content.role == "user":
+        return "user"
+      else:
+        return invocation_context.agent.name
     assert invocation_context.live_request_queue
     try:
       while True:
@@ -197,7 +207,7 @@
           model_response_event = Event(
               id=Event.new_id(),
               invocation_id=invocation_context.invocation_id,
-              author=invocation_context.agent.name,
+              author=get_author(llm_response),
           )
           async for event in self._postprocess_live(
               invocation_context,
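
The rule get_author encodes is small enough to check in isolation: an LlmResponse whose content role is "user" (an input transcription) is attributed to the user; everything else is attributed to the agent. A runnable sketch with stand-in objects (SimpleNamespace replaces the real LlmResponse/Content types):

    # Stand-in objects; the real code receives ADK LlmResponse instances.
    from types import SimpleNamespace

    AGENT_NAME = "my_agent"  # stands in for invocation_context.agent.name

    def get_author(llm_response):
        # "user" only when the model echoed back a user transcription.
        if llm_response and llm_response.content and llm_response.content.role == "user":
            return "user"
        return AGENT_NAME

    transcription = SimpleNamespace(content=SimpleNamespace(role="user"))
    model_reply = SimpleNamespace(content=SimpleNamespace(role="model"))
    assert get_author(transcription) == "user"
    assert get_author(model_reply) == AGENT_NAME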

View File

@@ -62,6 +62,9 @@ class _BasicLlmRequestProcessor(BaseLlmRequestProcessor):
     llm_request.live_connect_config.output_audio_transcription = (
         invocation_context.run_config.output_audio_transcription
     )
+    llm_request.live_connect_config.input_audio_transcription = (
+        invocation_context.run_config.input_audio_transcription
+    )
     # TODO: handle tool append here, instead of in BaseTool.process_llm_request.
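
This hunk is a straight pass-through: the new RunConfig field is copied onto the live connect config the same way the output setting already was, so the live API sees both. A sketch of the resulting wiring with placeholder objects in place of the real LlmRequest and InvocationContext:

    from types import SimpleNamespace

    run_config = SimpleNamespace(
        input_audio_transcription="<input AudioTranscriptionConfig>",
        output_audio_transcription="<output AudioTranscriptionConfig>",
    )
    live_connect_config = SimpleNamespace()

    # Mirrors the two assignments in the diff above.
    live_connect_config.output_audio_transcription = run_config.output_audio_transcription
    live_connect_config.input_audio_transcription = run_config.input_audio_transcription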

View File

@@ -145,7 +145,20 @@ class GeminiLlmConnection(BaseLlmConnection):
             yield self.__build_full_text_response(text)
             text = ''
           yield llm_response
+        if (
+            message.server_content.input_transcription
+            and message.server_content.input_transcription.text
+        ):
+          user_text = message.server_content.input_transcription.text
+          parts = [
+              types.Part.from_text(
+                  text=user_text,
+              )
+          ]
+          llm_response = LlmResponse(
+              content=types.Content(role='user', parts=parts)
+          )
+          yield llm_response
         if (
             message.server_content.output_transcription
             and message.server_content.output_transcription.text
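
Downstream consumers can now tell user speech from model output purely by content role: input transcriptions arrive as LlmResponse objects with role 'user', which is exactly what the get_author change in base_llm_flow keys on. A runnable sketch of that consumption pattern (fake_receive is a hypothetical stand-in for the live connection's receive loop):

    import asyncio
    from types import SimpleNamespace

    async def fake_receive():
        # Stand-in for the live connection; yields LlmResponse-shaped objects.
        yield SimpleNamespace(content=SimpleNamespace(role="user", parts=["hi there"]))
        yield SimpleNamespace(content=SimpleNamespace(role="model", parts=["Hello!"]))

    async def consume():
        async for llm_response in fake_receive():
            role = llm_response.content.role if llm_response.content else None
            if role == "user":
                print("user transcript:", llm_response.content.parts)
            else:
                print("model response:", llm_response.content.parts)

    asyncio.run(consume())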