Voice

Real-time speech-to-text in the chat composer. The user speaks, the runtime transcribes, the agent runs the resulting prompt.

"""LlamaIndex AG-UI AgentUses llama-index-protocols-ag-ui to expose a LlamaIndex workflow as anAG-UI compatible FastAPI router. The router handles all four demoscenarios (agentic-chat, tool-rendering, hitl, gen-ui-tool-based) througha single endpoint since LlamaIndex's get_ag_ui_workflow_router buildsthe full AG-UI protocol surface automatically.NOTE: Uses FixedAGUIChatWorkflow from hitl_in_chat_agent to fix threeupstream library bugs (duplicate tool-call rendering, missingparent_message_id, and incorrect tool-result message roles). Seehitl_in_chat_agent.py module docstring for details."""import jsonimport osfrom typing import Annotatedfrom llama_index.llms.openai import OpenAIfrom llama_index.protocols.ag_ui.router import get_ag_ui_workflow_routerfrom agents.hitl_in_chat_agent import FixedAGUIChatWorkflow# Import shared tool implementationsfrom tools import (    get_weather_impl,    query_data_impl,    manage_sales_todos_impl,    get_sales_todos_impl,    schedule_meeting_impl,    search_flights_impl,    build_a2ui_operations_from_tool_call,)# --- Frontend tools (executed client-side, agent just returns a confirmation) ---def change_background(    background: Annotated[str, "CSS background value. 
Prefer gradients."],) -> str:    """Change the background color/gradient of the chat area."""    return f"Background changed to {background}"def generate_haiku(    japanese: Annotated[list[str], "3 lines of haiku in Japanese"],    english: Annotated[list[str], "3 lines of haiku translated to English"],    image_name: Annotated[str, "One relevant image name from the valid set"],    gradient: Annotated[str, "CSS Gradient color for the background"],) -> str:    """Generate a haiku with Japanese text, English translation, and a background image."""    return "Haiku generated!"def generate_task_steps(    steps: Annotated[        list[dict],        "Array of step objects with 'description' (string) and 'status' ('enabled' or 'disabled')",    ],) -> str:    """Generate a list of task steps for the user to review and approve."""    return f"Generated {len(steps)} steps for review"def book_call(    topic: Annotated[str, "What the call is about (e.g. 'Intro with sales')"],    attendee: Annotated[str, "Who the call is with (e.g. 'Alice from Sales')"],) -> str:    """Ask the user to pick a time slot for a call. The picker UI presents fixed candidate slots; the user's choice is returned to the agent."""    return f"Booking call about {topic} with {attendee}"def show_card(    title: Annotated[str, "Short heading for the card."],    body: Annotated[str, "Body text for the card."],) -> str:    """Display a titled card with a short body of text. Rendered on the frontend via useComponent."""    return f"Displayed card: {title}"# --- Backend tools (executed server-side, using shared implementations) ---async def get_weather(    location: Annotated[str, "The location to get the weather for."],) -> str:    """Get the weather for a given location. 
Returns temperature, conditions, humidity, wind speed, and feels-like temperature."""    return json.dumps(get_weather_impl(location))async def query_data(    query: Annotated[str, "Natural language query for financial data."],) -> str:    """Query financial database for chart data. Always call before showing a chart or graph."""    return json.dumps(query_data_impl(query))async def manage_sales_todos(    todos: Annotated[        list[dict], "Complete list of sales todos to replace the current list."    ],) -> str:    """Manage the sales pipeline by replacing the entire list of todos."""    result = manage_sales_todos_impl(todos)    return json.dumps(        {"status": "updated", "count": len(result), "todos": [dict(t) for t in result]}    )async def get_sales_todos_tool() -> str:    """Get the current sales pipeline todos."""    return json.dumps(get_sales_todos_impl(None))async def schedule_meeting(    reason: Annotated[str, "Reason for the meeting."],) -> str:    """Schedule a meeting with the user. Requires human approval."""    return json.dumps(schedule_meeting_impl(reason))async def search_flights(    flights: Annotated[        list[dict],        "List of flight objects to search and display as rich cards. Return exactly 2 flights.",    ],) -> str:    """Search for flights and display the results as rich A2UI cards.    Each flight must have: airline, airlineLogo, flightNumber, origin, destination,    date, departureTime, arrivalTime, duration, status, statusColor, price, currency.    """    result = search_flights_impl(flights)    return json.dumps(result)async def generate_a2ui(    context: Annotated[str, "Conversation context to generate UI from."],) -> str:    """Generate dynamic A2UI components based on the conversation.    A secondary LLM designs the UI schema and data. The result is    returned as an a2ui_operations container for the middleware to detect.    
"""    from openai import OpenAI    client = OpenAI()    tool_schema = {        "type": "function",        "function": {            "name": "render_a2ui",            "description": "Render a dynamic A2UI v0.9 surface.",            "parameters": {                "type": "object",                "properties": {                    "surfaceId": {"type": "string"},                    "catalogId": {"type": "string"},                    "components": {"type": "array", "items": {"type": "object"}},                    "data": {"type": "object"},                },                "required": ["surfaceId", "catalogId", "components"],            },        },    }    response = client.chat.completions.create(        model="gpt-4.1",        messages=[            {"role": "system", "content": context or "Generate a useful dashboard UI."},            {                "role": "user",                "content": "Generate a dynamic A2UI dashboard based on the conversation.",            },        ],        tools=[tool_schema],        tool_choice={"type": "function", "function": {"name": "render_a2ui"}},    )    if not response.choices[0].message.tool_calls:        return json.dumps({"error": "LLM did not call render_a2ui"})    tool_call = response.choices[0].message.tool_calls[0]    args = json.loads(tool_call.function.arguments)    result = build_a2ui_operations_from_tool_call(args)    return json.dumps(result)_openai_kwargs = {}if os.environ.get("OPENAI_BASE_URL"):    _openai_kwargs["api_base"] = os.environ["OPENAI_BASE_URL"]_AGENT_SYSTEM_PROMPT = (    "You are a polished, professional demo assistant for CopilotKit. 
"    "Keep responses brief and clear -- 1 to 2 sentences max.\n\n"    "You can:\n"    "- Chat naturally with the user\n"    "- Change the UI background when asked (via frontend tool)\n"    "- Query data and render charts (via query_data tool)\n"    "- Get weather information (via get_weather tool)\n"    "- Schedule meetings with the user (via schedule_meeting tool)\n"    "- Manage sales pipeline todos (via manage_sales_todos / get_sales_todos tools)\n"    "- Search flights and display rich A2UI cards (via search_flights tool)\n"    "- Generate dynamic A2UI dashboards from conversation context (via generate_a2ui tool)\n"    "- Generate step-by-step plans for user review (human-in-the-loop)\n"    "- Book calls with people (via book_call frontend tool)\n"    "- Show titled cards with a body of text (via show_card frontend tool)\n"    "When asked about weather, always use the get_weather tool. "    "When asked about financial data or charts, use query_data first. "    "When asked to book a call, use the book_call tool with topic and name.")async def _agent_workflow_factory():    wf = FixedAGUIChatWorkflow(        llm=OpenAI(model="gpt-4.1", **_openai_kwargs),        frontend_tools=[            change_background,            generate_haiku,            generate_task_steps,            book_call,            show_card,            get_weather,        ],        backend_tools=[            query_data,            manage_sales_todos,            get_sales_todos_tool,            schedule_meeting,            search_flights,            generate_a2ui,        ],        system_prompt=_AGENT_SYSTEM_PROMPT,        initial_state={            "todos": [],        },    )    # Tools that use useRenderTool on the frontend — emit    # TOOL_CALL_RESULT so the render transitions to "complete".    wf.render_only_tool_names = {"get_weather"}    return wfagent_router = get_ag_ui_workflow_router(    workflow_factory=_agent_workflow_factory,)

You have a working chat surface and you want users to be able to speak instead of type. By the end of this guide, the chat composer will sprout a mic button, recorded audio will be transcribed by the runtime, and the transcript will land in the composer, ready to send to the agent like any other message.

When to use this

Hands-free or accessibility flows where typing isn't the right input modality.
Mobile or kiosk surfaces where a long voice query is faster than thumb-typing.
Demo and test loops where you want canned audio to drive the chat without a microphone.

If you only need file uploads (audio, images, video, documents), use Multimodal Attachments instead. Voice is specifically about live transcription of recorded speech into chat input.

Frontend

<CopilotChat /> renders the mic button automatically when the runtime advertises audioFileTranscriptionEnabled: true on its /info endpoint. There's nothing to wire up on the chat surface itself:

page.tsx

import { useCallback } from "react";
import { CopilotKit, CopilotChat } from "@copilotkit/react-core/v2";
import { SampleAudioButton } from "./sample-audio-button";

const RUNTIME_URL = "/api/copilotkit-voice";
const AGENT_ID = "voice-demo";
const SAMPLE_TEXT = "What is the weather in Tokyo?";

/**
 * Write `text` into the chat composer's textarea and notify React.
 *
 * CopilotChat owns its input state internally (no external controlled-input
 * API on v2 today), so the only handle is the DOM node tagged
 * `data-testid="copilot-chat-textarea"`. The value is assigned through the
 * native HTMLTextAreaElement setter and followed by a bubbling `input`
 * event, which is what lets React's managed state pick up the change
 * without reaching into CopilotChat's internals.
 */
function populateComposer(text: string): void {
  // No DOM available (e.g. during server rendering): nothing to populate.
  if (typeof document === "undefined") return;

  const composer = document.querySelector<HTMLTextAreaElement>(
    '[data-testid="copilot-chat-textarea"]',
  );
  if (composer === null) {
    console.warn(
      "[voice-demo] could not find copilot-chat-textarea to populate",
    );
    return;
  }

  // React keeps its own "last known value" for controlled inputs; going
  // through the prototype's setter is what makes the following input event
  // register as a change.
  const valueDescriptor = Object.getOwnPropertyDescriptor(
    window.HTMLTextAreaElement.prototype,
    "value",
  );
  const assignNatively = valueDescriptor?.set;
  if (assignNatively) {
    assignNatively.call(composer, text);
  } else {
    composer.value = text;
  }

  composer.dispatchEvent(new Event("input", { bubbles: true }));
  composer.focus();
}

/**
 * Voice demo page.
 *
 * Offers two ways to get text into the composer:
 *
 * 1. The default mic button that <CopilotChat /> renders when the runtime at
 *    RUNTIME_URL advertises `audioFileTranscriptionEnabled: true`. Click it,
 *    speak, click again — the speech is transcribed into the composer. This
 *    is the only path that exercises real transcription.
 *
 * 2. The <SampleAudioButton /> above the chat, which synchronously injects a
 *    canned phrase into the textarea. It bypasses mic permissions and the
 *    runtime's transcription endpoint, so Playwright and screenshot flows
 *    get a deterministic affordance.
 */
export default function VoiceDemoPage() {
  const handleTranscribed = useCallback(
    (text: string) => populateComposer(text),
    [],
  );

  return (
    <CopilotKit
      runtimeUrl={RUNTIME_URL}
      agent={AGENT_ID}
      useSingleEndpoint={false}
    >
      <div className="flex h-screen flex-col gap-3 p-6">
        <header>
          <h1 className="text-lg font-semibold">Voice input</h1>
          <p className="text-sm text-black/60 dark:text-white/60">
            Click the microphone to record, or play the bundled sample audio.
            Speech is transcribed into the input field — you click send.
          </p>
        </header>
        <SampleAudioButton
          onTranscribed={handleTranscribed}
          sampleText={SAMPLE_TEXT}
        />
        <div className="min-h-0 flex-1 overflow-hidden rounded-md border border-black/10 dark:border-white/10">
          <CopilotChat agentId={AGENT_ID} className="h-full" />
        </div>
      </div>
    </CopilotKit>
  );
}

When the user clicks the mic, the chat captures audio, POSTs it to the runtime's /transcribe endpoint, and drops the resulting transcript into the composer, where the user can review it and click send.

Driving the demo without a mic

For Playwright runs, screenshots, or any flow where prompting for mic permissions is awkward, ship a button that hands a canned transcript straight to the page, bypassing both the microphone and the runtime's /transcribe endpoint:

sample-audio-button.tsx

/**
 * Deterministic stand-in for the mic: clicking the button reports
 * `sampleText` to `onTranscribed` directly, with no recording involved.
 * The sample phrase is also shown next to the button.
 */
export function SampleAudioButton({
  onTranscribed,
  sampleText,
}: SampleAudioButtonProps) {
  // Hand the canned phrase to the caller exactly as a transcript would be.
  const emitSample = () => onTranscribed(sampleText);

  const wrapperClasses =
    "flex items-center gap-3 rounded-md border border-black/10 bg-black/[0.02] px-3 py-2 text-sm dark:border-white/10 dark:bg-white/[0.02]";
  const buttonClasses =
    "rounded border border-black/10 bg-white px-3 py-1 text-xs font-medium hover:bg-black/5 dark:border-white/10 dark:bg-black/30 dark:hover:bg-white/10";

  return (
    <div data-testid="voice-sample-audio" className={wrapperClasses}>
      <button
        type="button"
        data-testid="voice-sample-audio-button"
        onClick={emitSample}
        className={buttonClasses}
      >
        Play sample
      </button>
      <span className="text-black/60 dark:text-white/60">
        Sample: &ldquo;{sampleText}&rdquo;
      </span>
    </div>
  );
}

The caller can drop the resulting text into the composer's textarea (matched via data-testid="copilot-chat-textarea") using the native value setter and a synthetic input event so React's managed state updates correctly.

Backend

Wire up the V2 runtime with a TranscriptionService. The V1 wrapper drops the transcriptionService option, so use createCopilotRuntimeHandler from @copilotkit/runtime/v2 directly:

route.ts

import type { NextRequest } from "next/server";
import {
  CopilotRuntime,
  TranscriptionService,
  createCopilotRuntimeHandler,
} from "@copilotkit/runtime/v2";
import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";
import { HttpAgent } from "@ag-ui/client";
import { TranscriptionServiceOpenAI } from "@copilotkit/voice";
import OpenAI from "openai";

const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";

// Point at the tool-free /voice endpoint so aimock returns a direct text
// response instead of a tool call that the agent can't summarize.
const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/voice/run` });

/**
 * Transcription service that forwards to TranscriptionServiceOpenAI when
 * OPENAI_API_KEY is present and otherwise fails each request with an
 * explicit configuration error.
 */
class GuardedOpenAITranscriptionService extends TranscriptionService {
  // null when no API key was available at construction time.
  private delegate: TranscriptionServiceOpenAI | null;

  constructor() {
    super();
    const apiKey = process.env.OPENAI_API_KEY;
    if (apiKey) {
      const openai = new OpenAI({ apiKey });
      this.delegate = new TranscriptionServiceOpenAI({ openai });
    } else {
      this.delegate = null;
    }
  }

  /** Transcribe via the OpenAI delegate, or throw if it was never configured. */
  async transcribeFile(options: TranscribeFileOptions): Promise<string> {
    if (this.delegate) {
      return this.delegate.transcribeFile(options);
    }
    throw new Error(
      "OPENAI_API_KEY not configured for this deployment (api key missing). " +
        "Set OPENAI_API_KEY to enable voice transcription.",
    );
  }
}

type RuntimeHandler = (req: Request) => Promise<Response>;

// Built on first request and reused afterwards.
let cachedHandler: RuntimeHandler | null = null;

/** Return the shared runtime handler, constructing it on first use. */
function getHandler(): RuntimeHandler {
  if (cachedHandler !== null) {
    return cachedHandler;
  }

  const runtime = new CopilotRuntime({
    // @ts-ignore -- Published CopilotRuntime agents type wraps Record in
    // MaybePromise<NonEmptyRecord<...>> which rejects plain Records; fixed in
    // source, pending release.
    agents: {
      "voice-demo": voiceDemoAgent,
      default: voiceDemoAgent,
    },
    transcriptionService: new GuardedOpenAITranscriptionService(),
  });

  cachedHandler = createCopilotRuntimeHandler({
    runtime,
    basePath: "/api/copilotkit-voice",
  });
  return cachedHandler;
}

// Every supported HTTP verb is delegated to the same runtime handler.
const handle = (req: NextRequest) => getHandler()(req);

export const POST = handle;
export const GET = handle;
export const PUT = handle;
export const DELETE = handle;

With transcriptionService set, the runtime advertises audioFileTranscriptionEnabled: true on /info (which is what tells the chat to render the mic button) and routes POST /transcribe to the service.

Custom transcription backends

TranscriptionService from @copilotkit/runtime/v2 is an abstract class. Subclass it to plug in any transcription provider — Whisper, AssemblyAI, Deepgram, your own model. The library ships TranscriptionServiceOpenAI as the canonical reference implementation.

A useful pattern is wrapping your service in a guard that returns a clean 4xx when credentials aren't configured, instead of an opaque 5xx from the underlying SDK: