Voice
Real-time speech-to-text in the chat composer. The user speaks, the runtime transcribes, the agent runs the resulting prompt.
"""LangGraph agent for the CopilotKit Showcase (FastAPI variant).Uses copilotkit's create_agent (wrapping langgraph) with CopilotKitMiddlewareso frontend-registered tools (useHumanInTheLoop, useFrontendTool) are properlyinjected into the LLM's tool list and executed on the frontend rather thanlocally."""from tools import ( get_weather_impl, query_data_impl, schedule_meeting_impl, manage_sales_todos_impl, get_sales_todos_impl, search_flights_impl, build_a2ui_operations_from_tool_call,)from tools.types import SalesTodo, Flightimport jsonimport timefrom typing import Anyfrom langchain_openai import ChatOpenAIfrom langchain_core.tools import tool as lc_toolfrom langchain_core.messages import SystemMessagefrom langchain.agents import AgentState as BaseAgentState, create_agentfrom langchain.tools import ToolRuntime, toolfrom langchain.messages import ToolMessagefrom langgraph.types import Commandfrom copilotkit import CopilotKitMiddlewareclass AgentState(BaseAgentState): todos: list[SalesTodo]@lc_tooldef get_weather(location: str): """Get the current weather for a location.""" return get_weather_impl(location)@lc_tooldef query_data(query: str): """Query the database. Takes natural language. Always call before showing a chart.""" return query_data_impl(query)@lc_tooldef schedule_meeting(reason: str, duration_minutes: int = 30): """Schedule a meeting. The user will be asked to pick a time via the UI.""" return schedule_meeting_impl(reason, duration_minutes)@lc_tooldef search_flights(flights: list[Flight]) -> str: """Search for flights and display the results as rich cards. Return exactly 2 flights. Each flight must have: airline, airlineLogo, flightNumber, origin, destination, date (short readable format like "Tue, Mar 18" -- use near-future dates), departureTime, arrivalTime, duration (e.g. "4h 25m"), status (e.g. "On Time" or "Delayed"), statusColor (hex color for status dot), price (e.g. "$289"), and currency (e.g. "USD"). 
For airlineLogo use Google favicon API: https://www.google.com/s2/favicons?domain={airline_domain}&sz=128 """ result = search_flights_impl(flights) return json.dumps(result)@tooldef manage_sales_todos(todos: list[SalesTodo], runtime: ToolRuntime) -> Command: """ Manage the current sales todos. Pass the full updated list. """ updated = manage_sales_todos_impl(todos) return Command( update={ "todos": updated, "messages": [ ToolMessage( content="Successfully updated sales todos", tool_call_id=runtime.tool_call_id, ) ], } )@tooldef get_sales_todos(runtime: ToolRuntime): """ Get the current sales todos. """ current = runtime.state.get("todos", []) return get_sales_todos_impl(current if current else None)@lc_tooldef render_a2ui( surfaceId: str, catalogId: str, components: list[dict], data: dict | None = None,) -> str: """Render a dynamic A2UI v0.9 surface.""" return "rendered"@tool()def generate_a2ui(runtime: ToolRuntime[Any]) -> str: """Generate dynamic A2UI components based on the conversation. A secondary LLM designs the UI schema and data. 
""" t0 = time.time() messages = runtime.state["messages"][:-1] context_entries = runtime.state.get("copilotkit", {}).get("context", []) context_text = "\n\n".join( entry.get("value", "") for entry in context_entries if isinstance(entry, dict) and entry.get("value") ) model = ChatOpenAI(model="gpt-4.1") model_with_tool = model.bind_tools([render_a2ui], tool_choice="render_a2ui") response = model_with_tool.invoke( [SystemMessage(content=context_text), *messages], ) if not response.tool_calls: return json.dumps({"error": "LLM did not call render_a2ui"}) args = response.tool_calls[0]["args"] result = build_a2ui_operations_from_tool_call(args) return json.dumps(result)model = ChatOpenAI(model="gpt-4o-mini")SYSTEM_PROMPT = """You are a polished, professional demo assistant for CopilotKit.Keep responses brief and clear -- 1 to 2 sentences max.You can:- Chat naturally with the user- Change the UI background when asked (via frontend tool)- Query data and render charts (via query_data tool)- Get weather information (via get_weather tool)- Schedule meetings with the user (via schedule_meeting tool -- the user picks a time in the UI)- Manage sales pipeline todos (via manage_sales_todos / get_sales_todos tools)- Search flights and display rich A2UI cards (via search_flights tool)- Generate dynamic A2UI dashboards from conversation context (via generate_a2ui tool)- Generate step-by-step plans for user review (human-in-the-loop)"""graph = create_agent( model=model, tools=[ get_weather, query_data, schedule_meeting, search_flights, generate_a2ui, manage_sales_todos, get_sales_todos, ], middleware=[CopilotKitMiddleware()], state_schema=AgentState, system_prompt=SYSTEM_PROMPT,)You have a working chat surface and you want users to be able to speak instead of type. By the end of this guide, the chat composer will sprout a mic button, recorded audio will be transcribed by the runtime, and the transcript will auto-send to the agent like any other message.
When to use this
- Hands-free or accessibility flows where typing isn't the right input modality.
- Mobile or kiosk surfaces where a long voice query is faster than thumb-typing.
- Demo and test loops where you want canned audio to drive the chat without a microphone.
If you only need file uploads (audio, images, video, documents), use Multimodal Attachments instead. Voice is specifically about live transcription of recorded speech into chat input.
Frontend
<CopilotChat /> renders the mic button automatically when the runtime advertises audioFileTranscriptionEnabled: true on its /info endpoint. There's nothing to wire up on the chat surface itself:
import { useCallback } from "react";import { CopilotKit, CopilotChat } from "@copilotkit/react-core/v2";import { SampleAudioButton } from "./sample-audio-button";const RUNTIME_URL = "/api/copilotkit-voice";const AGENT_ID = "voice-demo";const SAMPLE_TEXT = "What is the weather in Tokyo?";// Voice demo.//// Two affordances live on this page://// 1. The default mic button rendered by <CopilotChat /> when the runtime at// RUNTIME_URL advertises `audioFileTranscriptionEnabled: true`. Click it,// speak, click again — text is transcribed into the composer.//// 2. The <SampleAudioButton /> below the chat, which synchronously injects a// canned phrase into the chat's textarea (bypassing mic permissions and// the runtime's transcription endpoint so Playwright and screenshot flows// work too). The mic path is the only affordance that exercises real// transcription; the sample button is a deterministic test/demo// affordance.//// Injecting text into the composer goes through the DOM: CopilotChat owns its// input state internally (no external controlled-input API on v2 today), but// the textarea is tagged `data-testid="copilot-chat-textarea"`. Setting its// value via the native HTMLTextareaElement value setter and dispatching a// synthetic `input` event is the React-compatible way to flip the managed// state without reaching into CopilotChat's internals.export default function VoiceDemoPage() { const handleTranscribed = useCallback((text: string) => { if (typeof document === "undefined") return; const textarea = document.querySelector<HTMLTextAreaElement>( '[data-testid="copilot-chat-textarea"]', ); if (!textarea) { console.warn( "[voice-demo] could not find copilot-chat-textarea to populate", ); return; } // React tracks its own "last known value" on controlled inputs. Calling // the native setter is what makes React observe the change on the next // input event. 
const nativeSetter = Object.getOwnPropertyDescriptor( window.HTMLTextAreaElement.prototype, "value", )?.set; if (nativeSetter) { nativeSetter.call(textarea, text); } else { textarea.value = text; } textarea.dispatchEvent(new Event("input", { bubbles: true })); textarea.focus(); }, []); return ( <CopilotKit runtimeUrl={RUNTIME_URL} agent={AGENT_ID} useSingleEndpoint={false} > <div className="flex h-screen flex-col gap-3 p-6"> <header> <h1 className="text-lg font-semibold">Voice input</h1> <p className="text-sm text-black/60 dark:text-white/60"> Click the microphone to record, or play the bundled sample audio. Speech is transcribed into the input field — you click send. </p> </header> <SampleAudioButton onTranscribed={handleTranscribed} sampleText={SAMPLE_TEXT} /> <div className="min-h-0 flex-1 overflow-hidden rounded-md border border-black/10 dark:border-white/10"> <CopilotChat agentId={AGENT_ID} className="h-full" /> </div> </div> </CopilotKit> );}When the user clicks the mic, the chat captures audio, POSTs it to the runtime's /transcribe endpoint, drops the resulting transcript into the composer, and submits.
Driving the demo without a mic
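For Playwright runs, screenshots, or any flow where prompting for mic permissions is awkward, ship a button that injects a canned phrase straight into the composer, bypassing both the microphone and the runtime's /transcribe endpoint. Only the mic path exercises real transcription; the sample button is a deterministic test/demo affordance: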
export function SampleAudioButton({ onTranscribed, sampleText,}: SampleAudioButtonProps) { return ( <div data-testid="voice-sample-audio" className="flex items-center gap-3 rounded-md border border-black/10 bg-black/[0.02] px-3 py-2 text-sm dark:border-white/10 dark:bg-white/[0.02]" > <button type="button" data-testid="voice-sample-audio-button" onClick={() => onTranscribed(sampleText)} className="rounded border border-black/10 bg-white px-3 py-1 text-xs font-medium hover:bg-black/5 dark:border-white/10 dark:bg-black/30 dark:hover:bg-white/10" > Play sample </button> <span className="text-black/60 dark:text-white/60"> Sample: “{sampleText}” </span> </div> );}The caller can drop the resulting text into the composer's textarea (matched via data-testid="copilot-chat-textarea") using the native value setter and a synthetic input event so React's managed state updates correctly.
Backend
Wire up the V2 runtime with a TranscriptionService. The V1 wrapper drops the transcriptionService option, so use createCopilotRuntimeHandler from @copilotkit/runtime/v2 directly:
import type { NextRequest } from "next/server";import { CopilotRuntime, TranscriptionService, createCopilotRuntimeHandler,} from "@copilotkit/runtime/v2";import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";import { LangGraphAgent } from "@copilotkit/runtime/langgraph";import { TranscriptionServiceOpenAI } from "@copilotkit/voice";import OpenAI from "openai";const LANGGRAPH_URL = process.env.AGENT_URL || process.env.LANGGRAPH_DEPLOYMENT_URL || "http://localhost:8123";const voiceDemoAgent = new LangGraphAgent({ deploymentUrl: `${LANGGRAPH_URL}/`, graphId: "sample_agent",});/** * Transcription service wrapper that reports a clean, typed auth error when * OPENAI_API_KEY is not configured. When the key is present we delegate to * the real OpenAI-backed service; any upstream Whisper error keeps its * natural categorization. */class GuardedOpenAITranscriptionService extends TranscriptionService { private delegate: TranscriptionServiceOpenAI | null; constructor() { super(); const apiKey = process.env.OPENAI_API_KEY; this.delegate = apiKey ? new TranscriptionServiceOpenAI({ openai: new OpenAI({ apiKey }) }) : null; } async transcribeFile(options: TranscribeFileOptions): Promise<string> { if (!this.delegate) { // "api key" substring → handleTranscribe maps to AUTH_FAILED → 401. throw new Error( "OPENAI_API_KEY not configured for this deployment (api key missing). " + "Set OPENAI_API_KEY to enable voice transcription.", ); } return this.delegate.transcribeFile(options); }}// Cache the runtime + handler across invocations so the transcription service// is constructed once per Node process instead of per request. 
The guarded// service reads OPENAI_API_KEY lazily in its transcribeFile call path, so// deferring construction past module load is not required for cold-start// safety under missing-key conditions.let cachedHandler: ((req: Request) => Promise<Response>) | null = null;function getHandler(): (req: Request) => Promise<Response> { if (cachedHandler) return cachedHandler; const runtime = new CopilotRuntime({ // @ts-ignore -- Published CopilotRuntime agents type wraps Record in // MaybePromise<NonEmptyRecord<...>> which rejects plain Records; fixed in // source, pending release. agents: { // The page mounts <CopilotKit agent="voice-demo">; resolve that to // the neutral sample_agent graph. "voice-demo": voiceDemoAgent, // useAgent() with no args defaults to "default"; alias so any internal // default-agent lookups resolve against the same graph. default: voiceDemoAgent, }, transcriptionService: new GuardedOpenAITranscriptionService(), }); cachedHandler = createCopilotRuntimeHandler({ runtime, basePath: "/api/copilotkit-voice", }); return cachedHandler;}// Next.js App Router bindings. This file lives at// `src/app/api/copilotkit-voice/[[...slug]]/route.ts` — the catchall slug// pattern forwards every sub-path (`/info`, `/agent/:id/run`,// `/transcribe`, ...) to the V2 handler so its URL router can dispatch.export const POST = (req: NextRequest) => getHandler()(req);export const GET = (req: NextRequest) => getHandler()(req);export const PUT = (req: NextRequest) => getHandler()(req);export const DELETE = (req: NextRequest) => getHandler()(req);With transcriptionService set, the runtime advertises audioFileTranscriptionEnabled: true on /info (which is what tells the chat to render the mic button) and routes POST /transcribe to the service.
Custom transcription backends
TranscriptionService from @copilotkit/runtime/v2 is an abstract class. Subclass it to plug in any transcription provider — Whisper, AssemblyAI, Deepgram, your own model. The library ships TranscriptionServiceOpenAI as the canonical reference implementation.
A useful pattern is wrapping your service in a guard that fails with a clean 401 (AUTH_FAILED) when credentials aren't configured, instead of an opaque 5xx from the underlying SDK:
import type { NextRequest } from "next/server";import { CopilotRuntime, TranscriptionService, createCopilotRuntimeHandler,} from "@copilotkit/runtime/v2";import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";import { LangGraphAgent } from "@copilotkit/runtime/langgraph";import { TranscriptionServiceOpenAI } from "@copilotkit/voice";import OpenAI from "openai";const LANGGRAPH_URL = process.env.AGENT_URL || process.env.LANGGRAPH_DEPLOYMENT_URL || "http://localhost:8123";const voiceDemoAgent = new LangGraphAgent({ deploymentUrl: `${LANGGRAPH_URL}/`, graphId: "sample_agent",});/** * Transcription service wrapper that reports a clean, typed auth error when * OPENAI_API_KEY is not configured. When the key is present we delegate to * the real OpenAI-backed service; any upstream Whisper error keeps its * natural categorization. */class GuardedOpenAITranscriptionService extends TranscriptionService { private delegate: TranscriptionServiceOpenAI | null; constructor() { super(); const apiKey = process.env.OPENAI_API_KEY; this.delegate = apiKey ? new TranscriptionServiceOpenAI({ openai: new OpenAI({ apiKey }) }) : null; } async transcribeFile(options: TranscribeFileOptions): Promise<string> { if (!this.delegate) { // "api key" substring → handleTranscribe maps to AUTH_FAILED → 401. throw new Error( "OPENAI_API_KEY not configured for this deployment (api key missing). " + "Set OPENAI_API_KEY to enable voice transcription.", ); } return this.delegate.transcribeFile(options); }}