Voice
Real-time speech-to-text in the chat composer. The user speaks, the runtime transcribes, the agent runs the resulting prompt.
"""MS Agent Framework agent with sales todos state, weather tool, query data,and HITL schedule meeting tool.Adapted from examples/integrations/ms-agent-framework-python/agent/src/agent.py"""from __future__ import annotationsimport jsonfrom textwrap import dedentfrom typing import Annotatedfrom agent_framework import Agent, BaseChatClient, toolfrom agent_framework_ag_ui import AgentFrameworkAgentfrom pydantic import Field# =====================================================================# Shared tool implementations# =====================================================================from tools import ( get_weather_impl, query_data_impl, manage_sales_todos_impl, get_sales_todos_impl, schedule_meeting_impl, search_flights_impl, build_a2ui_operations_from_tool_call,)STATE_SCHEMA: dict[str, object] = { "salesTodos": { "type": "array", "items": { "type": "object", "properties": { "id": {"type": "string"}, "title": {"type": "string"}, "stage": {"type": "string"}, "value": {"type": "number"}, "dueDate": {"type": "string"}, "assignee": {"type": "string"}, "completed": {"type": "boolean"}, }, }, "description": "Ordered list of the user's sales pipeline todos.", }}PREDICT_STATE_CONFIG: dict[str, dict[str, str]] = { "salesTodos": { "tool": "manage_sales_todos", "tool_argument": "todos", }}@tool( name="manage_sales_todos", description=( "Replace the entire list of sales todos with the provided values. " "Always include every todo you want to keep." ),)def manage_sales_todos( todos: Annotated[ list[dict], Field( description=( "The complete source of truth for the user's sales todos. " "Maintain ordering and include the full list on each call." ) ), ],) -> str: """Persist the provided set of sales todos.""" result = manage_sales_todos_impl(todos) return f"Sales todos updated. 
Tracking {len(result)} item(s)."@tool( name="get_sales_todos", description="Get the current list of sales todos.",)def get_sales_todos() -> str: """Return the current sales todos or defaults.""" result = get_sales_todos_impl() return json.dumps(result)@tool( name="get_weather", description="Get the current weather for a location. Use this to render the frontend weather card.",)def get_weather( location: Annotated[ str, Field( description="The city or region to describe. Use fully spelled out names." ), ],) -> str: """Return weather data as JSON for UI rendering.""" result = get_weather_impl(location) return json.dumps(result)@tool( name="query_data", description="Query the database. Takes natural language. Always call before showing a chart or graph.",)def query_data( query: Annotated[ str, Field(description="Natural language query to run against the database.") ],) -> str: """Query the database and return results as JSON.""" result = query_data_impl(query) return json.dumps(result)@tool( name="schedule_meeting", description="Schedule a meeting. The user will be asked to pick a time via the meeting time picker UI.", approval_mode="always_require",)def schedule_meeting( reason: Annotated[str, Field(description="Reason for scheduling the meeting.")], duration_minutes: Annotated[ int, Field(description="Duration of the meeting in minutes.") ] = 30,) -> str: """Request human approval to schedule a meeting.""" result = schedule_meeting_impl(reason, duration_minutes) return json.dumps(result)@tool( name="search_flights", description=( "Search for flights and display the results as rich A2UI cards. Return exactly 2 flights. " "Each flight must have: airline, airlineLogo, flightNumber, origin, destination, " "date, departureTime, arrivalTime, duration, status, statusColor, price, currency." 
),)def search_flights( flights: Annotated[ list[dict], Field(description="List of flight objects to search and display."), ],) -> str: """Search for flights and display as rich cards.""" result = search_flights_impl(flights) return json.dumps(result)@tool( name="generate_a2ui", description=( "Generate dynamic A2UI components based on the conversation. " "A secondary LLM designs the UI schema and data." ),)def generate_a2ui( context: Annotated[ str, Field(description="Conversation context to generate UI from.") ],) -> str: """Generate dynamic A2UI dashboard from conversation context.""" from openai import OpenAI client = OpenAI() tool_schema = { "type": "function", "function": { "name": "_design_a2ui_surface", "description": "Render a dynamic A2UI v0.9 surface.", "parameters": { "type": "object", "properties": { "surfaceId": {"type": "string"}, "catalogId": {"type": "string"}, "components": {"type": "array", "items": {"type": "object"}}, "data": {"type": "object"}, }, "required": ["surfaceId", "catalogId", "components"], }, }, } response = client.chat.completions.create( model="gpt-4.1", messages=[ {"role": "system", "content": context or "Generate a useful dashboard UI."}, { "role": "user", "content": "Generate a dynamic A2UI dashboard based on the conversation.", }, ], tools=[tool_schema], tool_choice={"type": "function", "function": {"name": "_design_a2ui_surface"}}, ) if not response.choices[0].message.tool_calls: return json.dumps({"error": "LLM did not call _design_a2ui_surface"}) tool_call = response.choices[0].message.tool_calls[0] args = json.loads(tool_call.function.arguments) result = build_a2ui_operations_from_tool_call(args) return json.dumps(result)def create_agent(chat_client: BaseChatClient) -> AgentFrameworkAgent: """Instantiate the CopilotKit demo agent backed by Microsoft Agent Framework.""" base_agent = Agent( client=chat_client, name="sales_agent", instructions=dedent( """ You help users manage their sales pipeline, check weather, query data, 
and schedule meetings. State sync: - The current list of sales todos is provided in the conversation context. - When you add, remove, or reorder todos, call `manage_sales_todos` with the full list. Never send partial updates--always include every todo that should exist. - CRITICAL: When asked to "add" a todo, you must: 1. First, identify ALL existing todos from the conversation history 2. Create EXACTLY ONE new todo (never more than one unless explicitly requested) 3. Call manage_sales_todos with: [all existing todos] + [the one new todo] - When asked to "remove" a todo, remove exactly ONE item unless user specifies otherwise. Tool usage rules: - When user asks to schedule a meeting, you MUST call the `schedule_meeting` tool immediately. Do NOT ask for approval yourself--the tool's approval workflow and the client UI will handle it. Frontend integrations: - `get_weather` renders a weather card in the UI. Only call this tool when the user explicitly asks for weather. Do NOT call it after unrelated tasks or approvals. - `query_data` fetches database records. Always call before showing charts or graphs. - `schedule_meeting` requires explicit user approval before you proceed. Only use it when a user asks to schedule or set up a meeting. Always call the tool instead of asking manually. Conversation tips: - Reference the latest todo list before suggesting changes. - Keep responses concise and friendly unless the user requests otherwise. - After you finish executing tools for the user's request, provide a brief, final assistant message summarizing exactly what changed. Do NOT call additional tools or switch topics after that summary unless the user asks. ALWAYS send this conversational summary so the message persists. 
""".strip() ), tools=[ manage_sales_todos, get_sales_todos, get_weather, query_data, schedule_meeting, search_flights, generate_a2ui, ], ) return AgentFrameworkAgent( agent=base_agent, name="CopilotKitMicrosoftAgentFrameworkAgent", description="Manages sales pipeline todos, weather, data queries, and meeting scheduling.", predict_state_config=PREDICT_STATE_CONFIG, require_confirmation=False, )You have a working chat surface and you want users to be able to speak instead of type. By the end of this guide, the chat composer will sprout a mic button, recorded audio will be transcribed by the runtime, and the transcript will auto-send to the agent like any other message.
When to use this#
- Hands-free or accessibility flows where typing isn't the right input modality.
- Mobile or kiosk surfaces where a long voice query is faster than thumb-typing.
- Demo and test loops where you want canned audio to drive the chat without a microphone.
If you only need file uploads (audio, images, video, documents), use Multimodal Attachments instead. Voice is specifically about live transcription of recorded speech into chat input.
Frontend#
<CopilotChat /> renders the mic button automatically when the runtime advertises audioFileTranscriptionEnabled: true on its /info endpoint. There's nothing to wire up on the chat surface itself:
import { CopilotKit } from "@copilotkit/react-core/v2";
import { VoiceChat } from "./voice-chat";

// Runtime route and agent id used by this demo page.
const VOICE_RUNTIME_URL = "/api/copilotkit-voice";
const VOICE_AGENT_ID = "voice-demo";

/**
 * Voice demo page: renders <VoiceChat /> inside a CopilotKit provider that
 * points at the voice runtime route and the "voice-demo" agent.
 */
export default function VoiceDemoPage() {
  return (
    <CopilotKit
      runtimeUrl={VOICE_RUNTIME_URL}
      agent={VOICE_AGENT_ID}
      useSingleEndpoint={false}
      // Keep the inspector off everywhere. The dev-only
      // `<cpk-web-inspector>` overlay (auto-enabled on localhost via
      // shouldShowDevConsole) intercepts pointer events on top of the
      // voice sample-audio button, so dev/D5 probe runs can't click it
      // through Playwright. Production isn't localhost, so the inspector
      // never mounts there — voice is D5 in prod and D4 locally for this
      // reason alone. Disabling it explicitly makes the demo behave the
      // same in both environments.
      enableInspector={false}
    >
      <VoiceChat />
    </CopilotKit>
  );
}

When the user clicks the mic, the chat captures audio, POSTs it to the runtime's /transcribe endpoint, drops the resulting transcript into the composer, and submits.
Driving the demo without a mic#
For Playwright runs, screenshots, or any flow where prompting for mic permissions is awkward, ship a button that skips recording and the /transcribe round-trip entirely and hands a canned transcript straight to the caller:
/**
 * Button that passes `sampleText` to `onTranscribed` when clicked.
 *
 * No audio is recorded or uploaded here: the click handler calls the
 * callback synchronously with the text it was given.
 */
export function SampleAudioButton(props: SampleAudioButtonProps) {
  const { onTranscribed, sampleText } = props;

  // Same call the inline arrow made: forward the canned text, ignore the event.
  const handleClick = () => onTranscribed(sampleText);

  const buttonClasses =
    "inline-flex w-fit items-center gap-2 rounded-md border border-black/10 bg-white px-3 py-1.5 text-xs font-medium hover:bg-black/5 dark:border-white/10 dark:bg-black/30 dark:hover:bg-white/10";

  return (
    <button
      type="button"
      data-testid="voice-sample-audio-button"
      onClick={handleClick}
      title={`Inserts: "${sampleText}"`}
      className={buttonClasses}
    >
      <span aria-hidden>🎙</span>
      <span>Try a sample audio</span>
    </button>
  );
}

The caller can drop the resulting text into the composer's textarea (matched via data-testid="copilot-chat-textarea") using the native value setter and a synthetic input event so React's managed state updates correctly.
Backend#
Wire up the V2 runtime with a TranscriptionService. The V1 wrapper drops the transcriptionService option, so use createCopilotRuntimeHandler from @copilotkit/runtime/v2 directly:
import type { NextRequest } from "next/server";
import {
  CopilotRuntime,
  TranscriptionService,
  createCopilotRuntimeHandler,
} from "@copilotkit/runtime/v2";
import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";
import { HttpAgent } from "@ag-ui/client";
import { TranscriptionServiceOpenAI } from "@copilotkit/voice";
import OpenAI from "openai";

const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";

// Point at the tool-free /voice endpoint so aimock returns a direct text
// response instead of a tool call that the agent can't summarize.
//
// No trailing slash on the URL. FastAPI mounts this agent at `/voice`
// exactly (via `add_agent_framework_fastapi_endpoint(path="/voice")` in
// agent_server.py); posting to `/voice/` triggers FastAPI's
// redirect-to-canonical 307, which kills the streaming SSE response and
// surfaces as `fetch failed` / `INCOMPLETE_STREAM` in the runtime.
const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/voice` });

/**
 * Transcription service wrapper that reports a clean, typed auth error when
 * OPENAI_API_KEY is not configured. When the key is present we delegate to
 * the real OpenAI-backed service; any upstream Whisper error keeps its
 * natural categorization.
 *
 * Note: We pin `baseURL` to real OpenAI (or `OPENAI_TRANSCRIPTION_BASE_URL`
 * when explicitly set) instead of falling through to `OPENAI_BASE_URL`. In
 * local docker / Railway preview environments `OPENAI_BASE_URL` points at
 * aimock so LLM completions stay deterministic, but aimock has a catchall
 * `endpoint: "transcription"` fixture that would otherwise intercept every
 * real mic recording and return the canned "What is the weather in Tokyo?"
 * phrase regardless of what the user actually said — and on production
 * aimock's transcription proxy returns a 502 "Invalid file format" before
 * any phrase reaches the user. The sample-audio button is the deterministic
 * affordance (synchronous text injection); the mic is the only path that
 * should exercise real Whisper.
 *
 * Mirrors langgraph-python's voice route exactly.
 */
class GuardedOpenAITranscriptionService extends TranscriptionService {
  // Real OpenAI-backed service, or null when OPENAI_API_KEY is unset/empty.
  private delegate: TranscriptionServiceOpenAI | null;

  /** Builds the OpenAI delegate only when an API key is present. */
  constructor() {
    super();

    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      this.delegate = null;
      return;
    }

    const baseURL =
      process.env.OPENAI_TRANSCRIPTION_BASE_URL ?? "https://api.openai.com/v1";
    const openai = new OpenAI({ apiKey, baseURL });
    this.delegate = new TranscriptionServiceOpenAI({ openai });
  }

  /**
   * Forwards `options` to the OpenAI delegate.
   *
   * @throws Error when no delegate was built because OPENAI_API_KEY is missing.
   */
  async transcribeFile(options: TranscribeFileOptions): Promise<string> {
    const service = this.delegate;
    if (service) {
      return service.transcribeFile(options);
    }

    throw new Error(
      "OPENAI_API_KEY not configured for this deployment (api key missing). " +
        "Set OPENAI_API_KEY to enable voice transcription.",
    );
  }
}

type RouteHandler = (req: Request) => Promise<Response>;

// Built on first request and reused afterwards.
let cachedHandler: RouteHandler | null = null;

/** Creates the runtime (agents + transcription service) and its handler. */
function buildHandler(): RouteHandler {
  const runtime = new CopilotRuntime({
    // @ts-ignore -- Published CopilotRuntime agents type wraps Record in
    // MaybePromise<NonEmptyRecord<...>> which rejects plain Records; fixed in
    // source, pending release.
    agents: {
      "voice-demo": voiceDemoAgent,
      default: voiceDemoAgent,
    },
    transcriptionService: new GuardedOpenAITranscriptionService(),
  });

  return createCopilotRuntimeHandler({
    runtime,
    basePath: "/api/copilotkit-voice",
  });
}

/** Returns the cached handler, building it on the first call. */
function getHandler(): (req: Request) => Promise<Response> {
  if (cachedHandler === null) {
    cachedHandler = buildHandler();
  }
  return cachedHandler;
}

// Every exported HTTP method is served by the same runtime handler.
const handleRequest = (req: NextRequest) => getHandler()(req);

export const POST = handleRequest;
export const GET = handleRequest;
export const PUT = handleRequest;
export const DELETE = handleRequest;

With transcriptionService set, the runtime advertises audioFileTranscriptionEnabled: true on /info (which is what tells the chat to render the mic button) and routes POST /transcribe to the service.
Custom transcription backends#
TranscriptionService from @copilotkit/runtime/v2 is an abstract class. Subclass it to plug in any transcription provider — Whisper, AssemblyAI, Deepgram, your own model. The library ships TranscriptionServiceOpenAI as the canonical reference implementation.
A useful pattern is wrapping your service in a guard that returns a clean 4xx when credentials aren't configured, instead of an opaque 5xx from the underlying SDK:
import type { NextRequest } from "next/server";
import {
  CopilotRuntime,
  TranscriptionService,
  createCopilotRuntimeHandler,
} from "@copilotkit/runtime/v2";
import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";
import { HttpAgent } from "@ag-ui/client";
import { TranscriptionServiceOpenAI } from "@copilotkit/voice";
import OpenAI from "openai";

const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";

// Point at the tool-free /voice endpoint so aimock returns a direct text
// response instead of a tool call that the agent can't summarize.
//
// No trailing slash on the URL. FastAPI mounts this agent at `/voice`
// exactly (via `add_agent_framework_fastapi_endpoint(path="/voice")` in
// agent_server.py); posting to `/voice/` triggers FastAPI's
// redirect-to-canonical 307, which kills the streaming SSE response and
// surfaces as `fetch failed` / `INCOMPLETE_STREAM` in the runtime.
const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/voice` });

/**
 * Transcription service wrapper that reports a clean, typed auth error when
 * OPENAI_API_KEY is not configured. When the key is present we delegate to
 * the real OpenAI-backed service; any upstream Whisper error keeps its
 * natural categorization.
 *
 * Note: We pin `baseURL` to real OpenAI (or `OPENAI_TRANSCRIPTION_BASE_URL`
 * when explicitly set) instead of falling through to `OPENAI_BASE_URL`. In
 * local docker / Railway preview environments `OPENAI_BASE_URL` points at
 * aimock so LLM completions stay deterministic, but aimock has a catchall
 * `endpoint: "transcription"` fixture that would otherwise intercept every
 * real mic recording and return the canned "What is the weather in Tokyo?"
 * phrase regardless of what the user actually said — and on production
 * aimock's transcription proxy returns a 502 "Invalid file format" before
 * any phrase reaches the user. The sample-audio button is the deterministic
 * affordance (synchronous text injection); the mic is the only path that
 * should exercise real Whisper.
 *
 * Mirrors langgraph-python's voice route exactly.
 */
class GuardedOpenAITranscriptionService extends TranscriptionService {
  // Real OpenAI-backed service, or null when OPENAI_API_KEY is unset/empty.
  private delegate: TranscriptionServiceOpenAI | null;

  /** Builds the OpenAI delegate only when an API key is present. */
  constructor() {
    super();

    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      this.delegate = null;
      return;
    }

    const baseURL =
      process.env.OPENAI_TRANSCRIPTION_BASE_URL ?? "https://api.openai.com/v1";
    const openai = new OpenAI({ apiKey, baseURL });
    this.delegate = new TranscriptionServiceOpenAI({ openai });
  }

  /**
   * Forwards `options` to the OpenAI delegate.
   *
   * @throws Error when no delegate was built because OPENAI_API_KEY is missing.
   */
  async transcribeFile(options: TranscribeFileOptions): Promise<string> {
    const service = this.delegate;
    if (service) {
      return service.transcribeFile(options);
    }

    throw new Error(
      "OPENAI_API_KEY not configured for this deployment (api key missing). " +
        "Set OPENAI_API_KEY to enable voice transcription.",
    );
  }
}