import os
from livekit import agents, rtc
from livekit.agents import AgentServer, AgentSession, Agent
from livekit.plugins import silero, deepgram, openai, cartesia
from livekit.plugins.turn_detector.multilingual import MultilingualModel
class PrivateVoiceAgent(Agent):
    """Agent persona: a terse enterprise assistant.

    The instructions cap replies at two sentences so the downstream TTS
    stage has less text to synthesize, keeping perceived latency low.
    """

    def __init__(self) -> None:
        # Fix: class/method bodies were at column 0 (SyntaxError); restore indentation.
        super().__init__(
            instructions="You are a professional enterprise assistant. Keep responses under 2 sentences to minimize TTS latency."
        )
server = AgentServer()
@server.rtc_session(agent_name="enterprise-agent")
async def voice_pipeline(ctx: agents.JobContext):
    """Per-job entry point: build and start the streaming STT -> LLM -> TTS pipeline.

    Parameters
    ----------
    ctx : agents.JobContext
        Job context supplied by the server; provides the RTC room to join.
    """
    # Fix: the function body was at column 0 (SyntaxError); restore indentation.
    session = AgentSession(
        stt=deepgram.STT(model="nova-3"),                     # ultra-low-latency speech-to-text
        llm=openai.LLM(model="gpt-4o-mini"),                  # fast time-to-first-token reasoning
        tts=cartesia.TTS(model="sonic-3", voice="default"),   # streaming text-to-speech
        vad=silero.VAD.load(),                                # local voice activity detection
        turn_detection=MultilingualModel(),
    )

    # CRITICAL COMPLIANCE NOTE: the "air-gapped" anomaly.
    # While the pipeline above is fast, using deepgram.STT or openai.LLM means
    # voice data still leaves the server. It is NOT "air-gapped" or "zero-trust".
    #
    # For strict HIPAA compliance, swap these cloud APIs for local models
    # deployed directly on your bare-metal GPUs, e.g.:
    #
    #   stt=whisper.STT()                          # local open-source STT
    #   llm=vllm.LLM(model="meta-llama/Llama-3")   # local bare-metal GPU LLM
    #   tts=piper.TTS()                            # local synthesis
    await session.start(room=ctx.room, agent=PrivateVoiceAgent())
    await session.generate_reply(instructions="Greet the caller.")
if __name__ == "__main__":
    # Fix: the guarded statement was at column 0 (SyntaxError); restore indentation.
    # Hand control to the LiveKit CLI runner, which parses args and starts the worker.
    agents.cli.run_app(server)