Audio to Video WebRTC

Endpoint

WebSocket URL: wss://api.agenthuman.com/webrtc
Local Development: ws://localhost:8765 (default port of a self-hosted server; see Installation & Setup)

Overview

The Audio-to-Video WebRTC endpoint enables real-time generation of AI-powered talking-head avatars. Send audio over the WebSocket and receive a synchronized video stream over WebRTC for the lowest possible latency.

Connection

// Signaling channel: all session control and WebRTC negotiation messages
// travel over this WebSocket as JSON text frames.
const ws = new WebSocket('wss://api.agenthuman.com/webrtc');
// RTCPeerConnection for the avatar stream; created by establishWebRTC().
let pc = null;
// Session identifier assigned by the server in `connection.established`.
let sessionId = null;

// Handle connection establishment
ws.onopen = () => {
    console.log('WebSocket connected');
};

// Surface transport failures instead of failing silently.
ws.onerror = (event) => {
    console.error('WebSocket error:', event);
};

ws.onclose = (event) => {
    console.log('WebSocket closed:', event.code, event.reason);
};

/**
 * Dispatch one signaling message received from the server.
 *
 * Handles `connection.established`, `webrtc.answer`, `webrtc.ice_candidate`
 * and the server `error` event. Any other message type is ignored.
 * The handler is async, so every failure is caught here; otherwise it would
 * surface as an unhandled promise rejection.
 */
ws.onmessage = async (event) => {
    try {
        const message = JSON.parse(event.data);

        switch (message.type) {
            case 'connection.established':
                sessionId = message.session_id;
                await establishWebRTC();
                break;

            case 'webrtc.answer':
                if (!pc) {
                    console.warn('Received webrtc.answer before the peer connection was created');
                    break;
                }
                await pc.setRemoteDescription({
                    type: 'answer',
                    sdp: message.sdp
                });
                break;

            case 'webrtc.ice_candidate':
                if (!pc) {
                    console.warn('Received webrtc.ice_candidate before the peer connection was created');
                    break;
                }
                await pc.addIceCandidate(message.candidate);
                break;

            case 'error':
                // Server-reported failure (see "Error Event").
                console.error('Server error:', message.error, message.details);
                break;

            default:
                break;
        }
    } catch (err) {
        console.error('Failed to handle signaling message:', err);
    }
};

// Establish WebRTC connection
/**
 * Create the peer connection, request the avatar media and send the SDP
 * offer to the server over the signaling WebSocket.
 *
 * Reads the surrounding `ws` and `sessionId` variables and assigns the new
 * connection to `pc`.
 *
 * @returns {Promise<void>} Resolves once the offer has been sent.
 */
async function establishWebRTC() {
    pc = new RTCPeerConnection({
        iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
    });

    // Handle incoming video/audio
    pc.ontrack = (event) => {
        const videoElement = document.getElementById('avatar-video');
        videoElement.srcObject = event.streams[0];
    };

    // Handle ICE candidates. This must be registered before
    // setLocalDescription(): gathering starts there, and candidates emitted
    // before a handler is attached are lost.
    pc.onicecandidate = (event) => {
        if (event.candidate) {
            ws.send(JSON.stringify({
                type: 'webrtc.ice_candidate',
                session_id: sessionId,
                candidate: {
                    candidate: event.candidate.candidate,
                    sdpMid: event.candidate.sdpMid,
                    sdpMLineIndex: event.candidate.sdpMLineIndex
                }
            }));
        }
    };

    // The client only receives media. Without transceivers the offer would
    // contain no media sections and the server could not attach its tracks.
    pc.addTransceiver('video', { direction: 'recvonly' });
    pc.addTransceiver('audio', { direction: 'recvonly' });

    // Create and send offer
    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    ws.send(JSON.stringify({
        type: 'webrtc.offer',
        session_id: sessionId,
        sdp: offer.sdp
    }));
}

Message Types

Client → Server Messages

1. WebRTC Offer

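Establish the WebRTC connection for video streaming. After the offer is sent, both sides exchange ICE candidates using `webrtc.ice_candidate` messages, as shown in the Connection example.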

{
  "type": "webrtc.offer",
  "session_id": "session-uuid",
  "sdp": "v=0\r\no=- ..."
}

2. Send Complete Audio

Send a complete audio file for video generation.

{
  "type": "agent.speak",
  "session_id": "session-uuid",
  "audio": "base64_encoded_audio_data",
  "format": "wav",
  "sample_rate": 16000,
  "chunk_index": 0,
  "is_final": true
}

3. Stream Audio Chunks

Stream audio in chunks for real-time processing.

{
  "type": "agent.audio.chunk",
  "session_id": "session-uuid",
  "chunk": "base64_encoded_chunk",
  "chunk_index": 1,
  "is_first": false,
  "is_final": false
}

4. End Audio Stream

Signal the end of audio streaming.

{
  "type": "agent.speak_end",
  "session_id": "session-uuid"
}

5. Interrupt Playback

Stop current video generation and clear the playlist.

{
  "type": "agent.interrupt",
  "session_id": "session-uuid"
}

Server → Client Events

Connection Established

{
  "type": "connection.established",
  "session_id": "unique-session-id"
}

WebRTC Answer

{
  "type": "webrtc.answer",
  "session_id": "session-uuid",
  "sdp": "v=0\r\no=- ..."
}

Generation Events

{
  "type": "generation.started",
  "session_id": "session-uuid",
  "chunk_index": 1
}

{
  "type": "generation.completed",
  "session_id": "session-uuid",
  "chunk_index": 1,
  "video_path": "output.mp4"
}

Error Event

{
  "type": "error",
  "error": "Detailed error message",
  "details": {}
}

Audio Requirements

Parameter	Value	Description
Format	WAV, PCM	Audio file format
Sample Rate	16000 Hz	Recommended sample rate
Channels	Mono	Single channel audio
Bit Depth	16-bit	Audio bit depth
Min Duration	1 second	Minimum for quality video
Max Chunk Size	1MB	For chunked streaming

Video Output

Parameter	Value	Description
Format	H.264	Video codec via WebRTC
Resolution	512x512	Default avatar resolution
Frame Rate	30 FPS	Smooth video playback
Latency	<500ms	End-to-end latency

Complete Example

<!DOCTYPE html>
<html>
<head>
    <title>AgentHuman WebRTC Avatar</title>
</head>
<body>
    <video id="avatar-video" autoplay playsinline></video>
    <button id="send-audio">Send Audio</button>
    
    <script>
        // Signaling channel for session control and WebRTC negotiation.
        const ws = new WebSocket('wss://api.agenthuman.com/webrtc');
        // Peer connection carrying the avatar stream; created in setupWebRTC().
        let pc = null;
        // Session identifier assigned by the server in `connection.established`.
        let sessionId = null;
        
        ws.onerror = (event) => {
            console.error('WebSocket error:', event);
        };
        
        // Dispatch signaling messages. The handler is async, so failures are
        // caught here rather than left as unhandled promise rejections.
        ws.onmessage = async (event) => {
            try {
                const message = JSON.parse(event.data);
                
                if (message.type === 'connection.established') {
                    sessionId = message.session_id;
                    console.log('Connected with session:', sessionId);
                    await setupWebRTC();
                } else if (message.type === 'webrtc.answer') {
                    await handleAnswer(message);
                } else if (message.type === 'webrtc.ice_candidate') {
                    await handleIceCandidate(message);
                } else if (message.type === 'error') {
                    // Server-reported failure (see "Error Event").
                    console.error('Server error:', message.error, message.details);
                }
            } catch (err) {
                console.error('Failed to handle signaling message:', err);
            }
        };
        
        // Create the peer connection, request the avatar media and send the offer.
        async function setupWebRTC() {
            pc = new RTCPeerConnection({
                iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
            });
            
            pc.ontrack = (event) => {
                document.getElementById('avatar-video').srcObject = event.streams[0];
            };
            
            // Registered before setLocalDescription() so no candidate is missed.
            pc.onicecandidate = (event) => {
                if (event.candidate) {
                    ws.send(JSON.stringify({
                        type: 'webrtc.ice_candidate',
                        session_id: sessionId,
                        candidate: {
                            candidate: event.candidate.candidate,
                            sdpMid: event.candidate.sdpMid,
                            sdpMLineIndex: event.candidate.sdpMLineIndex
                        }
                    }));
                }
            };
            
            // The client only receives media. Without transceivers the offer
            // would contain no media sections for the server to answer.
            pc.addTransceiver('video', { direction: 'recvonly' });
            pc.addTransceiver('audio', { direction: 'recvonly' });
            
            const offer = await pc.createOffer();
            await pc.setLocalDescription(offer);
            
            ws.send(JSON.stringify({
                type: 'webrtc.offer',
                session_id: sessionId,
                sdp: offer.sdp
            }));
        }
        
        // Apply the server's SDP answer to the peer connection.
        async function handleAnswer(message) {
            await pc.setRemoteDescription({
                type: 'answer',
                sdp: message.sdp
            });
        }
        
        // Add a remote ICE candidate received from the server.
        async function handleIceCandidate(message) {
            await pc.addIceCandidate(message.candidate);
        }
        
        // Load a sample WAV file and send it as one complete `agent.speak` message.
        document.getElementById('send-audio').onclick = async () => {
            // Sending before the session exists would throw or be rejected.
            if (ws.readyState !== WebSocket.OPEN || !sessionId) {
                console.warn('Not connected yet; audio was not sent');
                return;
            }
            
            try {
                const response = await fetch('sample-audio.wav');
                if (!response.ok) {
                    throw new Error('Failed to load sample-audio.wav: HTTP ' + response.status);
                }
                const blob = await response.blob();
                const reader = new FileReader();
                
                reader.onload = () => {
                    // Strip the "data:...;base64," prefix from the data URL.
                    const base64 = reader.result.split(',')[1];
                    ws.send(JSON.stringify({
                        type: 'agent.speak',
                        session_id: sessionId,
                        audio: base64,
                        format: 'wav',
                        sample_rate: 16000,
                        // Single complete file, as in "Send Complete Audio".
                        chunk_index: 0,
                        is_final: true
                    }));
                };
                
                reader.onerror = () => {
                    console.error('Failed to read audio file:', reader.error);
                };
                
                reader.readAsDataURL(blob);
            } catch (err) {
                console.error('Failed to send audio:', err);
            }
        };
    </script>
</body>
</html>

Installation & Setup

Server Installation

# Clone the WebRTC server
git clone https://github.com/agenthuman/webrtc-server
# git clone names the checkout directory after the repository: webrtc-server
cd webrtc-server

# Install dependencies
pip install -r requirements.txt

# Run the server
python run_server.py

Docker Deployment

# Container image for the WebRTC server: installs system and Python
# dependencies, copies the application, and starts run_server.py.
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
# NOTE(review): these look like the shared libraries typically needed by
# image/video processing packages -- confirm against requirements.txt.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
# Copied before the application source so the pip install layer is reused
# from the build cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application
COPY . .

# Expose WebSocket port
EXPOSE 8765

# Run server
CMD ["python", "run_server.py"]

Docker Compose

# Compose definition that builds and runs the WebRTC server as one service.
version: '3.8'

services:
  webrtc-server:
    # Build the image from the Dockerfile in the current directory.
    build: .
    ports:
      # host:container -- publishes the WebSocket port on the host.
      - "8765:8765"
    volumes:
      # Bind-mount host directories into /app so their contents stay on the host.
      - ./output_videos:/app/output_videos
      - ./temp_audio:/app/temp_audio
    environment:
      # NOTE(review): presumably read by run_server.py to select the model,
      # toggle parallel processing and set the output frame rate -- confirm
      # against the server's configuration handling.
      - MODEL_TYPE=LivePortrait
      - PARALLEL_PROCESSING=true
      - VIDEO_FPS=30
    # Restart automatically unless the container was explicitly stopped.
    restart: unless-stopped

Best Practices

Audio Quality

Use 16kHz sample rate for optimal results
Ensure clear audio without background noise
Minimum 1 second duration for smooth video
Send audio chunks larger than 80 KB (but no larger than the 1 MB maximum chunk size) for better quality

Performance

Enable parallel processing for reduced latency
Keep WebSocket connections alive for multiple interactions
Use chunked streaming for long audio content
Monitor chunk processing times

Error Handling

Validate WebRTC offer/answer SDP
Handle connection drops with reconnection logic
Check audio format before sending
Monitor generation status events

Error Codes

Code	Description	Solution
`WS_1001`	Invalid session ID	Reconnect and get new session
`WS_1002`	Invalid message format	Check JSON structure
`WS_1003`	Missing required field	Include all required fields
`RTC_2001`	Invalid SDP	Validate SDP format
`RTC_2002`	ICE connection failed	Check network connectivity
`AUDIO_3001`	Invalid audio format	Use WAV/PCM format
`AUDIO_3002`	Audio too short	Minimum 1 second duration
`GEN_4001`	Generation failed	Check server logs
`GEN_4002`	Model not loaded	Restart server

Getting Started

Avatars

Agents

Sessions

WebRTC

Endpoint

Overview

Connection

Message Types

Client → Server Messages

1. WebRTC Offer

2. Send Complete Audio

3. Stream Audio Chunks

4. End Audio Stream

5. Interrupt Playback

Server → Client Events

Connection Established

WebRTC Answer

Generation Events

Error Event

Audio Requirements

Video Output

Complete Example

Installation & Setup

Server Installation

Docker Deployment

Docker Compose

Best Practices

Audio Quality

Performance

Error Handling

Error Codes

Getting Started

Avatars

Agents

Sessions

WebRTC

​Endpoint

​Overview

​Connection

​Message Types

​Client → Server Messages

​1. WebRTC Offer

​2. Send Complete Audio

​3. Stream Audio Chunks

​4. End Audio Stream

​5. Interrupt Playback

​Server → Client Events

​Connection Established

​WebRTC Answer

​Generation Events

​Error Event

​Audio Requirements

​Video Output

​Complete Example

​Installation & Setup

​Server Installation

​Docker Deployment

​Docker Compose

​Best Practices

​Audio Quality

​Performance

​Error Handling

​Error Codes

Endpoint

Overview

Connection

Message Types

Client → Server Messages

1. WebRTC Offer

2. Send Complete Audio

3. Stream Audio Chunks

4. End Audio Stream

5. Interrupt Playback

Server → Client Events

Connection Established

WebRTC Answer

Generation Events

Error Event

Audio Requirements

Video Output

Complete Example

Installation & Setup

Server Installation

Docker Deployment

Docker Compose

Best Practices

Audio Quality

Performance

Error Handling

Error Codes