Realtime
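The example below is a complete console walkthrough of the Realtime API with this library. `RealtimeAudioExample` drives the session (session configuration, event handlers, and function calling), `VoiceInput` captures microphone audio with NAudio and streams it to the server, and `VoiceOutput` plays back the audio the server returns. A minimal wiring sketch for running the example is shown at the end of the page.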
```csharp
using System.Text.Json;
using Betalgo.Ranul.OpenAI.Managers;
using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using Betalgo.Ranul.OpenAI.ObjectModels.SharedModels;
using NAudio.Wave; // required by VoiceInput below (WaveInEvent, AudioFileReader, MediaFoundationResampler)
namespace OpenAI.Playground.TestHelpers.RealtimeHelpers;
/// <summary>
/// A comprehensive example implementation of OpenAI's Realtime API for audio interactions.
/// This class demonstrates how to:
/// - Establish and maintain a WebSocket connection with OpenAI's Realtime server
/// - Handle bidirectional audio streaming
/// - Process transcriptions and responses
/// - Implement function calling capabilities
/// - Manage the full lifecycle of a realtime conversation
/// </summary>
public class RealtimeAudioExample : IDisposable
{
// Core services for the realtime interaction
private readonly IOpenAIRealtimeService _ai; // Manages the WebSocket connection and event handling
private readonly VoiceInput _voiceInput; // Handles audio input capture and processing
private readonly VoiceOutput _voiceOutput; // Manages audio output playback
/// <summary>
/// Initializes a new instance of the RealtimeAudioExample.
/// Sets up the necessary components for audio interaction with OpenAI's Realtime API.
/// </summary>
/// <param name="ai">The OpenAI Realtime service instance that will manage the WebSocket connection</param>
public RealtimeAudioExample(IOpenAIRealtimeService ai)
{
_ai = ai;
_voiceInput = new(_ai); // Initialize audio input handling
_voiceOutput = new(); // Initialize audio output handling
}
/// <summary>
/// Implements IDisposable to properly clean up resources.
/// This is crucial for releasing audio hardware and closing network connections.
/// </summary>
public void Dispose()
{
_voiceInput.Dispose(); // Release audio input resources
_voiceOutput.Dispose(); // Release audio output resources
_ai.Dispose(); // Close WebSocket connection and clean up
}
/// <summary>
/// Main execution method that orchestrates the entire realtime interaction.
/// This method:
/// 1. Sets up all necessary event handlers
/// 2. Establishes the WebSocket connection
/// 3. Configures the initial session parameters
/// 4. Handles user input for recording control
/// </summary>
public async Task Run()
{
// Initialize all event handlers before connecting
SetupEventHandlers();
// Establish WebSocket connection to OpenAI's Realtime server
// This creates a new session and prepares for bi-directional communication
await _ai.ConnectAsync();
// Configure the session with initial settings using session.update event
// This configuration defines how the AI will behave and what capabilities it has
await _ai.ClientEvents.Session.Update(new()
{
Session = new()
{
// Define the AI's personality and behavior
// This is similar to system messages in the regular Chat API
Instructions = "You are a great, upbeat friend. You make jokes all the time and your voice is full of joy.",
// Select the voice for audio responses
// Realtime API voices include 'alloy', 'echo', 'shimmer', and 'verse', among others
Voice = "verse",
// Enable both text and audio capabilities
// This allows the AI to respond with both text transcriptions and spoken audio
Modalities = ["text", "audio"],
// Define tools (functions) that the AI can call during conversation
// This example implements a weather checking function
Tools =
[
new()
{
Type = "function",
Name = "get_current_weather",
Description = "Get the current weather",
// Define the function parameters using JSON Schema
Parameters = PropertyDefinition.DefineObject(new Dictionary<string, PropertyDefinition>
{
// Location parameter is required
{ "location", PropertyDefinition.DefineString("The city and state, e.g. San Francisco, CA") },
// Unit parameter is optional but must be either celsius or fahrenheit
{ "unit", PropertyDefinition.DefineEnum(["celsius", "fahrenheit"], string.Empty) }
}, ["location"], null, null, null)
}
]
}
});
// Main interaction loop - Handle user commands for recording
Console.WriteLine("Press 'R' to start recording, 'S' to stop, 'Q' to quit");
while (true)
{
var key = Console.ReadKey(true).Key;
switch (key)
{
case ConsoleKey.R:
// Start capturing audio input
_voiceInput.StartRecording();
Console.WriteLine("Recording started...");
break;
case ConsoleKey.S:
// Stop recording and process the audio
await StopAndSendAudio();
break;
case ConsoleKey.Q:
// Exit the application
return;
}
}
}
/// <summary>
/// Handles the process of stopping audio recording and sending it to OpenAI.
/// This method:
/// 1. Stops the audio recording
/// 2. Commits the recorded audio buffer to create a user message
/// 3. Requests an AI response
/// </summary>
private async Task StopAndSendAudio()
{
// Stop capturing audio input
_voiceInput.StopRecording();
Console.WriteLine("Recording stopped.");
// Commit the audio buffer to create a user message
// This triggers the input_audio_buffer.commit event
await _ai.ClientEvents.InputAudioBuffer.Commit();
// Request an AI response for the committed audio
// This triggers the response.create event
await _ai.ClientEvents.Response.Create();
}
/// <summary>
/// Utility method to send pre-recorded audio files to the API.
/// This is useful for testing or processing existing audio files.
/// </summary>
/// <param name="filePath">Path to the audio file to be sent</param>
private async Task SendPreRecordedAudio(string filePath)
{
Console.WriteLine($"Sending pre-recorded audio: {filePath}");
// Send the audio file contents
await _voiceInput.SendAudioFile(filePath);
// Commit the audio buffer to create a user message
await _ai.ClientEvents.InputAudioBuffer.Commit();
}
/// <summary>
/// Sets up all event handlers for the realtime session.
/// This method configures handlers for:
/// - Audio input processing and transcription
/// - Speech detection
/// - AI response processing
/// - Function calls
/// - Error handling
///
/// Each event handler corresponds to specific server events as defined in the OpenAI Realtime API documentation.
/// </summary>
private void SetupEventHandlers()
{
// AUDIO INPUT HANDLING EVENTS
// Handle successful audio transcriptions
// This event is triggered when input audio is successfully converted to text
_ai.ServerEvents.Conversation.Item.InputAudioTranscription.OnCompleted += (sender, args) => {
Console.WriteLine($"Transcription completed: {args.Transcript}");
};
// Handle failed transcription attempts
// This helps identify issues with audio quality or processing
_ai.ServerEvents.Conversation.Item.InputAudioTranscription.OnFailed += (sender, args) => {
Console.WriteLine($"Transcription failed: {args.Error}");
};
// AUDIO BUFFER STATE EVENTS
// Triggered when audio buffer is successfully committed
// This indicates the audio has been properly sent to the server
_ai.ServerEvents.InputAudioBuffer.OnCommitted += (sender, args) => {
Console.WriteLine("Audio buffer committed.");
};
// Triggered when audio buffer is cleared
// This happens when starting fresh or discarding unused audio
_ai.ServerEvents.InputAudioBuffer.OnCleared += (sender, args) => {
Console.WriteLine("Audio buffer cleared.");
};
// SPEECH DETECTION EVENTS
// Handle speech end detection
// This helps in identifying when the user has finished speaking
_ai.ServerEvents.InputAudioBuffer.OnSpeechStopped += (sender, args) => {
Console.WriteLine("Speech stopped detected.");
};
// Handle speech start detection
// This is useful for implementing real-time interaction
_ai.ServerEvents.InputAudioBuffer.OnSpeechStarted += async (sender, args) =>
{
Console.WriteLine("Speech started detected.");
// Clear any ongoing audio output when user starts speaking
_voiceOutput.StopAndClear();
// Cancel any in-progress AI responses
// This ensures a more natural conversation flow
await _ai.ClientEvents.Response.Cancel();
};
// AI RESPONSE HANDLING EVENTS
// Handle incoming text transcripts from the AI
// This shows what the AI is saying in text form
_ai.ServerEvents.Response.AudioTranscript.OnDelta += (sender, args) =>
{
Console.ForegroundColor = ConsoleColor.DarkGreen;
Console.Write($"{args.Delta}");
Console.ResetColor();
};
// AUDIO OUTPUT HANDLING
// Process incoming audio data from the AI
// This handles the AI's voice response in chunks
_ai.ServerEvents.Response.Audio.OnDelta += (sender, args) =>
{
try
{
if (!string.IsNullOrEmpty(args.Delta))
{
// Convert base64 audio data to bytes and queue for playback
var audioData = Convert.FromBase64String(args.Delta);
_voiceOutput.EnqueueAudioData(audioData);
}
}
catch (Exception ex)
{
Console.WriteLine($"Error processing audio delta: {ex.Message}");
}
};
// Handle completion of audio response
_ai.ServerEvents.Response.Audio.OnDone += (sender, args) =>
{
Console.WriteLine();
Console.WriteLine("Audio response completed.");
};
// FUNCTION CALLING EVENTS
// Handle incoming function call arguments
// This shows the AI's attempts to use tools/functions
_ai.ServerEvents.Response.FunctionCallArguments.OnDelta += (sender, args) =>
{
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine($"Function call arguments delta: {args.Delta}");
Console.ResetColor();
};
// Process completed function calls
_ai.ServerEvents.Response.FunctionCallArguments.OnDone += async (sender, args) =>
{
if (args.Arguments != null)
{
Console.WriteLine($"Function call completed: {args.Arguments}");
// Handle weather function calls specifically
if (args.Name == "get_current_weather")
{
await HandleWeatherFunction(args.Arguments, args.CallId);
}
}
};
// ERROR HANDLING
// Global error handler for any API errors
_ai.ServerEvents.OnError += (sender, args) => {
Console.WriteLine($"Error: {args.Error.Message}");
};
// Debug event handler for all server events
//_ai.ServerEvents.OnAll += (sender, args) =>
//{
// Console.WriteLine($"Debug: {args}");
//};
}
/// <summary>
/// Handles weather function calls from the AI.
/// This method:
/// 1. Parses the function arguments
/// 2. Simulates a weather API call
/// 3. Returns the results to the AI
/// 4. Triggers a new response based on the weather data
/// </summary>
/// <param name="arguments">JSON string containing the function arguments</param>
/// <param name="callId">Unique identifier for the function call</param>
private async Task HandleWeatherFunction(string arguments, string callId)
{
try
{
// Parse the weather query arguments
var args = JsonSerializer.Deserialize<WeatherArgs>(arguments);
// Simulate getting weather data
// In a real application, this would call an actual weather API
var weatherResult = new
{
temperature = args.unit == "celsius" ? 22 : 72,
unit = args.unit,
description = "Sunny with light clouds",
location = args.location
};
// Send the weather data back to the conversation
// This creates a function_call_output item in the conversation
await _ai.ClientEvents.Conversation.Item.Create(new()
{
Item = new()
{
Type = ItemType.FunctionCallOutput,
CallId = callId,
Output = JsonSerializer.Serialize(weatherResult)
}
});
// Request a new AI response based on the weather data
await _ai.ClientEvents.Response.Create();
}
catch (Exception ex)
{
Console.WriteLine($"Error handling weather function: {ex.Message}");
}
}
/// <summary>
/// Data model for weather function arguments.
/// This class maps to the JSON schema defined in the function parameters.
/// </summary>
private class WeatherArgs
{
public string location { get; set; } // Required: city and state
public string unit { get; set; } // Optional: celsius or fahrenheit
}
}
/// <summary>
/// Handles voice input capture and processing for real-time communication with OpenAI's API.
/// This class manages audio recording, buffering, and transmission of audio data.
/// </summary>
public class VoiceInput : IDisposable
{
// Minimum amount of audio to buffer before sending (in milliseconds)
private const int MinimumBufferMs = 100;
// Buffer to store audio data before sending
private readonly List<byte> _audioBuffer;
// Reference to the OpenAI real-time service client
private readonly IOpenAIRealtimeService _client;
// NAudio's wave input device for capturing audio
private readonly WaveInEvent _waveIn;
// Flag to track recording state
private bool _isRecording;
/// <summary>
/// Initializes a new instance of VoiceInput with specified OpenAI client.
/// </summary>
/// <param name="client">The OpenAI real-time service client</param>
public VoiceInput(IOpenAIRealtimeService client)
{
_client = client;
// Configure audio input with specific format:
// - 24000 Hz sample rate
// - 16 bits per sample
// - 1 channel (mono)
_waveIn = new()
{
WaveFormat = new(24000, 16, 1),
BufferMilliseconds = 50 // How often to receive audio data
};
_audioBuffer = [];
_waveIn.DataAvailable += OnDataAvailable!;
}
/// <summary>
/// Releases resources used by the voice input system
/// </summary>
public void Dispose()
{
_waveIn.Dispose();
}
/// <summary>
/// Starts recording audio from the default input device
/// </summary>
public void StartRecording()
{
if (_isRecording) return;
_isRecording = true;
_audioBuffer.Clear();
_waveIn.StartRecording();
}
/// <summary>
/// Stops recording audio and sends any remaining buffered data
/// </summary>
public void StopRecording()
{
if (!_isRecording) return;
_isRecording = false;
_waveIn.StopRecording();
// Send any remaining buffered audio before stopping
if (_audioBuffer.Count > 0)
{
_client.ClientEvents.InputAudioBuffer.Append(_audioBuffer.ToArray());
_audioBuffer.Clear();
}
}
/// <summary>
/// Handles incoming audio data from the recording device
/// </summary>
private void OnDataAvailable(object sender, WaveInEventArgs e)
{
if (!_isRecording) return;
// Add new audio data to the buffer
_audioBuffer.AddRange(e.Buffer.Take(e.BytesRecorded));
// Calculate current buffer duration in milliseconds
var bufferDurationMs = _audioBuffer.Count * 1000.0 / _waveIn.WaveFormat.AverageBytesPerSecond;
// Only send when we have accumulated enough audio data
if (bufferDurationMs >= MinimumBufferMs)
{
_client.ClientEvents.InputAudioBuffer.Append(_audioBuffer.ToArray());
_audioBuffer.Clear();
}
}
/// <summary>
/// Sends an audio file to the OpenAI API by streaming it in chunks
/// </summary>
/// <param name="filePath">Path to the audio file to send</param>
public async Task SendAudioFile(string filePath)
{
using var audioFileReader = new AudioFileReader(filePath);
// Calculate buffer size based on minimum buffer duration
var bufferSize = (int)(audioFileReader.WaveFormat.AverageBytesPerSecond * (MinimumBufferMs / 1000.0));
var buffer = new byte[bufferSize];
int bytesRead;
// Read and send the file in chunks
while ((bytesRead = await audioFileReader.ReadAsync(buffer, 0, buffer.Length)) > 0)
{
if (bytesRead < buffer.Length)
{
// Handle the last chunk if it's smaller than the buffer
var lastBuffer = new byte[bytesRead];
Array.Copy(buffer, lastBuffer, bytesRead);
buffer = lastBuffer;
}
// Resample the audio to match required format and send
var resampledBuffer = ResampleAudio(buffer, bytesRead, audioFileReader.WaveFormat, _waveIn.WaveFormat);
await _client.ClientEvents.InputAudioBuffer.Append(resampledBuffer);
}
}
/// <summary>
/// Resamples audio data to match the target format required by the API
/// </summary>
/// <param name="buffer">Original audio data</param>
/// <param name="bytesRead">Number of bytes in the buffer</param>
/// <param name="sourceFormat">Original audio format</param>
/// <param name="targetFormat">Desired output format</param>
/// <returns>Resampled audio data</returns>
private static byte[] ResampleAudio(byte[] buffer, int bytesRead, WaveFormat sourceFormat, WaveFormat targetFormat)
{
// Skip resampling if formats match
if (sourceFormat.SampleRate == targetFormat.SampleRate &&
sourceFormat.BitsPerSample == targetFormat.BitsPerSample &&
sourceFormat.Channels == targetFormat.Channels)
{
var trimmedBuffer = new byte[bytesRead];
Array.Copy(buffer, trimmedBuffer, bytesRead);
return trimmedBuffer;
}
// Perform resampling using MediaFoundation
using var sourceStream = new RawSourceWaveStream(buffer, 0, bytesRead, sourceFormat);
using var resampler = new MediaFoundationResampler(sourceStream, targetFormat);
resampler.ResamplerQuality = 60; // Set high quality resampling
// Calculate and allocate buffer for resampled audio
var resampledBytes = (int)(bytesRead * ((double)targetFormat.AverageBytesPerSecond / sourceFormat.AverageBytesPerSecond));
var resampledBuffer = new byte[resampledBytes];
var resampledBytesRead = resampler.Read(resampledBuffer, 0, resampledBytes);
// Trim the buffer to actual size and return
var trimmedBuffer2 = new byte[resampledBytesRead];
Array.Copy(resampledBuffer, trimmedBuffer2, resampledBytesRead);
return trimmedBuffer2;
}
}
```
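As a quick sanity check on the buffering above (plain arithmetic, not part of the library): at the 24 kHz, 16-bit, mono capture format, the 100 ms minimum buffer corresponds to roughly 4,800 bytes per `Append` call.

```csharp
// Back-of-the-envelope check of the chunk size used by VoiceInput:
// 24,000 samples/s * 2 bytes/sample * 1 channel = 48,000 bytes per second,
// so a 100 ms chunk is 48,000 * 100 / 1000 = 4,800 bytes per Append call.
const int sampleRate = 24_000;
const int bytesPerSample = 2;
const int channels = 1;
const int minimumBufferMs = 100;
int bytesPerChunk = sampleRate * bytesPerSample * channels * minimumBufferMs / 1000;
Console.WriteLine(bytesPerChunk); // 4800
```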
```csharp
using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using NAudio.Wave;
namespace OpenAI.Playground.TestHelpers.RealtimeHelpers;
/// <summary>
/// Handles real-time audio playback for OpenAI's audio responses
/// Manages buffering and streaming of audio data
/// </summary>
public class VoiceOutput : IDisposable
{
// Core components for audio handling
private readonly BufferedWaveProvider _bufferedWaveProvider; // Manages audio data buffering
private readonly WaveOutEvent _waveOut; // Handles audio output device
private bool _isPlaying; // Tracks current playback status
/// <summary>
/// Initializes the voice output system with OpenAI's default audio settings
/// </summary>
public VoiceOutput()
{
// Initialize audio output device
_waveOut = new();
// Register for playback stopped events
_waveOut.PlaybackStopped += OnPlaybackStopped!;
// Configure audio buffer with OpenAI's default settings
_bufferedWaveProvider = new(new(
RealtimeConstants.Audio.DefaultSampleRate, // Standard sample rate
RealtimeConstants.Audio.DefaultBitsPerSample, // Bit depth for audio
RealtimeConstants.Audio.DefaultChannels // Number of audio channels
))
{
BufferLength = 10 * 1024 * 1024, // Set 10 MB buffer size for smooth playback
DiscardOnBufferOverflow = true // Prevent buffer overflow by discarding excess data
};
// Connect the buffer to the audio output
_waveOut.Init(_bufferedWaveProvider);
}
/// <summary>
/// Cleanup resources when object is disposed
/// </summary>
public void Dispose()
{
// Stop playback and release audio device resources
_waveOut.Stop();
_waveOut.Dispose();
}
/// <summary>
/// Add new audio data to the playback queue
/// Automatically starts playback if not already playing
/// </summary>
/// <param name="data">Raw audio data bytes to be played</param>
public void EnqueueAudioData(byte[]? data)
{
// Ignore empty or null data
if (data == null || data.Length == 0)
return;
// Add new audio data to the buffer
_bufferedWaveProvider.AddSamples(data, 0, data.Length);
// Start playback if not already playing
if (!_isPlaying)
{
_waveOut.Play();
_isPlaying = true;
}
}
/// <summary>
/// Stops playback and clears any remaining buffered audio
/// </summary>
public void StopAndClear()
{
// Stop playback if currently playing
if (_isPlaying)
{
_waveOut.Stop();
_isPlaying = false;
}
// Clear any remaining audio from buffer
_bufferedWaveProvider.ClearBuffer();
Console.WriteLine("Playback stopped and buffer cleared.");
}
/// <summary>
/// Event handler for when playback stops
/// Restarts playback if there's more data in buffer
/// </summary>
private void OnPlaybackStopped(object sender, StoppedEventArgs e)
{
// If there's more audio in the buffer, continue playing
if (_bufferedWaveProvider.BufferedBytes > 0)
{
_waveOut.Play();
}
// Otherwise, mark playback as stopped
else
{
_isPlaying = false;
}
}
}
```
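To run the example you need an `IOpenAIRealtimeService` instance. The sketch below is a minimal, hypothetical wiring, assuming the service can be constructed directly from an `OpenAIOptions` carrying your API key; check the library's README for the exact constructor or DI registration your version exposes.

```csharp
using Betalgo.Ranul.OpenAI.Managers;
using OpenAI.Playground.TestHelpers.RealtimeHelpers;

// Hypothetical wiring: the OpenAIOptions type and the OpenAIRealtimeService
// constructor used here are assumptions, not verified API surface.
var ai = new OpenAIRealtimeService(new OpenAIOptions { ApiKey = "YOUR_API_KEY" });
using var example = new RealtimeAudioExample(ai);
await example.Run();
```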