Skip to content

Commit

Permalink
add twilio integration
Browse files Browse the repository at this point in the history
* Add a RESTful endpoint that returns the TwiML for the media stream

* Add a WebSocket endpoint to handle Twilio messages

* Add a 'twilio' platform to the speech-to-text component to support Twilio audio bytes.
  • Loading branch information
yc1i committed Sep 12, 2023
1 parent 81a70ea commit 0e341de
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 0 deletions.
5 changes: 5 additions & 0 deletions realtime_ai_character/audio/speech_to_text/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
'max_alternatives': 1,
'enable_automatic_punctuation': True,
},
'twilio': {
'encoding': speech.RecognitionConfig.AudioEncoding.MULAW,
'sample_rate_hertz': 8000,
'language_code': 'en-uS',
}
})


Expand Down
18 changes: 18 additions & 0 deletions realtime_ai_character/audio/speech_to_text/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ def transcribe(self, audio_bytes, platform, prompt="", language="en-US", suppres
logger.info("Transcribing audio...")
if platform == "web":
audio = self._convert_webm_to_wav(audio_bytes, self.use == "local")
elif platform == "twilio":
audio = self._ulaw_to_wav(audio_bytes, self.use == "local")
else:
audio = self._convert_bytes_to_wav(audio_bytes, self.use == "local")
if self.use == "local":
Expand Down Expand Up @@ -104,3 +106,19 @@ def _convert_bytes_to_wav(self, audio_bytes, local=True):
audio = io.BytesIO(sr.AudioData(audio_bytes, 44100, 2).get_wav_data())
return audio
return sr.AudioData(audio_bytes, 44100, 2)

def _ulaw_to_wav(self, audio_bytes, local=True):
    """Convert raw 8 kHz mono mu-law audio into the form the recognizer expects.

    The input is headerless mu-law (one byte per sample), as selected by the
    'twilio' platform in ``transcribe``. Mu-law is a companded encoding, so
    the bytes must be expanded to linear PCM before they are labelled as PCM;
    wrapping them directly as 8-bit samples produces distorted audio.

    Args:
        audio_bytes: Raw mu-law encoded samples, 8000 Hz, single channel.
        local: When true, return an in-memory WAV file; otherwise return a
            ``sr.AudioData`` object (same convention as
            ``_convert_bytes_to_wav``).

    Returns:
        ``io.BytesIO`` holding a 16-bit PCM WAV file when ``local`` is true,
        else ``sr.AudioData`` holding the same 16-bit PCM samples.
    """
    # Function-scope import keeps this block self-contained. NOTE: audioop is
    # part of the standard library through Python 3.12 and was removed in 3.13
    # (PEP 594); on newer interpreters a backport must provide it.
    import audioop

    # Expand every mu-law byte into a signed 16-bit linear sample.
    pcm16 = audioop.ulaw2lin(audio_bytes, 2)
    audio_data = sr.AudioData(pcm16, 8000, 2)
    if local:
        return io.BytesIO(audio_data.get_wav_data())
    return audio_data

2 changes: 2 additions & 0 deletions realtime_ai_character/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from realtime_ai_character.character_catalog.catalog_manager import CatalogManager
from realtime_ai_character.memory.memory_manager import MemoryManager
from realtime_ai_character.restful_routes import router as restful_router
from realtime_ai_character.twilio.websocket import twilio_router
from realtime_ai_character.utils import ConnectionManager
from realtime_ai_character.websocket_routes import router as websocket_router

Expand All @@ -30,6 +31,7 @@

app.include_router(restful_router)
app.include_router(websocket_router)
app.include_router(twilio_router)

web_build_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'client', 'web', 'build')
Expand Down
11 changes: 11 additions & 0 deletions realtime_ai_character/twilio/ulaw_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

def is_mulaw_silence_bytes(byte8bit: bytes) -> bool:
    """Heuristically decide whether a chunk of 8-bit mu-law audio is silence.

    A byte counts as "quiet" when its value is above 250 (0xFB-0xFF). The
    chunk is reported as silence when strictly more than half of its bytes
    are quiet.

    Args:
        byte8bit: Raw mu-law samples, one byte per sample.

    Returns:
        True when the chunk is considered silent. An empty chunk carries no
        audio at all and is treated as silent instead of raising
        ZeroDivisionError.
    """
    total = len(byte8bit)
    if total == 0:
        return True

    quiet = sum(1 for sample in byte8bit if sample > 250)
    # Integer form of ``quiet / total > 0.5``; avoids float division.
    return quiet * 2 > total
163 changes: 163 additions & 0 deletions realtime_ai_character/twilio/websocket.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import asyncio
import base64
import collections
import json
import os
from functools import reduce
from typing import Awaitable, Callable, Optional

from fastapi import APIRouter, Depends, Request, Response, WebSocket, WebSocketDisconnect, Query
from pydub import AudioSegment
from twilio.twiml.voice_response import VoiceResponse, Connect

from realtime_ai_character.audio.speech_to_text import (SpeechToText,
                                                        get_speech_to_text)
from realtime_ai_character.audio.text_to_speech import (TextToSpeech,
                                                        get_text_to_speech)
from realtime_ai_character.character_catalog.catalog_manager import (
    CatalogManager, get_catalog_manager, Character)
from realtime_ai_character.llm import get_llm, LLM
from realtime_ai_character.logger import get_logger
from realtime_ai_character.twilio.ulaw_util import is_mulaw_silence_bytes
from realtime_ai_character.utils import get_connection_manager

# Logger for this module, named after the module itself.
logger = get_logger(__name__)

# Every route registered on this router is served under the /twilio prefix.
twilio_router = APIRouter(prefix="/twilio")

# Connection manager used to register and release websocket connections.
manager = get_connection_manager()

@twilio_router.get("/voice")
async def get_websocket(request: Request):
    """Return the TwiML document that connects a call to the media stream.

    The document contains a single Connect verb whose stream points at this
    application's ``/twilio/ws`` websocket route on the host the request was
    addressed to.

    Args:
        request: Incoming HTTP request; only its URL hostname is used.

    Returns:
        An ``application/xml`` response carrying the serialized TwiML.
    """
    # NOTE(review): only the hostname is used, so a non-default port in the
    # request URL is dropped from the stream URL -- confirm that is intended.
    # NOTE(review): this route answers GET only; verify the Twilio webhook is
    # configured to use GET.
    resp = VoiceResponse()

    connect = Connect()
    connect.stream(
        name='RealChar Endpoint',
        url=f'wss://{request.url.hostname}/twilio/ws',
    )
    resp.append(connect)

    return Response(content=str(resp), media_type="application/xml")

class AudioBytesBuffer():
    """Accumulate mu-law audio chunks and flush them after a run of silence.

    Every chunk passed to ``add_bytes`` is appended to an internal buffer.
    Once the buffer holds more than ``min_chunks`` chunks and the most recent
    ``silence_chunks`` chunks were all classified as silence, the buffered
    audio is joined into one ``bytes`` object, handed to the registered
    callback, and the buffer is reset.

    Args:
        min_chunks: The buffer must hold more than this many chunks before it
            may be flushed.
        silence_chunks: Number of consecutive silent chunks that marks the end
            of an utterance.
    """

    def __init__(self, min_chunks: int = 25, silence_chunks: int = 50):
        self._buffer = collections.deque()
        self._frame_count = 0
        self._silence_count = 0
        self._min_chunks = min_chunks
        self._silence_chunks = silence_chunks
        # Set by register_callback(); stays None until a consumer is attached.
        self._callback: Optional[Callable[[bytes], Awaitable[None]]] = None

    def register_callback(self, callback: Callable[[bytes], Awaitable[None]]):
        """Register the async callable that receives each flushed utterance."""
        self._callback = callback

    async def add_bytes(self, chunk: bytes):
        """Buffer one chunk and flush to the callback when an utterance ends.

        Args:
            chunk: Raw mu-law audio bytes for one media frame.
        """
        if is_mulaw_silence_bytes(chunk):
            self._silence_count += 1
        else:
            self._silence_count = 0  # speech interrupts the silent run
        self._buffer.append(chunk)
        self._frame_count += 1

        utterance_finished = (len(self._buffer) > self._min_chunks
                              and self._silence_count >= self._silence_chunks)
        if not utterance_finished:
            return

        if self._callback is None:
            # No consumer attached: drop the audio rather than failing with an
            # AttributeError or letting the buffer grow without bound.
            logger.error("no callback registered; dropping buffered audio")
            self.reset()
            return

        logger.info("going to invoke callback")
        answer = b"".join(self._buffer)
        try:
            await self._callback(answer)
        finally:
            # Start the next utterance from a clean state even if the
            # callback raised.
            self.reset()

    def reset(self):
        """Discard buffered audio and restore all counters to their initial state."""
        self._buffer.clear()
        self._frame_count = 0
        # Without this, the stale silent run would re-trigger a flush as soon
        # as the buffer refills, even if only more silence arrived.
        self._silence_count = 0



@twilio_router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket,
                             llm_model: str = Query(default=os.getenv(
                                 'LLM_MODEL_USE', 'gpt-3.5-turbo-16k')),
                             language: str = Query(default='en-US'),
                             catalog_manager=Depends(get_catalog_manager),
                             speech_to_text=Depends(get_speech_to_text),
                             default_text_to_speech=Depends(get_text_to_speech)):
    """Websocket route that hands a Twilio media stream to ``handle_receive``.

    The connection is registered with the connection manager, the 'loki'
    character is looked up in the catalog, and the receive loop is run as a
    task until it finishes. A ``WebSocketDisconnect`` escaping the task
    releases the connection from the manager.
    """
    llm = get_llm(model=llm_model)
    await manager.connect(websocket)
    character = catalog_manager.get_character('loki')
    try:
        receive_task = asyncio.create_task(
            handle_receive(
                websocket=websocket,
                llm=llm,
                catalog_manager=catalog_manager,
                character=character,
                language=language,
                speech_to_text=speech_to_text,
                default_text_to_speech=default_text_to_speech,
            ))
        await asyncio.gather(receive_task)
    except WebSocketDisconnect:
        await manager.disconnect(websocket)


async def handle_receive(websocket: WebSocket, llm: LLM,
                         catalog_manager: CatalogManager, character: Character,
                         language: str, speech_to_text: SpeechToText,
                         default_text_to_speech: TextToSpeech):
    """Consume Twilio media-stream messages from *websocket* until the stream ends.

    Each message is expected to be a JSON text frame with an ``event`` field:

    * ``connected`` / ``start`` -- logged and otherwise ignored.
    * ``media`` -- the base64 payload is decoded and fed to an
      ``AudioBytesBuffer``; whenever the buffer flushes an utterance it is
      transcribed with ``speech_to_text`` and the transcript is logged.
    * ``stop`` -- the websocket is closed and the loop ends.

    The loop also ends when the socket reports anything other than a
    ``websocket.receive`` message or when a frame is not valid JSON. In every
    case the connection is released from the connection manager exactly once
    before this coroutine returns or raises.

    Args:
        websocket: Accepted websocket carrying the Twilio stream.
        llm, catalog_manager, character, language, default_text_to_speech:
            Accepted for the caller's convenience; not used by this function.
        speech_to_text: Transcriber whose ``transcribe`` is called with
            ``platform='twilio'``.
    """
    buffer = AudioBytesBuffer()

    async def audio_buffer_callback(binary_data: bytes):
        logger.info("callback invoked")
        # Run the transcription in a worker thread so the event loop keeps
        # servicing the websocket meanwhile.
        transcript: str = (await asyncio.to_thread(
            speech_to_text.transcribe, binary_data,
            platform='twilio')).strip()
        logger.info(f"Receive transcription: {transcript}")

    buffer.register_callback(audio_buffer_callback)

    try:
        while True:
            data = await websocket.receive()

            # Anything other than a data frame (e.g. a disconnect message)
            # ends the stream; receive() must not be called again after it.
            if data['type'] != 'websocket.receive':
                break

            msg = data.get("text")
            if msg is None:
                # Binary frame: the messages handled here are JSON text.
                logger.error("Twilio message is not a text frame")
                continue

            try:
                obj = json.loads(msg)
            except ValueError:
                logger.error("Twilio message can not be parsed to json")
                break

            event = obj.get("event") if isinstance(obj, dict) else None

            # {"event": "connected", "protocol": "Call", "version": "1.0.0"}
            if event == "connected":
                logger.info("Receive twilio connect event")
                continue

            if event == "start":
                logger.info(
                    f"websocket receives twilio payload: {obj}")
                logger.info("Receive twilio start event")
                continue

            if event == "media":
                media = obj["media"]
                chunk = base64.b64decode(media["payload"])
                await buffer.add_bytes(chunk)
                continue

            if event == "stop":
                logger.info("Receive twilio stop event")
                # close() is a coroutine; it has no effect unless awaited.
                await websocket.close()
                break

            # Any other event type is ignored.
    except WebSocketDisconnect:
        logger.info("Twilio websocket disconnected")
    finally:
        # Pair the caller's manager.connect() with a single disconnect on
        # every exit path, including the "stop" event.
        await manager.disconnect(websocket)

0 comments on commit 0e341de

Please sign in to comment.