-
Notifications
You must be signed in to change notification settings - Fork 738
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add Restful endpoint to return TwiML media stream * Add Websocket endpoint to handle Twilio messages * Add 'twilio' platform in speech to text component to support twilio audio bytes.
- Loading branch information
Showing
5 changed files
with
199 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
|
||
def is_mulaw_silence_bytes(byte8bit: bytes): | ||
count0xff = 0 | ||
count = 0 | ||
|
||
for b in byte8bit: | ||
count += 1 | ||
if b > 250: | ||
count0xff += 1 | ||
|
||
return (count0xff / count) > 0.5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import asyncio | ||
import os | ||
import json | ||
import base64 | ||
import collections | ||
|
||
from pydub import AudioSegment | ||
from functools import reduce | ||
from fastapi import APIRouter, Depends, Request, Response, WebSocket, WebSocketDisconnect, Query | ||
from twilio.twiml.voice_response import VoiceResponse, Connect | ||
|
||
from typing import Callable | ||
|
||
from realtime_ai_character.twilio.ulaw_util import is_mulaw_silence_bytes | ||
|
||
from realtime_ai_character.audio.speech_to_text import (SpeechToText, | ||
get_speech_to_text) | ||
from realtime_ai_character.audio.text_to_speech import (TextToSpeech, | ||
get_text_to_speech) | ||
from realtime_ai_character.character_catalog.catalog_manager import ( | ||
CatalogManager, get_catalog_manager, Character) | ||
from realtime_ai_character.llm import get_llm, LLM | ||
from realtime_ai_character.logger import get_logger | ||
from realtime_ai_character.utils import get_connection_manager | ||
|
||
logger = get_logger(__name__) | ||
|
||
twilio_router = APIRouter( | ||
prefix="/twilio", | ||
) | ||
|
||
manager = get_connection_manager() | ||
|
||
@twilio_router.get("/voice") | ||
async def get_websocket(request: Request): | ||
# Start our TwiML response | ||
resp = VoiceResponse() | ||
|
||
request.url.hostname | ||
connect = Connect() | ||
connect.stream( | ||
name = 'RealChar Endpoint', | ||
url=f'wss://{request.url.hostname}/twilio/ws' | ||
) | ||
resp.append(connect) | ||
|
||
return Response(content=str(resp), media_type="application/xml") | ||
|
||
class AudioBytesBuffer(): | ||
def __init__(self): | ||
self._buffer = collections.deque() | ||
self._frame_count = 0 | ||
self._silence_count = 0 | ||
|
||
def register_callback(self, callback: Callable[[bytes], None]): | ||
self._callback = callback | ||
|
||
async def add_bytes(self, chunk: bytes): | ||
if is_mulaw_silence_bytes(chunk): | ||
self._silence_count += 1 | ||
else: | ||
self._silence_count = 0 # reset | ||
self._buffer.append(chunk) | ||
self._frame_count += 1 | ||
|
||
if len(self._buffer) > 25 and self._silence_count >= 50: | ||
logger.info("going to invoke callback") | ||
answer = reduce(lambda x, y: x + y, self._buffer) | ||
# call the callback func | ||
await self._callback(answer) | ||
self.reset() | ||
|
||
def reset(self): | ||
self._buffer.clear() | ||
self._frame_count = 0 | ||
|
||
|
||
|
||
@twilio_router.websocket("/ws") | ||
async def websocket_endpoint(websocket: WebSocket, | ||
llm_model: str = Query(default=os.getenv( | ||
'LLM_MODEL_USE', 'gpt-3.5-turbo-16k')), | ||
language: str = Query(default='en-US'), | ||
catalog_manager=Depends(get_catalog_manager), | ||
speech_to_text=Depends(get_speech_to_text), | ||
default_text_to_speech=Depends(get_text_to_speech)): | ||
llm = get_llm(model=llm_model) | ||
await manager.connect(websocket) | ||
character = catalog_manager.get_character('loki') | ||
try: | ||
main_task = asyncio.create_task( | ||
handle_receive(websocket, llm, catalog_manager, | ||
character, language, speech_to_text, default_text_to_speech)) | ||
await asyncio.gather(main_task) | ||
|
||
except WebSocketDisconnect: | ||
await manager.disconnect(websocket) | ||
|
||
|
||
async def handle_receive(websocket: WebSocket, llm: LLM, | ||
catalog_manager: CatalogManager, character: Character, | ||
language: str, speech_to_text: SpeechToText, | ||
default_text_to_speech: TextToSpeech): | ||
buffer = AudioBytesBuffer() | ||
|
||
async def audio_buffer_callback(binary_data: bytes): | ||
logger.info("callback invoked") | ||
sound = AudioSegment( | ||
data=binary_data, | ||
sample_width=1, | ||
frame_rate=8000, | ||
channels=1 | ||
) | ||
sound.export("output.wav", format="wav", codec="pcm_mulaw") | ||
# run transcribe | ||
transcript: str = (await asyncio.to_thread( | ||
speech_to_text.transcribe, binary_data, | ||
platform='twilio')).strip() | ||
logger.info(f"Receive transcription: {transcript}") | ||
|
||
buffer.register_callback(audio_buffer_callback) | ||
|
||
while True: | ||
try: | ||
# expect twilio to send connect event | ||
data = await websocket.receive() | ||
|
||
if data['type'] != 'websocket.receive': | ||
raise WebSocketDisconnect('disconnected') | ||
|
||
msg = data["text"] | ||
try: | ||
obj = json.loads(msg) | ||
except ValueError: | ||
logger.error("Twilio message can not be parsed to json") | ||
raise WebSocketDisconnect('disconnected') | ||
|
||
# {"event": "connected", "protocol": "Call", "version": "1.0.0" | ||
if obj["event"] == "connected": | ||
logger.info("Receive twilio connect event") | ||
continue | ||
|
||
if obj["event"] == "start": | ||
logger.info( | ||
f"websocket receives twilio payload: {obj}") | ||
logger.info("Receive twilio start event") | ||
continue | ||
|
||
if obj["event"] == "media": | ||
# logger.info("Receive twilio media event") | ||
media = obj["media"] | ||
chunk = base64.b64decode(media["payload"]) | ||
await buffer.add_bytes(bytes(chunk)) | ||
continue | ||
|
||
if obj["event"] == "stop": | ||
logger.info("Receive twilio stop event") | ||
websocket.close() | ||
break | ||
|
||
except WebSocketDisconnect: | ||
await manager.disconnect(websocket) | ||
|