From 2e51afe36eaaf19f7cb4759ee01762149e3f963a Mon Sep 17 00:00:00 2001 From: MayamaTakeshi Date: Sun, 17 Mar 2024 16:30:38 +0900 Subject: [PATCH] #79: preserving work so far --- src/pjmedia/include/pjmedia/ws_speech_port.h | 6 +- src/pjmedia/src/pjmedia/ws_speech_port.c | 14 +- src/sip.cpp | 148 +++++++++++++------ 3 files changed, 118 insertions(+), 50 deletions(-) diff --git a/src/pjmedia/include/pjmedia/ws_speech_port.h b/src/pjmedia/include/pjmedia/ws_speech_port.h index 2bb66d8..f17061d 100644 --- a/src/pjmedia/include/pjmedia/ws_speech_port.h +++ b/src/pjmedia/include/pjmedia/ws_speech_port.h @@ -21,9 +21,13 @@ PJ_DEF(pj_status_t) pjmedia_ws_speech_port_create( pj_pool_t *pool, unsigned samples_per_frame, unsigned bits_per_sample, pj_websock_endpoint *ws_endpt, - char *server_url, + const char *server_url, + const char *voice, + const char *text, void (*cb)(pjmedia_port*, void *user_data, enum ws_speech_event, char *data), void *cb_user_data, + unsigned flags, + pj_bool_t end_of_speech_event, pjmedia_port **p_port); PJ_END_DECL diff --git a/src/pjmedia/src/pjmedia/ws_speech_port.c b/src/pjmedia/src/pjmedia/ws_speech_port.c index dbc2e94..a6ea1ca 100644 --- a/src/pjmedia/src/pjmedia/ws_speech_port.c +++ b/src/pjmedia/src/pjmedia/ws_speech_port.c @@ -24,6 +24,7 @@ #include #include #include +#include #define SIGNATURE PJMEDIA_SIGNATURE('w', 's', 's', 'p') #define THIS_FILE "ws_speech_port.c" @@ -131,9 +132,7 @@ static pj_bool_t on_rx_msg(pj_websock_t *c, static void on_state_change(pj_websock_t *c, int state) { char buf[1000]; - PJ_LOG(4, (THIS_FILE, "%s() %s %s", __FUNCTION__, - pj_websock_print(c, buf, sizeof(buf)), - pj_websock_state_str(state))); + printf("%s() %s %s", __FUNCTION__, pj_websock_print(c, buf, sizeof(buf)), pj_websock_state_str(state)); } @@ -156,9 +155,13 @@ PJ_DEF(pj_status_t) pjmedia_ws_speech_port_create(pj_pool_t *pool, unsigned samples_per_frame, unsigned bits_per_sample, struct pj_websock_endpoint *ws_endpt, - char *server_url, + const char *server_url, + const char *voice, + const char *text, void (*cb)(pjmedia_port*, void *user_data, enum ws_speech_event, char *transcript), void *cb_user_data, + unsigned flags, + pj_bool_t end_of_speech_event, pjmedia_port **p_port) { struct ws_speech_t *port; @@ -200,8 +203,7 @@ PJ_DEF(pj_status_t) pjmedia_ws_speech_port_create(pj_pool_t *pool, pj_websock_connect(port->ws_endpt, server_url, &ws_cb, port, &hdr, 1, &port->wc); } - TRACE_((THIS_FILE, "ws_speech port created: %u/%u/%u/%u", clock_rate, - channel_count, samples_per_frame, bits_per_sample)); + printf("ws_speech port created: %u/%u/%u/%u", clock_rate, channel_count, samples_per_frame, bits_per_sample); *p_port = &port->base; return PJ_SUCCESS; diff --git a/src/sip.cpp b/src/sip.cpp index 4326eea..9344f98 100644 --- a/src/sip.cpp +++ b/src/sip.cpp @@ -310,10 +310,15 @@ struct Subscription { bool initialized; }; +#define IMPLEMENTATION_FLITE 1 +#define IMPLEMENTATION_POCKETSPHINX 2 +#define IMPLEMENTATION_WS_SPEECH 3 + struct ConfBridgePort { unsigned slot; pjmedia_port *port; short connection_mode; + short implementation; }; #define FP_DTMFDET 0 @@ -631,7 +636,7 @@ bool prepare_dtmfdet(Call *call, AudioEndpoint *ae); bool prepare_wav_player(Call *call, AudioEndpoint *ae, const char *file, unsigned flags, bool end_of_file_event); bool prepare_wav_writer(Call *call, AudioEndpoint *ae, const char *file); bool prepare_fax(Call *call, AudioEndpoint *ae, bool is_sender, const char *file, unsigned flags); -bool prepare_speech_synth(Call *call, AudioEndpoint *ae, const char *voice, bool end_of_speech_event); +bool prepare_speech_synth(Call *call, AudioEndpoint *ae, const char *server_url, const char *voice, const char *text, unsigned flags, bool end_of_speech_event); bool prepare_speech_recog(Call *call, AudioEndpoint *ae); void prepare_error_event(ostringstream *oss, char *scope, char *details); @@ -914,6 +919,24 @@ static void on_speech_transcript(pjmedia_port*, void *user_data, char* transcrip dispatch_event(evt); } +static void on_ws_speech_event(pjmedia_port*, void *user_data, enum ws_speech_event e, char *data) { + if (g_shutting_down) + return; + + long call_id; + if (!g_call_ids.get_id((long)user_data, call_id)) { + addon_log( + L_DBG, + "on_ws_speech_event: Failed to get call_id. Event will not be notified.\n"); + return; + } + + char evt[1024]; + //make_evt_ws_speech_event(evt, sizeof(evt), call_id, data); + //dispatch_event(evt); +} + + void dispatch_event(const char *evt) { addon_log(L_DBG, "dispach_event called with evt=%s\n", evt); // g_event_sink(evt); @@ -3820,7 +3843,7 @@ int pjw_call_start_play_wav(long call_id, const char *json) { return 0; } -pj_status_t audio_endpoint_start_speech_synth(Call *call, AudioEndpoint *ae, const char * voice, const char *text, unsigned flags, bool end_of_speech_event) { +pj_status_t audio_endpoint_start_speech_synth(Call *call, AudioEndpoint *ae, const char *server_url, const char *voice, const char *text, unsigned flags, bool end_of_speech_event) { pj_status_t status; if(!ae->stream_cbp.port) { @@ -3834,12 +3857,10 @@ pj_status_t audio_endpoint_start_speech_synth(Call *call, AudioEndpoint *ae, con return -1; } - if (!prepare_speech_synth(call, ae, voice, end_of_speech_event)) { + if (!prepare_speech_synth(call, ae, server_url, voice, text, flags, end_of_speech_event)) { return -1; } - pjmedia_flite_port_speak(ae->feature_cbps[FP_SPEECH_SYNTH].port, text, flags); - return PJ_SUCCESS; } @@ -3859,6 +3880,8 @@ int pjw_call_start_speech_synth(long call_id, const char *json) { int media_id = -1; + char *server_url = (char*)""; + char *voice; char *text; @@ -3873,7 +3896,7 @@ int pjw_call_start_speech_synth(long call_id, const char *json) { Document document; - const char *valid_params[] = {"voice", "text", "media_id", "end_of_speech_event", "no_loop", ""}; + const char *valid_params[] = {"server_url", "voice", "text", "media_id", "end_of_speech_event", "no_loop", ""}; if (!g_call_ids.get(call_id, val)) { set_error("Invalid call_id"); @@ -3896,6 +3919,10 @@ int pjw_call_start_speech_synth(long call_id, const char *json) { goto out; } + if (json_get_string_param(document, "server_url", true, &server_url) <= 0) { + goto out; + } + if (json_get_string_param(document, "voice", false, &voice) <= 0) { goto out; } @@ -3937,7 +3964,7 @@ int pjw_call_start_speech_synth(long call_id, const char *json) { MediaEndpoint *me = (MediaEndpoint *)call->media[i]; if (me->type == ENDPOINT_TYPE_AUDIO) { AudioEndpoint *ae = (AudioEndpoint *)me->endpoint.audio; - status = audio_endpoint_start_speech_synth(call, ae, voice, text, flags, end_of_speech_event); + status = audio_endpoint_start_speech_synth(call, ae, server_url, voice, text, flags, end_of_speech_event); if (status != PJ_SUCCESS) goto out; } } @@ -3955,7 +3982,7 @@ int pjw_call_start_speech_synth(long call_id, const char *json) { ae = (AudioEndpoint *)me->endpoint.audio; - audio_endpoint_start_speech_synth(call, ae, voice, text, flags, end_of_speech_event); + audio_endpoint_start_speech_synth(call, ae, server_url, voice, text, flags, end_of_speech_event); } out: @@ -4004,16 +4031,6 @@ int pjw_call_start_speech_recog(long call_id, const char *json) { int media_id = -1; - char *voice; - - char *text; - - bool end_of_speech_event = false; - - unsigned flags = 0; - - bool no_loop = false; - char buffer[MAX_JSON_INPUT]; Document document; @@ -6687,8 +6704,6 @@ bool is_media_active(Call *call, MediaEndpoint *me) { void close_media_endpoint(Call *call, MediaEndpoint *me) { printf("close_media_endpoint %p\n", (void*)me); - pj_status_t status; - if(!me) return; if (ENDPOINT_TYPE_AUDIO == me->type) { @@ -6883,7 +6898,7 @@ bool prepare_fax(Call *call, AudioEndpoint *ae, bool is_sender, const char *file return connect_feature_port_to_stream_port(call, ae, fp); } -bool prepare_speech_synth(Call *call, AudioEndpoint *ae, const char *voice, bool end_of_speech_event) { +bool prepare_speech_synth(Call *call, AudioEndpoint *ae, const char *server_url, const char *voice, const char *text, unsigned flags, bool end_of_speech_event) { pj_status_t status; ConfBridgePort *fp = &ae->feature_cbps[FP_SPEECH_SYNTH]; @@ -6893,35 +6908,82 @@ bool prepare_speech_synth(Call *call, AudioEndpoint *ae, const char *voice, bool return true; } - status = pjmedia_flite_port_create( - call->inv->pool, PJMEDIA_PIA_SRATE(&ae->stream_cbp.port->info), - PJMEDIA_PIA_CCNT(&ae->stream_cbp.port->info), - PJMEDIA_PIA_SPF(&ae->stream_cbp.port->info), - PJMEDIA_PIA_BITS(&ae->stream_cbp.port->info), - voice, - &fp->port); - if (status != PJ_SUCCESS) { - set_error("pjmedia_flite_port_create failed"); - return false; - } + if(!server_url[0]) { + status = pjmedia_flite_port_create( + call->inv->pool, + PJMEDIA_PIA_SRATE(&ae->stream_cbp.port->info), + PJMEDIA_PIA_CCNT(&ae->stream_cbp.port->info), + PJMEDIA_PIA_SPF(&ae->stream_cbp.port->info), + PJMEDIA_PIA_BITS(&ae->stream_cbp.port->info), + voice, + &fp->port); + if (status != PJ_SUCCESS) { + set_error("pjmedia_flite_port_create failed"); + return false; + } + + if (end_of_speech_event) { + status = pjmedia_flite_port_set_eof_cb(fp->port, (void*)call, on_end_of_speech); + if (status != PJ_SUCCESS) { + set_error("pjmedia_flite_port_set_eof_cb failed"); + return false; + } + } - if (end_of_speech_event) { - status = pjmedia_flite_port_set_eof_cb(fp->port, (void*)call, on_end_of_speech); + status = pjmedia_conf_add_port(ae->conf, call->inv->pool, fp->port, NULL, &fp->slot); if (status != PJ_SUCCESS) { - set_error("pjmedia_flite_port_set_eof_cb failed"); + set_error("pjmedia_conf_add_port failed"); return false; } - } - status = pjmedia_conf_add_port(ae->conf, call->inv->pool, fp->port, NULL, &fp->slot); - if (status != PJ_SUCCESS) { - set_error("pjmedia_conf_add_port failed"); - return false; - } + fp->connection_mode = CONNECTION_MODE_SOURCE; - fp->connection_mode = CONNECTION_MODE_SOURCE; + status = connect_feature_port_to_stream_port(call, ae, fp); + if (status != PJ_SUCCESS) { + return false; + } - return connect_feature_port_to_stream_port(call, ae, fp); + pjmedia_flite_port_speak(ae->feature_cbps[FP_SPEECH_SYNTH].port, text, flags); + + ae->feature_cbps[FP_SPEECH_SYNTH].implementation = IMPLEMENTATION_FLITE; + } else { + status = pjmedia_ws_speech_port_create( + call->inv->pool, + PJMEDIA_PIA_SRATE(&ae->stream_cbp.port->info), + PJMEDIA_PIA_CCNT(&ae->stream_cbp.port->info), + PJMEDIA_PIA_SPF(&ae->stream_cbp.port->info), + PJMEDIA_PIA_BITS(&ae->stream_cbp.port->info), + g_ws_endpt, + server_url, + voice, + text, + on_ws_speech_event, + call, + flags, + end_of_speech_event, + &fp->port); + if (status != PJ_SUCCESS) { + set_error("pjmedia_flite_port_create failed"); + return false; + } + + status = pjmedia_conf_add_port(ae->conf, call->inv->pool, fp->port, NULL, &fp->slot); + if (status != PJ_SUCCESS) { + set_error("pjmedia_conf_add_port failed"); + return false; + } + + fp->connection_mode = CONNECTION_MODE_SOURCE; + + status = connect_feature_port_to_stream_port(call, ae, fp); + if (status != PJ_SUCCESS) { + return false; + } + + ae->feature_cbps[FP_SPEECH_SYNTH].implementation = IMPLEMENTATION_WS_SPEECH; + } + + return PJ_SUCCESS; } bool prepare_speech_recog(Call *call, AudioEndpoint *ae) {