Skip to content

Commit

Permalink
fix/confirmation_state (#125)
Browse files Browse the repository at this point in the history
* refactor/drop_confirmation_state

closes #107

* skip listen_sound from STT buffer

* validate source

* get default sound duration from the file itself if available

* get default sound duration from the file itself if available

* log

* more logs

* resolve sound uris

* new_util/get_sound_duration

* fix sound destination

* utils 0.0.38 compat

* fix tests

* test
  • Loading branch information
JarbasAl authored Jun 20, 2024
1 parent 42c2ce5 commit 1f0f99a
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 44 deletions.
52 changes: 27 additions & 25 deletions ovos_dinkum_listener/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
import wave
from enum import Enum
from hashlib import md5
from os.path import dirname
from pathlib import Path
from tempfile import NamedTemporaryFile
from threading import Thread, RLock, Event, Timer
from threading import Thread, RLock, Event

import speech_recognition as sr
from distutils.spawn import find_executable
Expand All @@ -40,13 +41,19 @@
from ovos_dinkum_listener.transformers import AudioTransformersService
from ovos_dinkum_listener.voice_loop import DinkumVoiceLoop, ListeningMode, ListeningState
from ovos_dinkum_listener.voice_loop.hotwords import HotwordContainer

try:
from ovos_backend_client.api import DatasetApi
except ImportError:
LOG.info("`ovos-backend-client` is not installed. Upload is disabled")
DatasetApi = None

# Optional import: fall back to a stub when ovos_utils.sound does not provide
# get_sound_duration (presumably older ovos-utils releases - see the version
# named in the error message below).
try:
    from ovos_utils.sound import get_sound_duration
except ImportError:

    def get_sound_duration(*args, **kwargs):
        """Fallback stub used when the real helper cannot be imported.

        Accepts any arguments so that it is call-compatible with the real
        function, and never returns a value.

        Raises:
            ImportError: always, naming the ovos-utils version to install.
        """
        raise ImportError("please install ovos-utils>=0.1.0a25")

# Seconds between systemd watchdog updates
WATCHDOG_DELAY = 0.5

Expand Down Expand Up @@ -254,7 +261,7 @@ def _init_voice_loop(self, listener_config: dict):
fallback_stt=self.fallback_stt,
vad=self.vad,
transformers=self.transformers,
#
instant_listen=listener_config.get("instant_listen"),
speech_seconds=listener_config.get("speech_begin", 0.3),
silence_seconds=listener_config.get("silence_end", 0.7),
timeout_seconds=listener_config.get("recording_timeout", 10),
Expand Down Expand Up @@ -611,19 +618,12 @@ def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
event = ww_context.get("event")

if sound:
context = {'client_name': 'ovos_dinkum_listener',
'source': 'listener',
'destination': ["audio"] # default native-source
}
LOG.debug(f"Handling listen sound: {sound}")
audio_context = dict(context)
audio_context["destination"] = ["audio"]
self.bus.emit(Message("mycroft.audio.play_sound",
{"uri": sound, "force_unmute": True},
context))
if not listener.get("instant_listen"):
self.voice_loop.state = ListeningState.CONFIRMATION
self.voice_loop.confirmation_event.clear()
Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()

audio_context))
if listen:
msg_type = "recognizer_loop:wakeword"
payload["utterance"] = \
Expand Down Expand Up @@ -782,18 +782,17 @@ def _handle_listen(self, message: Message):
if self.config.get('confirm_listening'):
sound = self.config.get('sounds', {}).get('start_listening')
if sound:
context = {'client_name': 'ovos_dinkum_listener',
'source': 'listener',
'destination': ["audio"] # default native-source
}
message = message or Message("", context=context) # might be None
self.bus.emit(message.forward("mycroft.audio.play_sound", {"uri": sound}))
if not self.config["listener"].get("instant_listen"):
self.voice_loop.state = ListeningState.CONFIRMATION
self.voice_loop.confirmation_event.clear()
Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
else:
self.voice_loop.state = ListeningState.BEFORE_COMMAND
self.voice_loop.state = ListeningState.CONFIRMATION
try:
if sound.startswith("snd/"):
dur = get_sound_duration(sound, base_dir=f"{dirname(__file__)}/res")
else:
dur = get_sound_duration(sound)
LOG.debug(f"{sound} duration: {dur} seconds")
self.voice_loop.confirmation_seconds_left = dur
except:
self.voice_loop.confirmation_seconds_left = self.voice_loop.confirmation_seconds
else:
self.voice_loop.state = ListeningState.BEFORE_COMMAND

Expand Down Expand Up @@ -879,8 +878,11 @@ def _handle_wake_up(self, message: Message):

def _handle_sound_played(self, message: Message):
"""Handle response message from audio service."""
if not self._validate_message_context(message) or not self.voice_loop.running:
# ignore this sound, it is targeted to an external client
return
if self.voice_loop.state == ListeningState.CONFIRMATION:
self.voice_loop.confirmation_event.set()
self.voice_loop.state = ListeningState.BEFORE_COMMAND

def _handle_b64_audio(self, message: Message):
""" transcribe base64 encoded audio """
Expand Down
21 changes: 20 additions & 1 deletion ovos_dinkum_listener/voice_loop/hotwords.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from enum import Enum
from os.path import dirname
from threading import Event
from typing import Optional

from ovos_config import Configuration
from ovos_plugin_manager.wakewords import OVOSWakeWordFactory, HotWordEngine
from ovos_utils.log import LOG
from ovos_utils.fakebus import FakeBus
from ovos_utils.log import LOG
# Optional import: fall back to a stub when ovos_utils.sound does not provide
# get_sound_duration (presumably older ovos-utils releases - see the version
# named in the error message below).
try:
    from ovos_utils.sound import get_sound_duration
except ImportError:

    def get_sound_duration(*args, **kwargs):
        """Fallback stub used when the real helper cannot be imported.

        Accepts any arguments so that it is call-compatible with the real
        function, and never returns a value.

        Raises:
            ImportError: always, naming the ovos-utils version to install.
        """
        raise ImportError("please install ovos-utils>=0.1.0a25")


class HotWordException(RuntimeWarning):
Expand Down Expand Up @@ -174,6 +181,18 @@ def load_hotword_engines(self):
"listen": listen,
"wakeup": wakeup,
"stopword": stopword}
if sound:
try:
if sound.startswith("snd/"):
dur = get_sound_duration(sound,
base_dir=f"{dirname(dirname(__file__))}/res")
else:
dur = get_sound_duration(sound)
LOG.debug(f"{sound} duration: {dur} seconds")
self._plugins[word]["sound_duration"] = dur
except:
pass

except Exception as e:
LOG.error("Failed to load hotword: " + word)

Expand Down
52 changes: 38 additions & 14 deletions ovos_dinkum_listener/voice_loop/voice_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def debiased_energy(audio_data: bytes, sample_width: int) -> float:
@dataclass
class ChunkInfo:
    """Metadata flags and measurements attached to a single audio chunk."""

    # presumably True when the chunk was classified as speech - TODO confirm
    # against the code that assigns it (not visible in this view)
    is_speech: bool = False
    # set to True by DinkumVoiceLoop._confirmation_sound for chunks handled
    # while the loop is in the CONFIRMATION (listen sound) state
    is_listen_sound: bool = False
    # presumably the energy measured for the chunk (see debiased_energy) -
    # TODO confirm; the assignment is not visible in this view
    energy: float = 0.0


Expand All @@ -106,17 +107,20 @@ class DinkumVoiceLoop(VoiceLoop):
speech_seconds: float = 0.3
silence_seconds: float = 0.7
timeout_seconds: float = 10.0
timeout_seconds_with_silence: float = 5.0
timeout_seconds_with_silence: float = 5.0
confirmation_seconds: float = 0.5
num_stt_rewind_chunks: int = 2
num_hotword_keep_chunks: int = 15
remove_silence: bool = False
instant_listen: bool = False
skip_next_wake: bool = False
hotword_chunks: Deque = field(default_factory=deque)
stt_chunks: Deque = field(default_factory=deque)
stt_audio_bytes: bytes = bytes()
last_ww: float = -1.0
speech_seconds_left: float = 0.0
silence_seconds_left: float = 0.0
confirmation_seconds_left: float = 0.0
timeout_seconds_left: float = 0.0
timeout_seconds_with_silence_left: float = 0.0
recording_seconds_with_silence_left: float = 0.0
Expand Down Expand Up @@ -181,7 +185,6 @@ def run(self):
self.timeout_seconds_left = self.timeout_seconds
self.timeout_seconds_with_silence_left = self.timeout_seconds_with_silence
self.state = ListeningState.DETECT_WAKEWORD
self.confirmation_event = Event()

# Keep hotword/STT audio so they can (optionally) be saved to disk
self.hotword_chunks = deque(maxlen=self.num_hotword_keep_chunks)
Expand Down Expand Up @@ -248,12 +251,10 @@ def run(self):
self._before_wakeup(chunk)
elif self.state == ListeningState.CHECK_WAKE_UP:
self._detect_wakeup(chunk)

# set either by timeout (0.5) or by ovos-audio response
elif self.state == ListeningState.CONFIRMATION and \
self.confirmation_event.is_set():
self.state = ListeningState.BEFORE_COMMAND
LOG.debug(f"STATE: {self.state}")

elif self.state == ListeningState.CONFIRMATION:
LOG.debug("playing listen sound")
self._confirmation_sound(chunk)

elif self.state == ListeningState.BEFORE_COMMAND:
LOG.debug("waiting for speech")
Expand Down Expand Up @@ -485,14 +486,15 @@ def _detect_ww(self, chunk: bytes) -> bool:
ww = self.hotwords.found()
if ww:
LOG.debug(f"Wake word detected={ww}")
ww_data = self.hotwords.get_ww(ww)

# Callback to handle recorded hotword audio
if self.listenword_audio_callback is not None:
hotword_audio_bytes = bytes()
while self.hotword_chunks:
hotword_audio_bytes += self.hotword_chunks.popleft()

self.listenword_audio_callback(hotword_audio_bytes,
self.hotwords.get_ww(ww))
self.listenword_audio_callback(hotword_audio_bytes, ww_data)

self.hotword_chunks.clear()

Expand All @@ -504,18 +506,22 @@ def _detect_ww(self, chunk: bytes) -> bool:
if self.listen_mode == ListeningMode.SLEEPING:
# Wake word detected, begin detecting "wake up" word
self.state = ListeningState.CHECK_WAKE_UP
LOG.debug(f"STATE: {self.state}")
else:
# Wake word detected, begin recording voice command
if not self.state == ListeningState.CONFIRMATION:
if ww_data.get("sound"):
self.state = ListeningState.CONFIRMATION
# derive timeout from sound file length if possible
dur = ww_data.get("sound_duration", self.confirmation_seconds)
self.confirmation_seconds_left = dur
else:
self.state = ListeningState.BEFORE_COMMAND
LOG.debug(f"STATE: {self.state}")
# Wake word detected, begin recording voice command
self.reset_speech_timer()
self.stt_audio_bytes = bytes()
self.stt.stream_start()
if self.fallback_stt is not None:
self.fallback_stt.stream_start()

LOG.debug(f"STATE: {self.state}")
self.last_ww = time.time()
self.transformers.feed_hotword(chunk)
return True
Expand Down Expand Up @@ -557,6 +563,24 @@ def _wait_cmd(self, chunk: bytes):
self.stt_audio_bytes += chunk
self.stt_chunks.append(chunk)

def _confirmation_sound(self, chunk: bytes):
    """
    Handle an audio chunk received while in the CONFIRMATION state.

    The chunk is always flagged as listen-sound audio. When `instant_listen`
    is off, the chunk is only fed to the audio transformers (it is not added
    to the STT buffer here) and the confirmation countdown is reduced by one
    chunk duration; once the countdown reaches zero the loop moves to
    BEFORE_COMMAND. When `instant_listen` is on, the countdown is cancelled,
    the loop moves to BEFORE_COMMAND immediately and this same chunk is
    handed to `_before_cmd`.

    @param chunk: raw audio chunk read from the microphone
    """
    self._chunk_info.is_listen_sound = True

    if not self.instant_listen:
        # Listen sound is still playing: keep the transformers fed, but
        # leave the chunk out of the STT buffer
        self.transformers.feed_audio(chunk)
        self.confirmation_seconds_left -= self.mic.seconds_per_chunk
        if self.confirmation_seconds_left <= 0:
            # Countdown elapsed, start waiting for the voice command
            self.state = ListeningState.BEFORE_COMMAND
            LOG.debug(f"STATE: {self.state}")
        return

    # instant_listen: do not wait for the listen sound to finish
    LOG.debug("instant_listen is on")
    self.confirmation_seconds_left = 0
    self.state = ListeningState.BEFORE_COMMAND
    LOG.debug(f"STATE: {self.state}")
    self._before_cmd(chunk)

def _before_cmd(self, chunk: bytes):
"""
Handle audio chunks after WW detection or listen triggered, before VAD
Expand Down
7 changes: 3 additions & 4 deletions test/unittests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from unittest.mock import Mock, patch

from ovos_utils.messagebus import FakeBus
from ovos_bus_client.message import Message
from ovos_utils.process_utils import ProcessState


Expand Down Expand Up @@ -286,7 +287,7 @@ def test_handle_listen(self):
self.service.voice_loop.reset_speech_timer = Mock()
self.service.voice_loop.confirmation_event = Event()

self.service._handle_listen(None)
self.service._handle_listen(Message(""))
self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), False)
self.service.voice_loop.reset_speech_timer.assert_called_once()
self.service.voice_loop.reset_speech_timer.reset_mock()
Expand All @@ -295,13 +296,11 @@ def test_handle_listen(self):
self.service.voice_loop.stt.stream_start.assert_called_once()
self.service.voice_loop.stt.stream_start.reset_mock()
self.assertEqual(self.service.voice_loop.state, ListeningState.CONFIRMATION)
sleep(1)
self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), True)

self.service.voice_loop.state = ListeningState.DETECT_WAKEWORD
self.service.config["confirm_listening"] = False

self.service._handle_listen(None)
self.service._handle_listen(Message(""))
self.assertEqual(self.service.config["confirm_listening"], False)
self.service.voice_loop.reset_speech_timer.assert_called_once()
self.service.voice_loop.reset_speech_timer.reset_mock()
Expand Down

0 comments on commit 1f0f99a

Please sign in to comment.