fix/confirmation_state (#125)

* refactor/drop_confirmation_state closes #107
* skip listen_sound from STT buffer
* validate source
* get default sound duration from the file itself if available
* get default sound duration from the file itself if available
* log
* more logs
* resolve sound uris
* new_util/get_sound_duration
* fix sound destination
* utils 0.0.38 compat
* fix tests
* test
OpenVoiceOS · Jun 20, 2024 · 1f0f99a · 1f0f99a
1 parent 42c2ce5
commit 1f0f99a
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 44 deletions.
diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py
@@ -16,9 +16,10 @@
 import wave
 from enum import Enum
 from hashlib import md5
+from os.path import dirname
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from threading import Thread, RLock, Event, Timer
+from threading import Thread, RLock, Event
 
 import speech_recognition as sr
 from distutils.spawn import find_executable
@@ -40,13 +41,19 @@
 from ovos_dinkum_listener.transformers import AudioTransformersService
 from ovos_dinkum_listener.voice_loop import DinkumVoiceLoop, ListeningMode, ListeningState
 from ovos_dinkum_listener.voice_loop.hotwords import HotwordContainer
-
 try:
     from ovos_backend_client.api import DatasetApi
 except ImportError:
     LOG.info("`ovos-backend-client` is not installed. Upload is disabled")
     DatasetApi = None
 
+try:
+    from ovos_utils.sound import get_sound_duration
+except ImportError:
+
+    def get_sound_duration(*args, **kwargs):
+        raise ImportError("please install ovos-utils>=0.1.0a25")
+
 # Seconds between systemd watchdog updates
 WATCHDOG_DELAY = 0.5
 
@@ -254,7 +261,7 @@ def _init_voice_loop(self, listener_config: dict):
                 fallback_stt=self.fallback_stt,
                 vad=self.vad,
                 transformers=self.transformers,
-                #
+                instant_listen=listener_config.get("instant_listen"),
                 speech_seconds=listener_config.get("speech_begin", 0.3),
                 silence_seconds=listener_config.get("silence_end", 0.7),
                 timeout_seconds=listener_config.get("recording_timeout", 10),
@@ -611,19 +618,12 @@ def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
             event = ww_context.get("event")
 
             if sound:
-                context = {'client_name': 'ovos_dinkum_listener',
-                           'source': 'listener',
-                           'destination': ["audio"]  # default native-source
-                           }
                 LOG.debug(f"Handling listen sound: {sound}")
+                audio_context = dict(context)
+                audio_context["destination"] = ["audio"]
                 self.bus.emit(Message("mycroft.audio.play_sound",
                                       {"uri": sound, "force_unmute": True},
-                                      context))
-                if not listener.get("instant_listen"):
-                    self.voice_loop.state = ListeningState.CONFIRMATION
-                    self.voice_loop.confirmation_event.clear()
-                    Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
-
+                                      audio_context))
             if listen:
                 msg_type = "recognizer_loop:wakeword"
                 payload["utterance"] = \
@@ -782,18 +782,17 @@ def _handle_listen(self, message: Message):
         if self.config.get('confirm_listening'):
             sound = self.config.get('sounds', {}).get('start_listening')
             if sound:
-                context = {'client_name': 'ovos_dinkum_listener',
-                           'source': 'listener',
-                           'destination': ["audio"]  # default native-source
-                           }
-                message = message or Message("", context=context)  # might be None
                 self.bus.emit(message.forward("mycroft.audio.play_sound", {"uri": sound}))
-                if not self.config["listener"].get("instant_listen"):
-                    self.voice_loop.state = ListeningState.CONFIRMATION
-                    self.voice_loop.confirmation_event.clear()
-                    Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
-                else:
-                    self.voice_loop.state = ListeningState.BEFORE_COMMAND
+                self.voice_loop.state = ListeningState.CONFIRMATION
+                try:
+                    if sound.startswith("snd/"):
+                        dur = get_sound_duration(sound, base_dir=f"{dirname(__file__)}/res")
+                    else:
+                        dur = get_sound_duration(sound)
+                    LOG.debug(f"{sound} duration: {dur} seconds")
+                    self.voice_loop.confirmation_seconds_left = dur
+                except:
+                    self.voice_loop.confirmation_seconds_left = self.voice_loop.confirmation_seconds
         else:
             self.voice_loop.state = ListeningState.BEFORE_COMMAND
 
@@ -879,8 +878,11 @@ def _handle_wake_up(self, message: Message):
 
     def _handle_sound_played(self, message: Message):
         """Handle response message from audio service."""
+        if not self._validate_message_context(message) or not self.voice_loop.running:
+            # ignore this sound, it is targeted to an external client
+            return
         if self.voice_loop.state == ListeningState.CONFIRMATION:
-            self.voice_loop.confirmation_event.set()
+            self.voice_loop.state = ListeningState.BEFORE_COMMAND
 
     def _handle_b64_audio(self, message: Message):
         """ transcribe base64 encoded audio """

diff --git a/ovos_dinkum_listener/voice_loop/hotwords.py b/ovos_dinkum_listener/voice_loop/hotwords.py
@@ -1,11 +1,18 @@
 from enum import Enum
+from os.path import dirname
 from threading import Event
 from typing import Optional
 
 from ovos_config import Configuration
 from ovos_plugin_manager.wakewords import OVOSWakeWordFactory, HotWordEngine
-from ovos_utils.log import LOG
 from ovos_utils.fakebus import FakeBus
+from ovos_utils.log import LOG
+try:
+    from ovos_utils.sound import get_sound_duration
+except ImportError:
+
+    def get_sound_duration(*args, **kwargs):
+        raise ImportError("please install ovos-utils>=0.1.0a25")
 
 
 class HotWordException(RuntimeWarning):
@@ -174,6 +181,18 @@ def load_hotword_engines(self):
                                            "listen": listen,
                                            "wakeup": wakeup,
                                            "stopword": stopword}
+                    if sound:
+                        try:
+                            if sound.startswith("snd/"):
+                                dur = get_sound_duration(sound,
+                                                         base_dir=f"{dirname(dirname(__file__))}/res")
+                            else:
+                                dur = get_sound_duration(sound)
+                            LOG.debug(f"{sound} duration: {dur} seconds")
+                            self._plugins[word]["sound_duration"] = dur
+                        except:
+                            pass
+
             except Exception as e:
                 LOG.error("Failed to load hotword: " + word)
 

diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py
@@ -92,6 +92,7 @@ def debiased_energy(audio_data: bytes, sample_width: int) -> float:
 @dataclass
 class ChunkInfo:
     is_speech: bool = False
+    is_listen_sound: bool = False
     energy: float = 0.0
 
 
@@ -106,17 +107,20 @@ class DinkumVoiceLoop(VoiceLoop):
     speech_seconds: float = 0.3
     silence_seconds: float = 0.7
     timeout_seconds: float = 10.0
-    timeout_seconds_with_silence: float = 5.0    
+    timeout_seconds_with_silence: float = 5.0
+    confirmation_seconds: float = 0.5
     num_stt_rewind_chunks: int = 2
     num_hotword_keep_chunks: int = 15
     remove_silence: bool = False
+    instant_listen: bool = False
     skip_next_wake: bool = False
     hotword_chunks: Deque = field(default_factory=deque)
     stt_chunks: Deque = field(default_factory=deque)
     stt_audio_bytes: bytes = bytes()
     last_ww: float = -1.0
     speech_seconds_left: float = 0.0
     silence_seconds_left: float = 0.0
+    confirmation_seconds_left: float = 0.0
     timeout_seconds_left: float = 0.0
     timeout_seconds_with_silence_left: float = 0.0
     recording_seconds_with_silence_left: float = 0.0
@@ -181,7 +185,6 @@ def run(self):
         self.timeout_seconds_left = self.timeout_seconds
         self.timeout_seconds_with_silence_left = self.timeout_seconds_with_silence        
         self.state = ListeningState.DETECT_WAKEWORD
-        self.confirmation_event = Event()
 
         # Keep hotword/STT audio so they can (optionally) be saved to disk
         self.hotword_chunks = deque(maxlen=self.num_hotword_keep_chunks)
@@ -248,12 +251,10 @@ def run(self):
                 self._before_wakeup(chunk)
             elif self.state == ListeningState.CHECK_WAKE_UP:
                 self._detect_wakeup(chunk)
-
-            # set either by timeout (0.5) or by ovos-audio response
-            elif self.state == ListeningState.CONFIRMATION and \
-                    self.confirmation_event.is_set():
-                self.state = ListeningState.BEFORE_COMMAND
-                LOG.debug(f"STATE: {self.state}")
+
+            elif self.state == ListeningState.CONFIRMATION:
+                LOG.debug("playing listen sound")
+                self._confirmation_sound(chunk)
 
             elif self.state == ListeningState.BEFORE_COMMAND:
                 LOG.debug("waiting for speech")
@@ -485,14 +486,15 @@ def _detect_ww(self, chunk: bytes) -> bool:
         ww = self.hotwords.found()
         if ww:
             LOG.debug(f"Wake word detected={ww}")
+            ww_data = self.hotwords.get_ww(ww)
+
             # Callback to handle recorded hotword audio
             if self.listenword_audio_callback is not None:
                 hotword_audio_bytes = bytes()
                 while self.hotword_chunks:
                     hotword_audio_bytes += self.hotword_chunks.popleft()
 
-                self.listenword_audio_callback(hotword_audio_bytes,
-                                               self.hotwords.get_ww(ww))
+                self.listenword_audio_callback(hotword_audio_bytes, ww_data)
 
             self.hotword_chunks.clear()
 
@@ -504,18 +506,22 @@ def _detect_ww(self, chunk: bytes) -> bool:
             if self.listen_mode == ListeningMode.SLEEPING:
                 # Wake word detected, begin detecting "wake up" word
                 self.state = ListeningState.CHECK_WAKE_UP
-                LOG.debug(f"STATE: {self.state}")
             else:
-                # Wake word detected, begin recording voice command
-                if not self.state == ListeningState.CONFIRMATION:
+                if ww_data.get("sound"):
+                    self.state = ListeningState.CONFIRMATION
+                    # derive timeout from sound file length if possible
+                    dur = ww_data.get("sound_duration", self.confirmation_seconds)
+                    self.confirmation_seconds_left = dur
+                else:
                     self.state = ListeningState.BEFORE_COMMAND
-                    LOG.debug(f"STATE: {self.state}")
+                # Wake word detected, begin recording voice command
                 self.reset_speech_timer()
                 self.stt_audio_bytes = bytes()
                 self.stt.stream_start()
                 if self.fallback_stt is not None:
                     self.fallback_stt.stream_start()
 
+            LOG.debug(f"STATE: {self.state}")
             self.last_ww = time.time()
             self.transformers.feed_hotword(chunk)
             return True
@@ -557,6 +563,24 @@ def _wait_cmd(self, chunk: bytes):
                 self.stt_audio_bytes += chunk
                 self.stt_chunks.append(chunk)
 
+    def _confirmation_sound(self, chunk: bytes):
+        self._chunk_info.is_listen_sound = True
+        if self.instant_listen:
+            LOG.debug("instant_listen is on")
+            self.confirmation_seconds_left = 0
+            self.state = ListeningState.BEFORE_COMMAND
+            LOG.debug(f"STATE: {self.state}")
+            self._before_cmd(chunk)
+            return
+
+        # skip STT buffer if instant_listen is NOT set
+        # Recording voice command, but user has not spoken yet
+        self.transformers.feed_audio(chunk)
+        self.confirmation_seconds_left -= self.mic.seconds_per_chunk
+        if self.confirmation_seconds_left <= 0:
+            self.state = ListeningState.BEFORE_COMMAND
+            LOG.debug(f"STATE: {self.state}")
+
     def _before_cmd(self, chunk: bytes):
         """
         Handle audio chunks after WW detection or listen triggered, before VAD

diff --git a/test/unittests/test_service.py b/test/unittests/test_service.py
@@ -8,6 +8,7 @@
 from unittest.mock import Mock, patch
 
 from ovos_utils.messagebus import FakeBus
+from ovos_bus_client.message import Message
 from ovos_utils.process_utils import ProcessState
 
 
@@ -286,7 +287,7 @@ def test_handle_listen(self):
         self.service.voice_loop.reset_speech_timer = Mock()
         self.service.voice_loop.confirmation_event = Event()
 
-        self.service._handle_listen(None)
+        self.service._handle_listen(Message(""))
         self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), False)
         self.service.voice_loop.reset_speech_timer.assert_called_once()
         self.service.voice_loop.reset_speech_timer.reset_mock()
@@ -295,13 +296,11 @@ def test_handle_listen(self):
         self.service.voice_loop.stt.stream_start.assert_called_once()
         self.service.voice_loop.stt.stream_start.reset_mock()
         self.assertEqual(self.service.voice_loop.state, ListeningState.CONFIRMATION)
-        sleep(1)
-        self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), True)
 
         self.service.voice_loop.state = ListeningState.DETECT_WAKEWORD
         self.service.config["confirm_listening"] = False
 
-        self.service._handle_listen(None)
+        self.service._handle_listen(Message(""))
         self.assertEqual(self.service.config["confirm_listening"], False)
         self.service.voice_loop.reset_speech_timer.assert_called_once()
         self.service.voice_loop.reset_speech_timer.reset_mock()