From 8d06a545d8c0a19b12f0d916c185b27725b87807 Mon Sep 17 00:00:00 2001 From: William Wong Date: Thu, 31 Oct 2019 07:42:14 +0800 Subject: [PATCH] Keep AudioContext object and add WILL_START dictate state (#2520) * Keep AudioContext object and add WILL_START dictate state * Add entries --- CHANGELOG.md | 4 +- packages/bundle/package.json | 3 +- ...veServicesSpeechServicesPonyfillFactory.js | 22 ++++++++++ packages/component/src/Composer.js | 8 +++- packages/component/src/Dictation.js | 40 +------------------ .../component/src/SendBox/MicrophoneButton.js | 6 ++- .../src/Styles/StyleSet/MicrophoneButton.js | 8 +++- packages/core/src/constants/DictateState.js | 9 +++-- packages/core/src/reducers/dictateState.js | 6 ++- packages/core/src/sagas.js | 2 + ...DictateOnIncomingActivityFromOthersSaga.js | 5 ++- .../sagas/startDictateOnSpeakCompleteSaga.js | 23 +++++++++++ packages/core/src/selectors/dictateState.js | 1 + 13 files changed, 83 insertions(+), 54 deletions(-) create mode 100644 packages/core/src/sagas/startDictateOnSpeakCompleteSaga.js create mode 100644 packages/core/src/selectors/dictateState.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c156d7786..783ad11617 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,7 +97,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixes [#2473](https://github.com/microsoft/BotFramework-WebChat/issues/2473). Fix samples 13 using wrong region for Speech Services credentials, by [@compulim](https://github.com/compulim) in PR [#2482](https://github.com/microsoft/BotFramework-WebChat/pull/2482) - Fixes [#2420](https://github.com/microsoft/BotFramework-WebChat/issues/2420). Fix saga error should not result in an unhandled exception, by [@compulim](https://github.com/compulim) in PR [#2421](https://github.com/microsoft/BotFramework-WebChat/pull/2421) - Fixes [#2513](https://github.com/microsoft/BotFramework-WebChat/issues/2513). 
Fix `core-js` not loading properly, by [@compulim](https://github.com/compulim) in PR [#2514](https://github.com/microsoft/BotFramework-WebChat/pull/2514) -- Fixes [#2516](https://github.com/microsoft/BotFramework-WebChat/issues/2516). Disable microphone input for `expecting` input hint on Safari, by [@compulim](https://github.com/compulim) in PR [#2517](https://github.com/microsoft/BotFramework-WebChat/pull/2517) +- Fixes [#2516](https://github.com/microsoft/BotFramework-WebChat/issues/2516). Disable microphone input for `expecting` input hint on Safari, by [@compulim](https://github.com/compulim) in PR [#2517](https://github.com/microsoft/BotFramework-WebChat/pull/2517) and PR [#2520](https://github.com/microsoft/BotFramework-WebChat/pull/2520) +- Fixes [#2518](https://github.com/microsoft/BotFramework-WebChat/issues/2518). Synthesis of bot activities with input hint expecting, should be interruptible, by [@compulim](https://github.com/compulim) in PR [#2520](https://github.com/microsoft/BotFramework-WebChat/pull/2520) +- Fixes [#2519](https://github.com/microsoft/BotFramework-WebChat/issues/2519). 
On Safari, microphone should turn on after synthesis of bot activities with input hint expecting, by [@compulim](https://github.com/compulim) in PR [#2520](https://github.com/microsoft/BotFramework-WebChat/pull/2520) ### Added diff --git a/packages/bundle/package.json b/packages/bundle/package.json index a8ecb035d1..6e8474aa18 100644 --- a/packages/bundle/package.json +++ b/packages/bundle/package.json @@ -42,11 +42,12 @@ "markdown-it": "^8.4.2", "markdown-it-for-inline": "^0.1.1", "memoize-one": "^5.0.2", + "microsoft-cognitiveservices-speech-sdk": "1.6.0", "microsoft-speech-browser-sdk": "^0.0.12", "prop-types": "^15.7.2", "sanitize-html": "^1.19.0", "url-search-params-polyfill": "^5.0.0", - "web-speech-cognitive-services": "^5.0.1", + "web-speech-cognitive-services": "5.0.1", "whatwg-fetch": "^3.0.0" }, "devDependencies": { diff --git a/packages/bundle/src/createCognitiveServicesSpeechServicesPonyfillFactory.js b/packages/bundle/src/createCognitiveServicesSpeechServicesPonyfillFactory.js index 3820d3343e..8dd0daa7e6 100644 --- a/packages/bundle/src/createCognitiveServicesSpeechServicesPonyfillFactory.js +++ b/packages/bundle/src/createCognitiveServicesSpeechServicesPonyfillFactory.js @@ -1,3 +1,4 @@ +import { AudioConfig } from 'microsoft-cognitiveservices-speech-sdk'; import createPonyfill from 'web-speech-cognitive-services/lib/SpeechServices'; export default function createCognitiveServicesSpeechServicesPonyfillFactory({ @@ -15,6 +16,27 @@ export default function createCognitiveServicesSpeechServicesPonyfillFactory({ 'Web Chat: Cognitive Services Speech Services support is currently in preview. If you encounter any problems, please file us an issue at https://github.com/microsoft/BotFramework-WebChat/issues/.' ); + // HACK: We should prevent AudioContext object from being recreated because they may be blessed and UX-wise expensive to recreate. 
+ // In Cognitive Services SDK, if they detect the "end" function is falsy, they will not call "end" but "suspend" instead. + // And on next recognition, they will re-use the AudioContext object. + if (!audioConfig) { + audioConfig = AudioConfig.fromDefaultMicrophoneInput(); + // audioConfig.privSource.privContext = new (window.AudioContext || window.webkitAudioContext)(); + + const source = audioConfig.privSource; + + // This piece of code is adopted from microsoft-cognitiveservices-speech-sdk/common.browser/MicAudioSource.ts. + // Instead of closing the AudioContext, it will just suspend it. And the next time it is needed, it will be resumed (by the original code). + source.destroyAudioContext = () => { + if (!source.privContext) { + return; + } + + source.privRecorder.releaseMediaResources(source.privContext); + source.privContext.state === 'running' && source.privContext.suspend(); + }; + } + return ({ referenceGrammarID }) => { const ponyfill = createPonyfill({ audioConfig, diff --git a/packages/component/src/Composer.js b/packages/component/src/Composer.js index 28b5c33fc4..003cee47bf 100644 --- a/packages/component/src/Composer.js +++ b/packages/component/src/Composer.js @@ -198,7 +198,13 @@ const Composer = ({ }, [dispatch, patchedSendTypingIndicator]); useEffect(() => { - dispatch(createConnectAction({ directLine, userID, username })); + dispatch( + createConnectAction({ + directLine, + userID, + username + }) + ); return () => { // TODO: [P3] disconnect() is an async call (pending -> fulfilled), we need to wait, or change it to reconnect() diff --git a/packages/component/src/Dictation.js b/packages/component/src/Dictation.js index 32ba29c46a..238eb73ac9 100644 --- a/packages/component/src/Dictation.js +++ b/packages/component/src/Dictation.js @@ -9,26 +9,6 @@ const { DictateState: { DICTATING, IDLE, STARTING } } = Constants; -const PrefixedAudioContext = window.AudioContext || window.webkitAudioContext; - -// The result of this check is asynchronous and 
it will fail on user interaction requirement. -async function canOpenMicrophone() { - const audioContext = new PrefixedAudioContext(); - - try { - if (audioContext.state === 'suspended') { - return await Promise.race([ - audioContext.resume().then(() => true), - new Promise(resolve => setImmediate(resolve)).then(() => false) - ]); - } - - return true; - } finally { - await audioContext.close(); - } -} - const Dictation = ({ dictateState, disabled, @@ -82,24 +62,6 @@ const Dictation = ({ onError && onError(event); }, [dictateState, onError, setDictateState, stopDictate]); - const shouldStart = !disabled && (dictateState === STARTING || dictateState === DICTATING) && !numSpeakingActivities; - - // We need to check if the browser allow us to do open microphone. - // In Safari, it block microphone access if the code was not executed based on user interaction. - - // Since the check call is asynchronous, the result will always fail the user interaction requirement. - // Thus, we can never open microphone after we receive the check result. - // Instead, we will both open microphone and check the result. If the result is negative, we will close the microphone. - - // TODO: [P3] Investigate if a resumed AudioContext instance is kept across multiple session, can we workaround Safari's restrictions. 
- useMemo(async () => { - if (shouldStart) { - const canStart = await canOpenMicrophone(); - - !canStart && stopDictate(); - } - }, [shouldStart, stopDictate]); - return ( ); }; diff --git a/packages/component/src/SendBox/MicrophoneButton.js b/packages/component/src/SendBox/MicrophoneButton.js index 97de6addca..782b8cf9c9 100644 --- a/packages/component/src/SendBox/MicrophoneButton.js +++ b/packages/component/src/SendBox/MicrophoneButton.js @@ -56,7 +56,9 @@ const connectMicrophoneButton = (...selectors) => { webSpeechPonyfill: { speechSynthesis, SpeechSynthesisUtterance } = {} }) => ({ click: () => { - if (dictateState === DictateState.STARTING || dictateState === DictateState.DICTATING) { + if (dictateState === DictateState.WILL_START) { + stopSpeakingActivity(); + } else if (dictateState === DictateState.DICTATING) { stopDictate(); setSendBox(dictateInterims.join(' ')); } else { @@ -67,7 +69,7 @@ const connectMicrophoneButton = (...selectors) => { primeSpeechSynthesis(speechSynthesis, SpeechSynthesisUtterance); }, dictating: dictateState === DictateState.DICTATING, - disabled: disabled || (dictateState === DictateState.STARTING || dictateState === DictateState.STOPPING), + disabled: disabled || (dictateState === DictateState.STARTING || dictateState === DictateState.STOPPING), language }), ...selectors diff --git a/packages/component/src/Styles/StyleSet/MicrophoneButton.js b/packages/component/src/Styles/StyleSet/MicrophoneButton.js index fb6b5b3473..d432c7a510 100644 --- a/packages/component/src/Styles/StyleSet/MicrophoneButton.js +++ b/packages/component/src/Styles/StyleSet/MicrophoneButton.js @@ -1,8 +1,12 @@ export default function createMicrophoneButtonStyle({ microphoneButtonColorOnDictate }) { return { // TODO: [P3] This path should not know anything about the DOM tree of - '&.dictating > button svg': { - fill: microphoneButtonColorOnDictate + '&.dictating > button': { + '&, &:focus, &:hover': { + '& svg': { + fill: microphoneButtonColorOnDictate + } + } 
} }; } diff --git a/packages/core/src/constants/DictateState.js b/packages/core/src/constants/DictateState.js index d037088573..7aef0394c4 100644 --- a/packages/core/src/constants/DictateState.js +++ b/packages/core/src/constants/DictateState.js @@ -1,6 +1,7 @@ const IDLE = 0; -const STARTING = 1; -const DICTATING = 2; -const STOPPING = 3; +const WILL_START = 1; +const STARTING = 2; +const DICTATING = 3; +const STOPPING = 4; -export { DICTATING, IDLE, STARTING, STOPPING }; +export { DICTATING, IDLE, STARTING, STOPPING, WILL_START }; diff --git a/packages/core/src/reducers/dictateState.js b/packages/core/src/reducers/dictateState.js index 2061f0728c..2b94ae8736 100644 --- a/packages/core/src/reducers/dictateState.js +++ b/packages/core/src/reducers/dictateState.js @@ -1,4 +1,4 @@ -import { DICTATING, IDLE, STARTING, STOPPING } from '../constants/DictateState'; +import { DICTATING, IDLE, STARTING, STOPPING, WILL_START } from '../constants/DictateState'; import { SET_DICTATE_STATE } from '../actions/setDictateState'; import { START_DICTATE } from '../actions/startDictate'; @@ -13,7 +13,7 @@ export default function dictateState(state = DEFAULT_STATE, { payload, type }) { break; case START_DICTATE: - if (state === IDLE || state === STOPPING) { + if (state === IDLE || state === STOPPING || state === WILL_START) { state = STARTING; } @@ -22,6 +22,8 @@ export default function dictateState(state = DEFAULT_STATE, { payload, type }) { case STOP_DICTATE: if (state === STARTING || state === DICTATING) { state = STOPPING; + } else if (state === WILL_START) { + state = IDLE; } break; diff --git a/packages/core/src/sagas.js b/packages/core/src/sagas.js index 2fb8e2237f..2340d205c2 100644 --- a/packages/core/src/sagas.js +++ b/packages/core/src/sagas.js @@ -15,6 +15,7 @@ import sendMessageToPostActivitySaga from './sagas/sendMessageToPostActivitySaga import sendPostBackToPostActivitySaga from './sagas/sendPostBackToPostActivitySaga'; import sendTypingIndicatorOnSetSendBoxSaga from 
'./sagas/sendTypingIndicatorOnSetSendBoxSaga'; import speakActivityAndStartDictateOnIncomingActivityFromOthersSaga from './sagas/speakActivityAndStartDictateOnIncomingActivityFromOthersSaga'; +import startDictateOnSpeakCompleteSaga from './sagas/startDictateOnSpeakCompleteSaga'; import startSpeakActivityOnPostActivitySaga from './sagas/startSpeakActivityOnPostActivitySaga'; import stopDictateOnCardActionSaga from './sagas/stopDictateOnCardActionSaga'; import stopSpeakingActivityOnInputSaga from './sagas/stopSpeakingActivityOnInputSaga'; @@ -38,6 +39,7 @@ export default function* sagas() { yield fork(sendPostBackToPostActivitySaga); yield fork(sendTypingIndicatorOnSetSendBoxSaga); yield fork(speakActivityAndStartDictateOnIncomingActivityFromOthersSaga); + yield fork(startDictateOnSpeakCompleteSaga); yield fork(startSpeakActivityOnPostActivitySaga); yield fork(stopDictateOnCardActionSaga); yield fork(stopSpeakingActivityOnInputSaga); diff --git a/packages/core/src/sagas/speakActivityAndStartDictateOnIncomingActivityFromOthersSaga.js b/packages/core/src/sagas/speakActivityAndStartDictateOnIncomingActivityFromOthersSaga.js index c59e954ca8..529b88ecb6 100644 --- a/packages/core/src/sagas/speakActivityAndStartDictateOnIncomingActivityFromOthersSaga.js +++ b/packages/core/src/sagas/speakActivityAndStartDictateOnIncomingActivityFromOthersSaga.js @@ -1,10 +1,11 @@ import { put, select, takeEvery } from 'redux-saga/effects'; import { INCOMING_ACTIVITY } from '../actions/incomingActivity'; +import { WILL_START } from '../constants/DictateState'; import markActivity from '../actions/markActivity'; +import setDictateState from '../actions/setDictateState'; import shouldSpeakIncomingActivitySelector from '../selectors/shouldSpeakIncomingActivity'; import speakableActivity from '../definitions/speakableActivity'; -import startDictate from '../actions/startDictate'; import stopDictate from '../actions/stopDictate'; import whileConnected from './effects/whileConnected'; @@ -25,7 
+26,7 @@ function* speakActivityAndStartDictateOnIncomingActivityFromOthers({ userID }) { } if (shouldSpeak && activity.inputHint === 'expectingInput') { - yield put(startDictate()); + yield put(setDictateState(WILL_START)); } else if (activity.inputHint === 'ignoringInput') { yield put(stopDictate()); } diff --git a/packages/core/src/sagas/startDictateOnSpeakCompleteSaga.js b/packages/core/src/sagas/startDictateOnSpeakCompleteSaga.js new file mode 100644 index 0000000000..0d8945b52e --- /dev/null +++ b/packages/core/src/sagas/startDictateOnSpeakCompleteSaga.js @@ -0,0 +1,23 @@ +import { put, select, takeEvery } from 'redux-saga/effects'; + +import { MARK_ACTIVITY } from '../actions/markActivity'; +import { of as activitiesOf } from '../selectors/activities'; +import { SET_DICTATE_STATE } from '../actions/setDictateState'; +import { WILL_START } from '../constants/DictateState'; +import dictateStateSelector from '../selectors/dictateState'; +import speakingActivity from '../definitions/speakingActivity'; +import startDictate from '../actions/startDictate'; + +function* startDictateOnSpeakComplete() { + const speakingActivities = yield select(activitiesOf(speakingActivity)); + const dictateState = yield select(dictateStateSelector); + + if (dictateState === WILL_START && !speakingActivities.length) { + yield put(startDictate()); + } +} + +// TODO: [P4] We should turn this into a reducer instead +export default function* startDictateOnSpeakCompleteSaga() { + yield takeEvery(({ type }) => type === MARK_ACTIVITY || type === SET_DICTATE_STATE, startDictateOnSpeakComplete); +} diff --git a/packages/core/src/selectors/dictateState.js b/packages/core/src/selectors/dictateState.js new file mode 100644 index 0000000000..1d3bc58776 --- /dev/null +++ b/packages/core/src/selectors/dictateState.js @@ -0,0 +1 @@ +export default ({ dictateState }) => dictateState;