add test for trailing audio after silence timeout #678

Draft · wants to merge 5 commits into base: master
15 changes: 10 additions & 5 deletions src/common.browser/ReplayableAudioNode.ts
@@ -30,7 +30,7 @@ export class ReplayableAudioNode implements IAudioStreamNode {

// Reads and returns the next chunk of audio buffer.
// If replay of existing buffers are needed, read() will first seek and replay
-    // existing content, and upoin completion it will read new content from the underlying
+    // existing content, and upon completion it will read new content from the underlying
// audio node, saving that content into the replayable buffers.
public read(): Promise<IStreamChunk<ArrayBuffer>> {
// if there is a replay request to honor.
@@ -46,13 +46,14 @@ export class ReplayableAudioNode implements IAudioStreamNode {
}

let i: number = 0;

while (i < this.privBuffers.length && bytesToSeek >= this.privBuffers[i].chunk.buffer.byteLength) {
bytesToSeek -= this.privBuffers[i++].chunk.buffer.byteLength;
}

if (i < this.privBuffers.length) {
const retVal: ArrayBuffer = this.privBuffers[i].chunk.buffer.slice(bytesToSeek);
+                const timeReceived = this.privBuffers[i].chunk.timeReceived;
+                const isEnd: boolean = false;

this.privReplayOffset += (retVal.byteLength / this.privBytesPerSecond) * 1e+7;
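                // (byteLength / privBytesPerSecond) is seconds of audio; multiplying by 1e7 converts to 100-ns ticks, the SDK's offset unit.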

@@ -63,8 +64,8 @@ export class ReplayableAudioNode implements IAudioStreamNode {

return Promise.resolve<IStreamChunk<ArrayBuffer>>({
buffer: retVal,
-                    isEnd: false,
-                    timeReceived: this.privBuffers[i].chunk.timeReceived,
+                    isEnd,
+                    timeReceived,
});
}
}
@@ -85,12 +86,16 @@ export class ReplayableAudioNode implements IAudioStreamNode {
}

public replay(): void {
-        if (this.privBuffers && 0 !== this.privBuffers.length) {
+        if (!this.isEmpty()) {
this.privReplay = true;
this.privReplayOffset = this.privLastShrinkOffset;
}
}

+    public isEmpty(): boolean {
+        return !this.privBuffers || this.privBuffers.length === 0;
+    }

// Shrinks the existing audio buffers to start at the new offset, or at the
// beginning of the buffer closest to the requested offset.
// A replay request will start from the last shrink point.
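For context, a minimal sketch of how `replay()` and the new `isEmpty()` guard are meant to interact — hypothetical driver code assuming the `ReplayableAudioNode` class above, not part of this diff:

```typescript
// Illustration only: drain a ReplayableAudioNode, rewinding first when a
// previous turn left audio behind. replay() seeks back to the last shrink
// point, so read() re-delivers buffered chunks before pulling fresh ones
// from the underlying audio node.
async function drainWithReplay(node: ReplayableAudioNode): Promise<void> {
    if (!node.isEmpty()) {
        node.replay();
    }
    let chunk = await node.read();
    while (!chunk.isEnd) {
        // ...upload chunk.buffer to the service...
        chunk = await node.read();
    }
}
```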
10 changes: 9 additions & 1 deletion src/common.speech/RequestSession.ts
@@ -149,16 +149,24 @@ export class RequestSession {
public async onServiceTurnEndResponse(continuousRecognition: boolean): Promise<void> {
this.privTurnDeferral.resolve();

-        if (!continuousRecognition || this.isSpeechEnded) {
+        if (!continuousRecognition || this.privAudioNode.isEmpty()) {
await this.onComplete();
this.privInTurn = false;
} else {
+            // Trailing audio issue: restart ServiceRecognizerBase.sendAudio, then do the steps below.
// Start a new request set.
this.privTurnStartAudioOffset = this.privLastRecoOffset;
this.privAudioNode.replay();
+            if (this.isSpeechEnded) {
+                this.privIsSpeechEnded = false;
+            }
}
}

+    public get audioNode(): ReplayableAudioNode {
+        return this.privAudioNode;
+    }

public onSpeechContext(): void {
this.privRequestId = createNoDashGuid();
}
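In short, the turn-end branch now keys off whether audio is still buffered rather than the speech-ended flag alone. A runnable toy model of that decision — names and return values are illustrative, not SDK API:

```typescript
// Toy model of the new turn-end decision (illustration only).
interface TurnAudio { isEmpty(): boolean; replay(): void; }

function endOfTurnAction(continuousRecognition: boolean, audio: TurnAudio): "complete" | "replay" {
    if (!continuousRecognition || audio.isEmpty()) {
        return "complete"; // nothing buffered: finish the turn and the session
    }
    audio.replay();        // rewind to the last shrink point for the next turn
    return "replay";       // speech-ended flag is cleared so trailing audio resends
}
```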
29 changes: 25 additions & 4 deletions src/common.speech/ServiceRecognizerBase.ts
@@ -338,7 +338,12 @@ export abstract class ServiceRecognizerBase implements IDisposable {
}

void this.receiveMessage();
-        const audioSendPromise = this.sendAudio(audioNode);
+        void this.startSendingAudio(audioNode);
+        return;
+    }
+
+    public startSendingAudio(audioStreamNode: IAudioStreamNode): Promise<void> {
+        const audioSendPromise = this.sendAudio(audioStreamNode);

audioSendPromise.catch(async (error: string): Promise<void> => {
await this.cancelRecognitionLocal(CancellationReason.Error, CancellationErrorCode.RuntimeError, error);
@@ -550,7 +555,7 @@

case "turn.end":
await this.sendTelemetryData();
-                    if (this.privRequestSession.isSpeechEnded && this.privMustReportEndOfStream) {
+                    if (this.privRequestSession.isSpeechEnded && this.privMustReportEndOfStream && this.privRequestSession.audioNode.isEmpty()) {
this.privMustReportEndOfStream = false;
await this.cancelRecognitionLocal(CancellationReason.EndOfStream, CancellationErrorCode.NoError, undefined);
}
@@ -563,7 +568,12 @@ export abstract class ServiceRecognizerBase implements IDisposable {
return;
} else {
connection = await this.fetchConnection();
-                        await this.sendPrePayloadJSON(connection);
+                        const restartAudio = !this.privRequestSession.audioNode.isEmpty();
+                        await this.sendSpeechServiceConfig(connection, this.privRequestSession, this.privRecognizerConfig.SpeechServiceConfig.serialize());
+                        await this.sendPrePayloadJSON(connection, restartAudio);
+                        if (restartAudio) {
+                            void this.startSendingAudio(this.privRequestSession.audioNode);
+                        }
}
break;

@@ -743,7 +753,18 @@
this.privRequestSession.recogNumber === startRecogNumber) {

const connection: IConnection = await this.fetchConnection();
-            const audioStreamChunk: IStreamChunk<ArrayBuffer> = await audioStreamNode.read();
+            let audioStreamChunk: IStreamChunk<ArrayBuffer> = {
+                buffer: null,
+                isEnd: true,
+                timeReceived: Date.now()};
+            try {
+                audioStreamChunk = await audioStreamNode.read();
+            } catch (error) {
+                if (!this.privIsLiveAudio) {
+                    this.privRequestSession.onSpeechEnded();
+                }
+                return;
+            }
// we have a new audio chunk to upload.
if (this.privRequestSession.isSpeechEnded) {
// If service already recognized audio end then don't send any more audio
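Taken together, these changes keep the session alive on `turn.end` while replayable audio is pending and restart the audio pump against the same node. A toy model of the guarded read loop — illustrative only; that `read()` rejects once a closed stream is fully drained is an assumption inferred from the catch branch above:

```typescript
// Toy model of the guarded audio pump (not the SDK's actual API).
interface Chunk { buffer: ArrayBuffer | null; isEnd: boolean; }
interface PumpSource { read(): Promise<Chunk>; }

async function pumpAudio(
    source: PumpSource,
    isLiveAudio: boolean,
    onSpeechEnded: () => void,
): Promise<void> {
    for (;;) {
        let chunk: Chunk;
        try {
            chunk = await source.read();
        } catch {
            // A rejected read() on file/push input now marks end-of-speech
            // instead of surfacing as a runtime error.
            if (!isLiveAudio) { onSpeechEnded(); }
            return;
        }
        if (chunk.isEnd) { return; }
        // ...send chunk.buffer over the websocket connection...
    }
}
```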
67 changes: 50 additions & 17 deletions tests/SpeechRecognizerSilenceTests.ts
@@ -88,30 +88,29 @@ const BuildSpeechConfig: () => sdk.SpeechConfig = (): sdk.SpeechConfig => {
return s;
};

-describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
+describe.each([true])("Service based tests", (forceNodeWebSocket: boolean): void => {

-    beforeAll(() => {
+    beforeAll((): void => {
WebsocketMessageAdapter.forceNpmWebSocket = forceNodeWebSocket;
});

-    afterAll(() => {
+    afterAll((): void => {
WebsocketMessageAdapter.forceNpmWebSocket = false;
});

describe("Intiial Silence Tests", () => {
test("InitialSilenceTimeout (pull)", (done: jest.DoneCallback) => {
describe("Intiial Silence Tests", (): void => {
test("InitialSilenceTimeout (pull)", (done: jest.DoneCallback): void => {
// eslint-disable-next-line no-console
console.info("Name: InitialSilenceTimeout (pull)");
-            let p: sdk.PullAudioInputStream;
let bytesSent: number = 0;

// To make sure we don't send a ton of extra data.
// For reference, before the throttling was implemented, we sent 6-10x the required data.
const startTime: number = Date.now();

-            p = sdk.AudioInputStream.createPullStream(
+            const p = sdk.AudioInputStream.createPullStream(
{
-                    close: () => { return; },
+                    close: (): void => { return; },
read: (buffer: ArrayBuffer): number => {
bytesSent += buffer.byteLength;
return buffer.byteLength;
@@ -120,7 +119,7 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {

const config: sdk.AudioConfig = sdk.AudioConfig.fromStreamInput(p);

-            testInitialSilenceTimeout(config, done, (): void => {
+            testInitialSilenceTimeout(config, done, 2, (): void => {
const elapsed: number = Date.now() - startTime;

// We should have sent 5 seconds of audio unthrottled and then 2x the time reco took until we got a response.
@@ -130,7 +129,7 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
});
}, 15000);

test("InitialSilenceTimeout (push)", (done: jest.DoneCallback) => {
test("InitialSilenceTimeout (push)", (done: jest.DoneCallback): void => {
// eslint-disable-next-line no-console
console.info("Name: InitialSilenceTimeout (push)");
const p: sdk.PushAudioInputStream = sdk.AudioInputStream.createPushStream();
@@ -143,7 +142,41 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
testInitialSilenceTimeout(config, done);
}, 15000);

-        Settings.testIfDOMCondition("InitialSilenceTimeout (File)", (done: jest.DoneCallback) => {
+        test.only("Multi-turn silence test", (done: jest.DoneCallback) => {
+            // eslint-disable-next-line no-console
+            console.info("Name: Multi-turn silence test");
+            const s: sdk.SpeechConfig = BuildSpeechConfig();
+            objsToClose.push(s);
+
+            s.setProperty(sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "3000");
+
+            const p: sdk.PushAudioInputStream = sdk.AudioInputStream.createPushStream();
+            objsToClose.push(p);
+            p.write(new ArrayBuffer(2 * 16000 * 5)); // 5 seconds of silence
+            p.close();
+
+            const r: sdk.SpeechRecognizer = new sdk.SpeechRecognizer(s, sdk.AudioConfig.fromStreamInput(p));
+            objsToClose.push(r);
+
+            let lastOffset: number = 0;
+
+            r.speechEndDetected = (r: sdk.Recognizer, e: sdk.RecognitionEventArgs): void => {
+                lastOffset = e.offset;
+            };
+
+            r.canceled = (r: sdk.Recognizer, e: sdk.SpeechRecognitionCanceledEventArgs): void => {
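+                // 5 s of pushed silence (2 bytes × 16000 Hz × 5 s) at the SDK's 100-ns offset resolution: 5 × 1e7 = 50,000,000 ticks.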
+                if (lastOffset !== 50000000) {
+                    done("Got unexpected offset: " + lastOffset.toString());
+                } else {
+                    done();
+                }
+            };
+
+            r.startContinuousRecognitionAsync();
+
+        }, 25000);

+        Settings.testIfDOMCondition("InitialSilenceTimeout (File)", (done: jest.DoneCallback): void => {
// eslint-disable-next-line no-console
console.info("Name: InitialSilenceTimeout (File)");
const audioFormat: AudioStreamFormatImpl = sdk.AudioStreamFormat.getDefaultInputFormat() as AudioStreamFormatImpl;
@@ -155,7 +188,7 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
testInitialSilenceTimeout(config, done);
}, 15000);

-    const testInitialSilenceTimeout = (config: sdk.AudioConfig, done: jest.DoneCallback, addedChecks?: () => void): void => {
+    const testInitialSilenceTimeout = (config: sdk.AudioConfig, done: jest.DoneCallback, expectedNumReports: number = 2, addedChecks?: () => void): void => {
const s: sdk.SpeechConfig = BuildSpeechConfig();
objsToClose.push(s);

Expand All @@ -169,11 +202,11 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {

let numReports: number = 0;

-        r.canceled = (o: sdk.Recognizer, e: sdk.SpeechRecognitionCanceledEventArgs) => {
+        r.canceled = (o: sdk.Recognizer, e: sdk.SpeechRecognitionCanceledEventArgs): void => {
done(e.errorDetails);
};

-        r.recognized = (o: sdk.Recognizer, e: sdk.SpeechRecognitionEventArgs) => {
+        r.recognized = (o: sdk.Recognizer, e: sdk.SpeechRecognitionEventArgs): void => {
try {
const res: sdk.SpeechRecognitionResult = e.result;
expect(res).not.toBeUndefined();
@@ -193,7 +226,7 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
};

r.recognizeOnceAsync(
-            (p2: sdk.SpeechRecognitionResult) => {
+            (p2: sdk.SpeechRecognitionResult): void => {
try {
const res: sdk.SpeechRecognitionResult = p2;
numReports++;
@@ -211,11 +244,11 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
done(error);
}
},
-            (error: string) => {
+            (error: string): void => {
fail(error);
});

-        WaitForCondition(() => (numReports === 2), () => {
+        WaitForCondition((): boolean => (numReports === expectedNumReports), (): void => {
try {
if (!!addedChecks) {
addedChecks();