Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MM-54242] Improve timestamp accuracy #3

Merged
merged 4 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ WHISPER_SHA ?= "b2e34e65777033584fa6769a366cdb0228bc5c7da81e58a5e8dc0ce94d0fb54e
# Opus
OPUS_VERSION ?= "1.4"
OPUS_SHA ?= "c9b32b4253be5ae63d1ff16eea06b94b5f0f2951b7a02aceef58e3a3ce49c51f"
# ONNX Runtime
ONNX_VERSION ?= "1.16.2"
ONNX_SHA ?= "70c769771ad4b6d63b87ca1f62d3f11e998ea0b9d738d6bbdd6a5e6d8c1deb31"

## Docker Variables
# Docker executable
Expand All @@ -56,7 +59,7 @@ DOCKER_REGISTRY_REPO ?= mattermost/${APP_NAME}-daily
DOCKER_USER ?= user
DOCKER_PASSWORD ?= password
## Docker Images
DOCKER_IMAGE_GO += "golang:${GO_VERSION}@sha256:27b021393d0e0dfffc6cd6cca5e7836ac59f5ac98724c5d6b3b0a82199d275c5"
DOCKER_IMAGE_GO += "golang:${GO_VERSION}@sha256:337543447173c2238c78d4851456760dcc57c1dfa8c3bcd94cbee8b0f7b32ad0"
DOCKER_IMAGE_GOLINT += "golangci/golangci-lint:v1.54.2@sha256:abe731fe6bb335a30eab303a41dd5c2b630bb174372a4da08e3d42eab5324127"
DOCKER_IMAGE_DOCKERLINT += "hadolint/hadolint:v2.9.2@sha256:d355bd7df747a0f124f3b5e7b21e9dafd0cb19732a276f901f0fdee243ec1f3b"
DOCKER_IMAGE_COSIGN += "bitnami/cosign:1.8.0@sha256:8c2c61c546258fffff18b47bb82a65af6142007306b737129a7bd5429d53629a"
Expand Down Expand Up @@ -170,6 +173,8 @@ docker-build: ## to build the docker image
--build-arg OPUS_SHA=${OPUS_SHA} \
--build-arg WHISPER_VERSION=${WHISPER_VERSION} \
--build-arg WHISPER_SHA=${WHISPER_SHA} \
--build-arg ONNX_VERSION=${ONNX_VERSION} \
--build-arg ONNX_SHA=${ONNX_SHA} \
-f ${DOCKER_FILE} . \
-t ${APP_NAME}:${APP_VERSION} || ${FAIL}
@$(OK) Performing Docker build ${APP_NAME}:${APP_VERSION}
Expand Down Expand Up @@ -298,7 +303,7 @@ go-build-docker: # to build binaries under a controlled docker dedicated go cont
-v $(PWD):/app -w /app \
-e GOCACHE="/tmp" \
$(DOCKER_IMAGE_GO) \
/bin/bash ./build/build.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} || ${FAIL}
/bin/bash ./build/build.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} ${ONNX_VERSION} ${ONNX_SHA} || ${FAIL}
@$(OK) go build docker

.PHONY: go-run
Expand All @@ -315,7 +320,7 @@ go-test: ## to run tests
-v /var/run/docker.sock:/var/run/docker.sock \
-e GOCACHE="/tmp" \
$(DOCKER_IMAGE_GO) \
/bin/sh ./build/run_tests.sh "${GO_TEST_OPTS}" "${OPUS_VERSION}" "${OPUS_SHA}" "${WHISPER_VERSION}" "${WHISPER_SHA}" || ${FAIL}
/bin/sh ./build/run_tests.sh "${GO_TEST_OPTS}" "${OPUS_VERSION}" "${OPUS_SHA}" "${WHISPER_VERSION}" "${WHISPER_SHA}" "${ONNX_VERSION}" "${ONNX_SHA}" || ${FAIL}
@$(OK) testing

.PHONY: go-mod-check
Expand All @@ -342,7 +347,7 @@ go-lint: ## to lint go code
-e GOCACHE="/tmp" \
-e GOLANGCI_LINT_CACHE="/tmp" \
${DOCKER_IMAGE_GOLINT} \
/bin/sh ./build/lint.sh "${OPUS_VERSION}" "${OPUS_SHA}" "${WHISPER_VERSION}" "${WHISPER_SHA}" || ${FAIL}
/bin/sh ./build/lint.sh "${OPUS_VERSION}" "${OPUS_SHA}" "${WHISPER_VERSION}" "${WHISPER_SHA}" "${ONNX_VERSION}" "${ONNX_SHA}" || ${FAIL}
@$(OK) App linting

.PHONY: go-fmt
Expand Down
8 changes: 7 additions & 1 deletion build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ ARG OPUS_VERSION
ARG OPUS_SHA
ARG WHISPER_VERSION
ARG WHISPER_SHA
ARG ONNX_VERSION
ARG ONNX_SHA
ARG DEBIAN_FRONTEND=noninteractive
#GO_BUILD_PLATFORMS holds the platforms that we will build the docker image against
ARG GO_BUILD_PLATFORMS=linux-${ARCH:?}
Expand All @@ -34,14 +36,18 @@ ARG GO_BUILD_PLATFORMS=linux-${ARCH:?}
COPY . /src
WORKDIR /src

RUN /bin/bash ./build/build.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "tiny base small"
RUN /bin/bash ./build/build.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "tiny base small" ${ONNX_VERSION} ${ONNX_SHA}

FROM base AS runner
ARG ARCH
ARG WHISPER_VERSION
ARG ONNX_VERSION
COPY --from=builder /src/dist/calls-transcriber-linux-${ARCH:?} /opt/calls-transcriber/bin/calls-transcriber
COPY --from=builder /tmp/whisper.cpp-${WHISPER_VERSION}/models /models
COPY --from=builder /tmp/onnxruntime-linux-x64-${ONNX_VERSION}/lib/* /usr/lib/
COPY --from=builder /src/build/models/silero_vad.onnx /models
COPY --from=builder /src/build/entrypoint.sh .
RUN ldconfig

ENTRYPOINT ["./entrypoint.sh"]
CMD ["/opt/calls-transcriber/bin/calls-transcriber"]
11 changes: 8 additions & 3 deletions build/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,18 @@ OPUS_SHA=$2
WHISPER_VERSION=$3
WHISPER_SHA=$4
MODELS=$5
ONNX_VERSION=$6
ONNX_SHA=$7

OPUS_INCLUDE_PATH="/tmp/opus-${OPUS_VERSION}/include"
WHISPER_INCLUDE_PATH="/tmp/whisper.cpp-${WHISPER_VERSION}"
OPUS_LIBRARY_PATH="/tmp/opus-${OPUS_VERSION}/.libs"
WHISPER_LIBRARY_PATH=${WHISPER_INCLUDE_PATH}
ONNX_INCLUDE_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/include"
ONNX_LIBRARY_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/lib"

bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "${MODELS}" && \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}" \
bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "${MODELS}" ${ONNX_VERSION} ${ONNX_SHA} && \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}:${ONNX_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}:${ONNX_LIBRARY_PATH}" \
LD_RUN_PATH="${ONNX_LIBRARY_PATH}" \
make go-build
11 changes: 8 additions & 3 deletions build/lint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@ OPUS_VERSION=$1
OPUS_SHA=$2
WHISPER_VERSION=$3
WHISPER_SHA=$4
ONNX_VERSION=$5
ONNX_SHA=$6

OPUS_INCLUDE_PATH="/tmp/opus-${OPUS_VERSION}/include"
WHISPER_INCLUDE_PATH="/tmp/whisper.cpp-${WHISPER_VERSION}"
OPUS_LIBRARY_PATH="/tmp/opus-${OPUS_VERSION}/.libs"
WHISPER_LIBRARY_PATH=${WHISPER_INCLUDE_PATH}
ONNX_INCLUDE_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/include"
ONNX_LIBRARY_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/lib"

bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} && \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}" \
bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "" ${ONNX_VERSION} ${ONNX_SHA} && \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}:${ONNX_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}:${ONNX_LIBRARY_PATH}" \
LD_RUN_PATH="${ONNX_LIBRARY_PATH}" \
golangci-lint run ./...
7 changes: 6 additions & 1 deletion build/prepare_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ OPUS_SHA=$2
WHISPER_VERSION=$3
WHISPER_SHA=$4
MODELS=$5
ONNX_VERSION=$6
ONNX_SHA=$7
streamer45 marked this conversation as resolved.
Show resolved Hide resolved

cd /tmp && \
wget https://downloads.xiph.org/releases/opus/opus-${OPUS_VERSION}.tar.gz && \
Expand All @@ -20,4 +22,7 @@ echo "${WHISPER_SHA} v${WHISPER_VERSION}.tar.gz" | sha256sum --check && \
tar xf v${WHISPER_VERSION}.tar.gz && \
cd whisper.cpp-${WHISPER_VERSION} && \
for model in ${MODELS}; do ./models/download-ggml-model.sh "${model}.en"; done && \
make libwhisper.a
make libwhisper.a && \
cd /tmp && \
wget https://github.com/microsoft/onnxruntime/releases/download/v${ONNX_VERSION}/onnxruntime-linux-x64-${ONNX_VERSION}.tgz && \
tar xf onnxruntime-linux-x64-${ONNX_VERSION}.tgz
11 changes: 8 additions & 3 deletions build/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@ OPUS_VERSION=$2
OPUS_SHA=$3
WHISPER_VERSION=$4
WHISPER_SHA=$5
ONNX_VERSION=$6
ONNX_SHA=$7

OPUS_INCLUDE_PATH="/tmp/opus-${OPUS_VERSION}/include"
WHISPER_INCLUDE_PATH="/tmp/whisper.cpp-${WHISPER_VERSION}"
OPUS_LIBRARY_PATH="/tmp/opus-${OPUS_VERSION}/.libs"
WHISPER_LIBRARY_PATH=${WHISPER_INCLUDE_PATH}
ONNX_INCLUDE_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/include"
ONNX_LIBRARY_PATH="/tmp/onnxruntime-linux-x64-${ONNX_VERSION}/lib"

bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "tiny" && \
bash ./build/prepare_deps.sh ${OPUS_VERSION} ${OPUS_SHA} ${WHISPER_VERSION} ${WHISPER_SHA} "tiny" ${ONNX_VERSION} ${ONNX_SHA} && \
MODELS_DIR=/tmp/whisper.cpp-${WHISPER_VERSION}/models \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}" \
C_INCLUDE_PATH="${OPUS_INCLUDE_PATH}:${WHISPER_INCLUDE_PATH}:${ONNX_INCLUDE_PATH}" \
LIBRARY_PATH="${OPUS_LIBRARY_PATH}:${WHISPER_LIBRARY_PATH}:${ONNX_LIBRARY_PATH}" \
LD_RUN_PATH="${ONNX_LIBRARY_PATH}" \
go test ${GO_TEST_OPTS} ./...
72 changes: 71 additions & 1 deletion cmd/transcriber/call/tracks.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/rtcd/client"

"github.com/streamer45/silero-vad-go/speech"

"github.com/pion/webrtc/v3"
)

Expand Down Expand Up @@ -304,13 +306,81 @@ func (t *Transcriber) transcribeTrack(ctx trackContext) (transcribe.TrackTranscr
return trackTr, 0, fmt.Errorf("failed to decode audio samples: %w", err)
}

slog.Debug("decoding done", slog.Any("samplesLen", len(samples)))

transcriber, err := t.newTrackTranscriber()
if err != nil {
return trackTr, 0, fmt.Errorf("failed to create track transcriber: %w", err)
}

var totalDur time.Duration
sd, err := speech.NewDetector(speech.DetectorConfig{
ModelPath: filepath.Join(getModelsDir(), "silero_vad.onnx"),
SampleRate: trackOutAudioRate,
WindowSize: 1536,
Threshold: 0.5,
MinSilenceDurationMs: 1000,
SpeechPadMs: 100,
})
if err != nil {
return trackTr, 0, fmt.Errorf("failed to ceate speech detector: %w", err)
}
defer func() {
if err := sd.Destroy(); err != nil {
slog.Error("failed to destroy speech detector", slog.String("err", err.Error()), slog.String("trackID", ctx.trackID))
}
}()

// Before transcribing, we feed the samples to a speech detector and adjust
// the timestamps according to when the speech begins/ends. This is
// to account for any potential silence that Whisper wouldn't recognize with
// much accuracy.
// TODO: consider deprecating this logic if we get accurate word level timestamps
// (https://github.com/ggerganov/whisper.cpp/issues/375).

var speechSamples []trackTimedSamples
for _, ts := range samples {
segments, err := sd.Detect(ts.pcm)
if err != nil {
slog.Error("failed to detect speech",
slog.String("err", err.Error()),
slog.String("trackID", ctx.trackID))
continue
}
slog.Debug("speech detection done", slog.Any("segments", segments))

for _, seg := range segments {
// Both SpeechStartAt and SpeechEndAt are in seconds.
// We simply multiply by the audio sampling rate to find out
// the index of the sample where speech starts/ends.
startSampleOff := int(seg.SpeechStartAt * trackOutAudioRate)
endSampleOff := int(seg.SpeechEndAt * trackOutAudioRate)
Comment on lines +358 to +359
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One request -- can you add some comments for future us about what the units are for seg.SpeechStartAt and startSampleOff? (That will help in the future when I'm reviewing and remembering what the numbers mean.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but the sample offset is just what it says: it's the index of the first sample where the speech starts (± some padding), so there's no unit really.


if startSampleOff >= len(ts.pcm) {
slog.Error("invalid startSampleOff",
slog.Int("startSampleOff", startSampleOff),
slog.String("trackID", ctx.trackID))
continue
}

var speechPCM []float32
if endSampleOff > startSampleOff {
speechPCM = ts.pcm[startSampleOff:endSampleOff]
} else {
speechPCM = ts.pcm[startSampleOff:]
}

speechSamples = append(speechSamples, trackTimedSamples{
pcm: speechPCM,
// SpeechStartAt is in seconds; multiplying by 1000 as our timestamps are in milliseconds.
startTS: ts.startTS + int64(seg.SpeechStartAt*1000),
})
}
}

slog.Debug("speech detection done", slog.Any("speechSamples", len(speechSamples)))

var totalDur time.Duration
for _, ts := range speechSamples {
segments, err := transcriber.Transcribe(ts.pcm)
if err != nil {
slog.Error("failed to transcribe audio samples",
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/mattermost/calls-transcriber

go 1.21.3
go 1.21.4

require (
github.com/mattermost/mattermost-plugin-calls/server/public v0.0.3-0.20231103204030-06bd54bcfa67
Expand All @@ -9,6 +9,7 @@ require (
github.com/pion/randutil v0.1.0
github.com/pion/rtp v1.7.13
github.com/pion/webrtc/v3 v3.2.12
github.com/streamer45/silero-vad-go v0.1.0
github.com/stretchr/testify v1.8.4
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ github.com/shurcooL/users v0.0.0-20180125191416-49c67e49c537/go.mod h1:QJTqeLYED
github.com/shurcooL/webdavfs v0.0.0-20170829043945-18c3829fa133/go.mod h1:hKmq5kWdCj2z2KEozexVbfEZIWiTjhE0+UjmZgPqehw=
github.com/sourcegraph/annotate v0.0.0-20160123013949-f4cad6c6324d/go.mod h1:UdhH50NIW0fCiwBSr0co2m7BnFLdv4fQTgdqdJTHFeE=
github.com/sourcegraph/syntaxhighlight v0.0.0-20170531221838-bd320f5d308e/go.mod h1:HuIsMU8RRBOtsCgI77wP899iHVBQpCmg4ErYMZB+2IA=
github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
Loading