feat(rtc): add AudioResampler, combineAudioFrames (#286)

livekit · Oct 3, 2024 · 9881d52 · 9881d52
1 parent 5d0edf3
commit 9881d52
Show file tree

Hide file tree

Showing 6 changed files with 756 additions and 2 deletions.
diff --git a/.changeset/smooth-lies-tie.md b/.changeset/smooth-lies-tie.md
@@ -0,0 +1,5 @@
+---
+"@livekit/rtc-node": minor
+---
+
+add AudioResampler, combineAudioFrames
diff --git a/packages/livekit-rtc/rust-sdks b/packages/livekit-rtc/rust-sdks
diff --git a/packages/livekit-rtc/src/audio_frame.ts b/packages/livekit-rtc/src/audio_frame.ts
@@ -52,3 +52,42 @@ export class AudioFrame {
     });
   }
 }
+
+/**
+ * Combines one or more `rtc.AudioFrame` objects into a single `rtc.AudioFrame`.
+ *
+ * The sample data of every frame is concatenated in order. All frames must share the sample rate
+ * and channel count of the first frame. The output buffer is allocated once at its final size and
+ * each frame is copied into it, so no intermediate arrays are created.
+ *
+ * @param buffer - a single AudioFrame or list thereof. A single frame is returned as-is.
+ * @returns a frame holding the concatenated samples of all input frames
+ * @throws Error if `buffer` is an empty list, or if any frame's sample rate or channel count
+ * differs from the first frame's.
+ */
+export const combineAudioFrames = (buffer: AudioFrame | AudioFrame[]): AudioFrame => {
+  // Array.isArray (rather than a truthiness test on `length`) keeps the empty list on the
+  // array path so it is rejected below instead of being returned as if it were a frame.
+  if (!Array.isArray(buffer)) {
+    return buffer;
+  }
+
+  const first = buffer[0];
+  if (!first) {
+    throw new Error('buffer is empty');
+  }
+
+  const { sampleRate, channels } = first;
+
+  // First pass: validate every frame and measure the combined size.
+  let totalSamplesPerChannel = 0;
+  let totalLength = 0;
+  for (const frame of buffer) {
+    if (frame.sampleRate !== sampleRate) {
+      throw new Error(`sample rate mismatch: expected ${sampleRate}, got ${frame.sampleRate}`);
+    }
+
+    if (frame.channels !== channels) {
+      throw new Error(`channel mismatch: expected ${channels}, got ${frame.channels}`);
+    }
+
+    totalSamplesPerChannel += frame.samplesPerChannel;
+    totalLength += frame.data.length;
+  }
+
+  // Second pass: copy each frame's samples into the preallocated buffer.
+  const data = new Int16Array(totalLength);
+  let offset = 0;
+  for (const frame of buffer) {
+    data.set(frame.data, offset);
+    offset += frame.data.length;
+  }
+
+  return new AudioFrame(data, sampleRate, channels, totalSamplesPerChannel);
+};
diff --git a/packages/livekit-rtc/src/audio_resampler.ts b/packages/livekit-rtc/src/audio_resampler.ts
@@ -0,0 +1,166 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { AudioFrame } from './audio_frame';
+import { FfiClient, FfiHandle } from './ffi_client';
+import type {
+  FlushSoxResamplerResponse,
+  NewSoxResamplerResponse,
+  PushSoxResamplerResponse,
+} from './proto/audio_frame_pb';
+import {
+  FlushSoxResamplerRequest,
+  NewSoxResamplerRequest,
+  PushSoxResamplerRequest,
+  SoxQualityRecipe,
+  SoxResamplerDataType,
+} from './proto/audio_frame_pb';
+
+/**
+ * Resampler quality. Higher quality settings result in better audio quality but require more
+ * processing power.
+ *
+ * Each member takes the value of the same-named `SoxQualityRecipe` entry imported from
+ * `./proto/audio_frame_pb`, so a member can be forwarded as the `qualityRecipe` of a
+ * `NewSoxResamplerRequest`.
+ */
+export enum AudioResamplerQuality {
+  QUICK = SoxQualityRecipe.SOXR_QUALITY_QUICK,
+  LOW = SoxQualityRecipe.SOXR_QUALITY_LOW,
+  MEDIUM = SoxQualityRecipe.SOXR_QUALITY_MEDIUM,
+  HIGH = SoxQualityRecipe.SOXR_QUALITY_HIGH,
+  VERY_HIGH = SoxQualityRecipe.SOXR_QUALITY_VERYHIGH,
+}
+
+/**
+ * AudioResampler provides functionality to resample audio data from an input sample rate to
+ * an output sample rate using the Sox resampling library. It supports multiple channels and
+ * configurable resampling quality.
+ *
+ * Samples are exchanged with the resampler as interleaved signed 16-bit integers
+ * (`SOXR_DATATYPE_INT16I` for both input and output).
+ */
+export class AudioResampler {
+  #inputRate: number;
+  #outputRate: number;
+  #channels: number;
+  #ffiHandle: FfiHandle;
+
+  /**
+   * Initializes a new AudioResampler.
+   *
+   * @param inputRate - The sample rate of the input audio data (in Hz).
+   * @param outputRate - The desired sample rate of the output audio data (in Hz).
+   * @param channels - The number of audio channels (e.g., 1 for mono, 2 for stereo). Defaults to 1.
+   * @param quality - The quality setting for the resampler. Defaults to
+   * `AudioResamplerQuality.MEDIUM`.
+   * @throws Error if the FFI server reports an error while creating the resampler.
+   */
+  constructor(
+    inputRate: number,
+    outputRate: number,
+    channels = 1,
+    quality = AudioResamplerQuality.MEDIUM,
+  ) {
+    this.#inputRate = inputRate;
+    this.#outputRate = outputRate;
+    this.#channels = channels;
+
+    const req = new NewSoxResamplerRequest({
+      inputRate,
+      outputRate,
+      numChannels: channels,
+      // AudioResamplerQuality members are defined from SoxQualityRecipe values, so the cast only
+      // changes the static type.
+      qualityRecipe: quality as unknown as SoxQualityRecipe,
+      inputDataType: SoxResamplerDataType.SOXR_DATATYPE_INT16I,
+      outputDataType: SoxResamplerDataType.SOXR_DATATYPE_INT16I,
+      flags: 0,
+    });
+
+    const res = FfiClient.instance.request<NewSoxResamplerResponse>({
+      message: {
+        case: 'newSoxResampler',
+        value: req,
+      },
+    });
+
+    if (res.error) {
+      throw new Error(res.error);
+    }
+
+    this.#ffiHandle = new FfiHandle(res.resampler.handle.id);
+  }
+
+  /**
+   * Push audio data into the resampler and retrieve any available resampled data.
+   *
+   * This method accepts audio data, resamples it according to the configured input and output rates,
+   * and returns any resampled data that is available after processing the input.
+   *
+   * @param data - The audio frame to resample
+   *
+   * @returns A list of {@link AudioFrame} objects containing the resampled audio data. The list may
+   * be empty if no output data is available yet.
+   * @throws Error if the FFI server reports an error.
+   */
+  push(data: AudioFrame): AudioFrame[] {
+    const req = new PushSoxResamplerRequest({
+      resamplerHandle: this.#ffiHandle.handle,
+      dataPtr: data.protoInfo().dataPtr,
+      // Size in bytes, not in samples: the output path below likewise treats `size` as a byte
+      // count (two bytes per 16-bit sample).
+      size: data.data.byteLength,
+    });
+
+    const res = FfiClient.instance.request<PushSoxResamplerResponse>({
+      message: {
+        case: 'pushSoxResampler',
+        value: req,
+      },
+    });
+
+    return this.#framesFromResponse(res);
+  }
+
+  /**
+   * Flush any remaining audio data through the resampler and retrieve the resampled data.
+   *
+   * @remarks
+   * This method should be called when no more input data will be provided to ensure that all
+   * internal buffers are processed and all resampled data is output.
+   *
+   * @returns A list of {@link AudioFrame} objects containing the remaining resampled audio data.
+   * The list is empty if the resampler had nothing left to output.
+   * @throws Error if the FFI server reports an error.
+   */
+  flush(): AudioFrame[] {
+    const req = new FlushSoxResamplerRequest({
+      resamplerHandle: this.#ffiHandle.handle,
+    });
+
+    const res = FfiClient.instance.request<FlushSoxResamplerResponse>({
+      message: {
+        case: 'flushSoxResampler',
+        value: req,
+      },
+    });
+
+    return this.#framesFromResponse(res);
+  }
+
+  /**
+   * Converts a push/flush response into output frames: throws on a reported error, returns an
+   * empty list when there is no output pointer, otherwise wraps the copied output in one frame.
+   */
+  #framesFromResponse(res: PushSoxResamplerResponse | FlushSoxResamplerResponse): AudioFrame[] {
+    if (res.error) {
+      throw new Error(res.error);
+    }
+
+    // A missing/zero pointer means the resampler produced nothing; there is no buffer to copy.
+    if (!res.outputPtr) {
+      return [];
+    }
+
+    // NOTE(review): assumes copyBuffer returns the raw output as a Uint8Array of `res.size` bytes
+    // - confirm against FfiClient.
+    const outputData = FfiClient.instance.copyBuffer(res.outputPtr, res.size);
+
+    // Reinterpret the bytes as 16-bit samples. Passing the byte array itself to the Int16Array
+    // constructor would convert element-wise and yield one sample per byte. Copying into a fresh
+    // Uint8Array first guarantees a zero, 2-byte-aligned offset into the backing buffer.
+    const bytes = new Uint8Array(outputData);
+    const samples = new Int16Array(bytes.buffer, 0, Math.trunc(bytes.byteLength / 2));
+
+    return [
+      new AudioFrame(
+        samples,
+        this.#outputRate,
+        this.#channels,
+        Math.trunc(samples.length / this.#channels),
+      ),
+    ];
+  }
+}
+1 −1		.github/workflows/gen-protocol.yaml
+23 −12		.github/workflows/publish.yml
+1 −0		.nanparc
+16 −1		Cargo.lock
+1 −0		Cargo.toml
+2 −0		libwebrtc/.nanparc
+2 −0		livekit-api/.nanparc
+5 −5		livekit-api/src/signal_client/mod.rs
+4 −2		livekit-api/src/signal_client/signal_stream.rs
+2 −0		livekit-ffi/.nanparc
+2 −1		livekit-ffi/Cargo.toml
+78 −0		livekit-ffi/protocol/audio_frame.proto
+7 −1		livekit-ffi/protocol/ffi.proto
+5 −0		livekit-ffi/src/cabi.rs
+1 −0		livekit-ffi/src/conversion/mod.rs
+1 −0		livekit-ffi/src/conversion/resampler.rs
+207 −2		livekit-ffi/src/livekit.proto.rs
+6 −2		livekit-ffi/src/server/mod.rs
+119 −1		livekit-ffi/src/server/requests.rs
+147 −0		livekit-ffi/src/server/resampler.rs
+2 −0		livekit-protocol/.nanparc
+3 −0		livekit-protocol/generate_proto.sh
+68 −3		livekit-protocol/src/livekit.serde.rs
+2 −0		livekit-runtime/.nanparc
+2 −0		livekit/.nanparc
+24 −7		livekit/src/rtc_engine/mod.rs
+76 −71		livekit/src/rtc_engine/rtc_session.rs
+2 −0		soxr-sys/.nanparc
+32 −0		soxr-sys/Cargo.lock
+14 −0		soxr-sys/Cargo.toml
+46 −0		soxr-sys/build.rs
+1 −0		soxr-sys/generate_bindings.sh
+23 −0		soxr-sys/src/LICENCE
+39 −0		soxr-sys/src/aliases.h
+33 −0		soxr-sys/src/avfft32.c
+32 −0		soxr-sys/src/avfft32s.c
+75 −0		soxr-sys/src/ccrw2.h
+314 −0		soxr-sys/src/cr-core.c
+588 −0		soxr-sys/src/cr.c
+178 −0		soxr-sys/src/cr.h
+8 −0		soxr-sys/src/cr32.c
+8 −0		soxr-sys/src/cr32s.c
+8 −0		soxr-sys/src/cr64.c
+8 −0		soxr-sys/src/cr64s.c
+223 −0		soxr-sys/src/data-io.c
+39 −0		soxr-sys/src/data-io.h
+149 −0		soxr-sys/src/dbesi0.c
+54 −0		soxr-sys/src/dev32s.h
+42 −0		soxr-sys/src/dev64s.h
+1,346 −0		soxr-sys/src/fft4g.c
+23 −0		soxr-sys/src/fft4g.h
+36 −0		soxr-sys/src/fft4g32.c
+31 −0		soxr-sys/src/fft4g32s.c
+35 −0		soxr-sys/src/fft4g64.c
+92 −0		soxr-sys/src/fft4g_cache.h
+125 −0		soxr-sys/src/fifo.h
+277 −0		soxr-sys/src/filter.c
+44 −0		soxr-sys/src/filter.h
+75 −0		soxr-sys/src/half-coefs.h
+61 −0		soxr-sys/src/half-fir.h
+84 −0		soxr-sys/src/internal.h
+150 −0		soxr-sys/src/lib.rs
+31 −0		soxr-sys/src/math-wrap.h
+40 −0		soxr-sys/src/pffft-avx.h
+110 −0		soxr-sys/src/pffft-wrap.c
+1,946 −0		soxr-sys/src/pffft.c
+197 −0		soxr-sys/src/pffft.h
+39 −0		soxr-sys/src/pffft32.c
+34 −0		soxr-sys/src/pffft32s.c
+34 −0		soxr-sys/src/pffft64s.c
+150 −0		soxr-sys/src/poly-fir.h
+56 −0		soxr-sys/src/poly-fir0.h
+31 −0		soxr-sys/src/rdft.h
+24 −0		soxr-sys/src/rdft_t.h
+158 −0		soxr-sys/src/rint-clip.h
+102 −0		soxr-sys/src/rint.h
+1 −0		soxr-sys/src/samplerate.h
+28 −0		soxr-sys/src/soxr-config.h
+198 −0		soxr-sys/src/soxr-lsr.c
+78 −0		soxr-sys/src/soxr-lsr.h
+842 −0		soxr-sys/src/soxr.c
+344 −0		soxr-sys/src/soxr.h
+343 −0		soxr-sys/src/soxr.rs
+48 −0		soxr-sys/src/std-types.h
+89 −0		soxr-sys/src/util-simd.c
+8 −0		soxr-sys/src/util32s.c
+23 −0		soxr-sys/src/util32s.h
+8 −0		soxr-sys/src/util64s.c
+23 −0		soxr-sys/src/util64s.h
+115 −0		soxr-sys/src/vr-coefs.c
+94 −0		soxr-sys/src/vr-coefs.h
+651 −0		soxr-sys/src/vr32.c
+2 −0		webrtc-sys/.nanparc
+2 −0		webrtc-sys/build/.nanparc
+3 −3		webrtc-sys/src/audio_track.cpp