diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py index 2675feefe4..a5fca405d9 100644 --- a/examples/tutorials/audio_data_augmentation_tutorial.py +++ b/examples/tutorials/audio_data_augmentation_tutorial.py @@ -27,8 +27,6 @@ # First, we import the modules and download the audio assets we use in this tutorial. # -import math - from IPython.display import Audio import matplotlib.pyplot as plt @@ -44,56 +42,38 @@ # Applying effects and filtering # ------------------------------ # -# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to -# those available in ``sox`` to Tensor objects and file object audio sources. -# -# There are two functions for this: -# -# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects -# to Tensor. -# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to -# other audio sources. -# -# Both functions accept effect definitions in the form -# ``List[List[str]]``. -# This is mostly consistent with how ``sox`` command works, but one caveat is -# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s -# implementation does not. -# -# For the list of available effects, please refer to `the sox -# documentation `__. +# :py:class:`torchaudio.io.AudioEffector` allows for directly applying +# filters and codecs to Tensor objects, in a way similar to the ``ffmpeg`` +# command. # -# **Tip** If you need to load and resample your audio data on the fly, -# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` -# with effect ``"rate"``. -# -# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a -# file-like object or path-like object. -# Similar to :py:func:`torchaudio.load`, when the audio format cannot be -# inferred from either the file extension or header, you can provide -# argument ``format`` to specify the format of the audio source.
-# -# **Note** This process is not differentiable. +# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use +# this class; for details, please refer to that tutorial. # # Load the data -waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) +waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False) # Define effects -effects = [ - ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to - # add `rate` effect with original sample rate after this. - ["rate", f"{sample_rate1}"], - ["reverb", "-w"], # Reverbration gives some dramatic feeling -] +effect = ",".join( + [ + "lowpass=frequency=300:poles=1", # apply single-pole lowpass filter + "atempo=0.8", # reduce the speed + "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3" + # Applying echo gives some dramatic feeling + ], +) + # Apply effects -waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) +def apply_effect(waveform, sample_rate, effect): + effector = torchaudio.io.AudioEffector(effect=effect) + return effector.apply(waveform, sample_rate) -print(waveform1.shape, sample_rate1) -print(waveform2.shape, sample_rate2) + +waveform2 = apply_effect(waveform1, sample_rate, effect) + +print(waveform1.shape, sample_rate) +print(waveform2.shape, sample_rate) ###################################################################### # Note that the number of frames and number of channels are different from @@ -101,6 +81,7 @@ # audio.
# + def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): waveform = waveform.numpy() @@ -123,6 +104,7 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): ###################################################################### # + def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): waveform = waveform.numpy() @@ -141,26 +123,23 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): plt.show(block=False) ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # -plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) -plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) -Audio(waveform1, rate=sample_rate1) +plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2)) +plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04)) +Audio(waveform1.T, rate=sample_rate) ###################################################################### -# Effects applied: -# ~~~~~~~~~~~~~~~~ +# Effects applied +# ~~~~~~~~~~~~~~~ # -plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) -plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) -Audio(waveform2, rate=sample_rate2) +plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2)) +plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04)) +Audio(waveform2.T, rate=sample_rate) -###################################################################### -# Doesn’t it sound more dramatic? 
-# ###################################################################### # Simulating room reverberation @@ -203,8 +182,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): augmented = F.fftconvolve(speech, rir) ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # plot_waveform(speech, sample_rate, title="Original") @@ -212,8 +191,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(speech, rate=sample_rate) ###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ +# RIR applied +# ~~~~~~~~~~~ # plot_waveform(augmented, sample_rate, title="RIR Applied") @@ -248,8 +227,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): ###################################################################### -# Background noise: -# ~~~~~~~~~~~~~~~~~ +# Background noise +# ~~~~~~~~~~~~~~~~ # plot_waveform(noise, sample_rate, title="Background noise") @@ -257,8 +236,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noise, rate=sample_rate) ###################################################################### -# SNR 20 dB: -# ~~~~~~~~~~ +# SNR 20 dB +# ~~~~~~~~~ # snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1] @@ -267,8 +246,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noisy_speech, rate=sample_rate) ###################################################################### -# SNR 10 dB: -# ~~~~~~~~~~ +# SNR 10 dB +# ~~~~~~~~~ # snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2] @@ -277,8 +256,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noisy_speech, rate=sample_rate) ###################################################################### -# SNR 3 dB: -# ~~~~~~~~~ +# SNR 3 dB +# ~~~~~~~~ # snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3] @@ -291,60 +270,56 @@ 
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # Applying codec to Tensor object # ------------------------------- # -# :py:func:`torchaudio.functional.apply_codec` can apply codecs to +# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to # a Tensor object. # -# **Note** This process is not differentiable. -# + +waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False) -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) +def apply_codec(waveform, sample_rate, format, encoder=None): + encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder) + return encoder.apply(waveform, sample_rate) -configs = [ - {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, - {"format": "gsm"}, - {"format": "vorbis", "compression": -1}, -] -waveforms = [] -for param in configs: - augmented = F.apply_codec(waveform, sample_rate, **param) - waveforms.append(augmented) ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # -plot_waveform(waveform, sample_rate, title="Original") -plot_specgram(waveform, sample_rate, title="Original") -Audio(waveform, rate=sample_rate) +plot_waveform(waveform.T, sample_rate, title="Original") +plot_specgram(waveform.T, sample_rate, title="Original") +Audio(waveform.T, rate=sample_rate) ###################################################################### -# 8 bit mu-law: -# ~~~~~~~~~~~~~ +# 8 bit mu-law +# ~~~~~~~~~~~~ # -plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") -plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") -Audio(waveforms[0], rate=sample_rate) +mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw") +plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law") +plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law") +Audio(mulaw.T, rate=sample_rate) ###################################################################### -# GSM-FR: -# ~~~~~~~ +# G.722 +# 
~~~~~ # -plot_waveform(waveforms[1], sample_rate, title="GSM-FR") -plot_specgram(waveforms[1], sample_rate, title="GSM-FR") -Audio(waveforms[1], rate=sample_rate) +g722 = apply_codec(waveform, sample_rate, "g722") +plot_waveform(g722.T, sample_rate, title="G.722") +plot_specgram(g722.T, sample_rate, title="G.722") +Audio(g722.T, rate=sample_rate) ###################################################################### -# Vorbis: -# ~~~~~~~ +# Vorbis +# ~~~~~~ # -plot_waveform(waveforms[2], sample_rate, title="Vorbis") -plot_specgram(waveforms[2], sample_rate, title="Vorbis") -Audio(waveforms[2], rate=sample_rate) +vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis") +plot_waveform(vorbis.T, sample_rate, title="Vorbis") +plot_specgram(vorbis.T, sample_rate, title="Vorbis") +Audio(vorbis.T, rate=sample_rate) ###################################################################### # Simulating a phone recoding @@ -378,62 +353,52 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): plot_specgram(bg_added, sample_rate, title="BG noise added") # Apply filtering and change sample rate -filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( - bg_added, - sample_rate, - effects=[ - ["lowpass", "4000"], - [ - "compand", - "0.02,0.05", - "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", - "-8", - "-7", - "0.05", - ], - ["rate", "8000"], - ], -) +effect = ",".join([ + "lowpass=frequency=4000:poles=1", + "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05", +]) -plot_specgram(filtered, sample_rate2, title="Filtered") +filtered = apply_effect(bg_added.T, sample_rate, effect) +sample_rate2 = 8000 -# Apply telephony codec -codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") +plot_specgram(filtered.T, sample_rate2, title="Filtered") -plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") +# Apply telephony codec +codec_applied = apply_codec(filtered, 
sample_rate2, "g722") +plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied") ###################################################################### -# Original speech: -# ~~~~~~~~~~~~~~~~ +# Original speech +# ~~~~~~~~~~~~~~~ # Audio(original_speech, rate=sample_rate) ###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ +# RIR applied +# ~~~~~~~~~~~ # Audio(rir_applied, rate=sample_rate) ###################################################################### -# Background noise added: -# ~~~~~~~~~~~~~~~~~~~~~~~ +# Background noise added +# ~~~~~~~~~~~~~~~~~~~~~~ # Audio(bg_added, rate=sample_rate) ###################################################################### -# Filtered: -# ~~~~~~~~~ +# Filtered +# ~~~~~~~~ # -Audio(filtered, rate=sample_rate2) +Audio(filtered.T, rate=sample_rate2) ###################################################################### -# Codec applied: -# ~~~~~~~~~~~~~~ +# Codec applied +# ~~~~~~~~~~~~~ # -Audio(codec_applied, rate=sample_rate2) +Audio(codec_applied.T, rate=sample_rate2)