diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 60e65b370695..10670f33cf2f 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -57,6 +57,7 @@ jobs: - name: Install dependencies run: | + apt-get update && apt-get install libsndfile1-dev -y python -m pip install -e .[quality,test] python -m pip install git+https://github.com/huggingface/accelerate python -m pip install -U git+https://github.com/huggingface/transformers diff --git a/.gitignore b/.gitignore index f018a111ea33..950e9e5db917 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,4 @@ tags # DS_Store (MacOS) .DS_Store # RL pipelines may produce mp4 outputs -*.mp4 \ No newline at end of file +*.mp4 diff --git a/docker/diffusers-flax-cpu/Dockerfile b/docker/diffusers-flax-cpu/Dockerfile index a4b4ccd65b39..cd65c465a289 100644 --- a/docker/diffusers-flax-cpu/Dockerfile +++ b/docker/diffusers-flax-cpu/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -33,6 +34,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docker/diffusers-flax-tpu/Dockerfile b/docker/diffusers-flax-tpu/Dockerfile index 5508af6622dd..f68ba8b5da85 100644 --- a/docker/diffusers-flax-tpu/Dockerfile +++ b/docker/diffusers-flax-tpu/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -35,6 +36,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docker/diffusers-onnxruntime-cpu/Dockerfile b/docker/diffusers-onnxruntime-cpu/Dockerfile index c925715915cd..e3ae56f8a078 100644 --- a/docker/diffusers-onnxruntime-cpu/Dockerfile +++ b/docker/diffusers-onnxruntime-cpu/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -33,6 +34,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile index e51a5e0ba30f..c3d2ff7aa574 100644 --- a/docker/diffusers-onnxruntime-cuda/Dockerfile +++ b/docker/diffusers-onnxruntime-cuda/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -33,6 +34,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile index 41d1672f60e6..2c7bf17d91d0 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -32,6 +33,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index ba80395c89f4..ac97450745b0 100644 --- 
a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -11,6 +11,7 @@ RUN apt update && \ git-lfs \ curl \ ca-certificates \ + libsndfile1-dev \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -32,6 +33,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ datasets \ hf-doc-builder \ huggingface-hub \ + librosa \ modelcards \ numpy \ scipy \ diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 354d381cd7e3..929e1319e41b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -122,6 +122,8 @@ title: "VQ Diffusion" - local: api/pipelines/repaint title: "RePaint" + - local: api/pipelines/audio_diffusion + title: "Audio Diffusion" title: "Pipelines" - sections: - local: api/experimental/rl diff --git a/docs/source/api/pipelines/audio_diffusion.mdx b/docs/source/api/pipelines/audio_diffusion.mdx new file mode 100644 index 000000000000..bafdbef28140 --- /dev/null +++ b/docs/source/api/pipelines/audio_diffusion.mdx @@ -0,0 +1,102 @@ + + +# Audio Diffusion + +## Overview + +[Audio Diffusion](https://github.com/teticio/audio-diffusion) by Robert Dargavel Smith. + +Audio Diffusion leverages the recent advances in image generation using diffusion models by converting audio samples to +and from mel spectrogram images. + +The original codebase of this implementation can be found [here](https://github.com/teticio/audio-diffusion), including +training scripts and example notebooks. + +## Available Pipelines: + +| Pipeline | Tasks | Colab |---|---|:---:| | [pipeline_audio_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py) | *Unconditional Audio Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb) | + + +## Examples: + +### Audio Diffusion + +```python +import torch +from IPython.display import Audio +from diffusers import DiffusionPipeline + +device = "cuda" if torch.cuda.is_available() else "cpu" +pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device) + +output = pipe() +display(output.images[0]) +display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) +``` + +### Latent Audio Diffusion + +```python +import torch +from IPython.display import Audio +from diffusers import DiffusionPipeline + +device = "cuda" if torch.cuda.is_available() else "cpu" +pipe = DiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device) + +output = pipe() +display(output.images[0]) +display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) +``` + +### Audio Diffusion with DDIM (faster) + +```python +import torch +from IPython.display import Audio +from diffusers import DiffusionPipeline + +device = "cuda" if torch.cuda.is_available() else "cpu" +pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(device) + +output = pipe() +display(output.images[0]) +display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) +``` + +### Variations, in-painting, out-painting, etc.
+ +```python +output = pipe( + raw_audio=output.audios[0, 0], + start_step=int(pipe.get_default_steps() / 2), + mask_start_secs=1, + mask_end_secs=1, +) +display(output.images[0]) +display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) +``` + +## AudioDiffusionPipeline +[[autodoc]] AudioDiffusionPipeline + - __call__ + - encode + - slerp + + +## Mel +[[autodoc]] Mel + - audio_slice_to_image + - image_to_audio diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx index ee0f27132a89..a0a3f3d77dd9 100644 --- a/docs/source/api/pipelines/overview.mdx +++ b/docs/source/api/pipelines/overview.mdx @@ -45,6 +45,7 @@ available a colab notebook to directly try them out. | Pipeline | Paper | Tasks | Colab |---|---|:---:|:---:| | [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | - +| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 975ff47b61e6..0cb89a459b86 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -35,6 +35,7 @@ available a colab notebook to directly try them out. | Pipeline | Paper | Tasks | Colab |---|---|:---:|:---:| | [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | +| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb) | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | diff --git a/docs/source/using-diffusers/audio.mdx b/docs/source/using-diffusers/audio.mdx index 5a5c2241ca75..c895e2eb71d7 100644 --- a/docs/source/using-diffusers/audio.mdx +++ b/docs/source/using-diffusers/audio.mdx @@ -12,5 +12,5 @@ specific language governing permissions and limitations under the License. # Using Diffusers for audio -The [`DanceDiffusionPipeline`] can be used to generate audio rapidly! -More coming soon! \ No newline at end of file +[`DanceDiffusionPipeline`] and [`AudioDiffusionPipeline`] can be used to generate +audio rapidly! More coming soon! \ No newline at end of file
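Beyond the snippets on the new docs page above, the `encode` and `slerp` methods listed in its `[[autodoc]]` block can be combined to interpolate between generated samples. A minimal sketch, assuming the `teticio/audio-diffusion-ddim-256` checkpoint from the examples (`encode` only works with the deterministic DDIM scheduler):

```python
import torch

from IPython.display import Audio
from diffusers import DiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(device)

# generate two samples, then recover the noise tensors they came from
output = pipe(batch_size=2, generator=torch.Generator(device=device).manual_seed(42))
noise = pipe.encode(output.images)

# interpolate half-way along the unit sphere and re-generate
interpolated = pipe.slerp(noise[0], noise[1], 0.5)
output = pipe(noise=interpolated.unsqueeze(0))
display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
```

As in the docs above, `display` assumes a notebook environment.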
diff --git a/setup.py b/setup.py index ee27b620ddae..6f5c1d9ed8b9 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,7 @@ "isort>=5.5.4", "jax>=0.2.8,!=0.3.2", "jaxlib>=0.1.65", + "librosa", "modelcards>=0.1.4", "numpy", "parameterized", @@ -181,6 +182,7 @@ def run(self): extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards") extras["test"] = deps_list( "datasets", + "librosa", "parameterized", "pytest", "pytest-timeout", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 1bda39cf3d49..51661814ed2c 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -30,12 +30,14 @@ ) from .pipeline_utils import DiffusionPipeline from .pipelines import ( + AudioDiffusionPipeline, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, LDMSuperResolutionPipeline, + Mel, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline, diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 2fd6bfa1fa17..56590335f2ec 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -15,6 +15,7 @@ "isort": "isort>=5.5.4", "jax": "jax>=0.2.8,!=0.3.2", "jaxlib": "jaxlib>=0.1.65", + "librosa": "librosa", "modelcards": "modelcards>=0.1.4", "numpy": "numpy", "parameterized": "parameterized", diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index c5aba302042b..9555e3346fa3 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,4 +1,10 @@ -from ..utils import is_flax_available, is_onnx_available, is_torch_available, is_transformers_available +from ..utils import ( + is_flax_available, + is_librosa_available, + is_onnx_available, + is_torch_available, + is_transformers_available, +) if is_torch_available(): @@ -14,6 +20,11 @@ else: from ..utils.dummy_pt_objects import * # noqa F403 +if is_torch_available() and is_librosa_available(): + from .audio_diffusion import AudioDiffusionPipeline, Mel +else: + from ..utils.dummy_torch_and_librosa_objects import AudioDiffusionPipeline, Mel # noqa F403 + if is_torch_available() and is_transformers_available(): from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .latent_diffusion import LDMTextToImagePipeline diff --git a/src/diffusers/pipelines/audio_diffusion/__init__.py b/src/diffusers/pipelines/audio_diffusion/__init__.py new file mode 100644 index 000000000000..37f83bc91487 --- /dev/null +++ b/src/diffusers/pipelines/audio_diffusion/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .mel import Mel +from .pipeline_audio_diffusion import AudioDiffusionPipeline
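One consequence of the conditional import above: `librosa` is a soft dependency, and without it `AudioDiffusionPipeline` and `Mel` resolve to dummy objects that raise on instantiation. A short sketch of a runtime guard using the `is_librosa_available` helper this PR exports (the fallback message is illustrative only):

```python
from diffusers.utils import is_librosa_available

if is_librosa_available():
    from diffusers import AudioDiffusionPipeline, Mel

    mel = Mel()  # safe to construct: librosa is importable
else:
    # the names still import, but the dummy objects raise an ImportError
    # pointing at librosa's installation instructions when instantiated
    print("librosa not found; audio diffusion pipelines are unavailable")
```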
diff --git a/src/diffusers/pipelines/audio_diffusion/mel.py b/src/diffusers/pipelines/audio_diffusion/mel.py new file mode 100644 index 000000000000..f9ac5654b4d2 --- /dev/null +++ b/src/diffusers/pipelines/audio_diffusion/mel.py @@ -0,0 +1,150 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import warnings + +from ...configuration_utils import ConfigMixin, register_to_config +from ...schedulers.scheduling_utils import SchedulerMixin + + +warnings.filterwarnings("ignore") + +import numpy as np # noqa: E402 + +import librosa # noqa: E402 +from PIL import Image # noqa: E402 + + +class Mel(ConfigMixin, SchedulerMixin): + """ + Parameters: + x_res (`int`): x resolution of spectrogram (time) + y_res (`int`): y resolution of spectrogram (frequency bins) + sample_rate (`int`): sample rate of audio + n_fft (`int`): length of the Fast Fourier Transform window + hop_length (`int`): hop length (a higher number is recommended if y_res is below 256) + top_db (`int`): loudest decibel value, which sets the dynamic range of the spectrogram image + n_iter (`int`): number of iterations for Griffin-Lim mel inversion + """ + + config_name = "mel_config.json" + + @register_to_config + def __init__( + self, + x_res: int = 256, + y_res: int = 256, + sample_rate: int = 22050, + n_fft: int = 2048, + hop_length: int = 512, + top_db: int = 80, + n_iter: int = 32, + ): + self.hop_length = hop_length + self.sr = sample_rate + self.n_fft = n_fft + self.top_db = top_db + self.n_iter = n_iter + self.set_resolution(x_res, y_res) + self.audio = None + + def set_resolution(self, x_res: int, y_res: int): + """Set resolution. + + Args: + x_res (`int`): x resolution of spectrogram (time) + y_res (`int`): y resolution of spectrogram (frequency bins) + """ + self.x_res = x_res + self.y_res = y_res + self.n_mels = self.y_res + self.slice_size = self.x_res * self.hop_length - 1 + + def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None): + """Load audio. + + Args: + audio_file (`str`): must be a file on disk due to Librosa limitation or + raw_audio (`np.ndarray`): audio as numpy array + """ + if audio_file is not None: + self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr) + else: + self.audio = raw_audio + + # Pad with silence if necessary. + if len(self.audio) < self.x_res * self.hop_length: + self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))]) + + def get_number_of_slices(self) -> int: + """Get number of slices in audio. + + Returns: + `int`: number of spectrograms the audio can be sliced into + """ + return len(self.audio) // self.slice_size + + def get_audio_slice(self, slice: int = 0) -> np.ndarray: + """Get slice of audio. + + Args: + slice (`int`): slice number of audio (out of get_number_of_slices()) + + Returns: + `np.ndarray`: audio as numpy array + """ + return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)] + + def get_sample_rate(self) -> int: + """Get sample rate. + + Returns: + `int`: sample rate of audio + """ + return self.sr + + def audio_slice_to_image(self, slice: int) -> Image.Image: + """Convert slice of audio to spectrogram. + + Args: + slice (`int`): slice number of audio to convert (out of get_number_of_slices()) + + Returns: + `PIL Image`: grayscale image of x_res x y_res + """ + S = librosa.feature.melspectrogram( + y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels + ) + log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db) + bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8) + image = Image.fromarray(bytedata) + return image + + def image_to_audio(self, image: Image.Image) -> np.ndarray: + """Converts spectrogram to audio.
+ + Args: + image (`PIL Image`): x_res x y_res grayscale image + + Returns: + audio (`np.ndarray`): raw audio + """ + bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width)) + log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db + S = librosa.db_to_power(log_S) + audio = librosa.feature.inverse.mel_to_audio( + S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter + ) + return audio
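To make the conversion logic above concrete, here is a self-contained round trip through `Mel`: slice raw audio, render it as a spectrogram image, and invert it with Griffin-Lim. The sine tone is a stand-in for a real file; only the class defaults defined above are assumed:

```python
import numpy as np

from diffusers import Mel

mel = Mel()  # defaults: x_res=256, y_res=256, sample_rate=22050, hop_length=512
# one slice is x_res * hop_length samples; a 440 Hz tone serves as dummy input
t = np.arange(mel.x_res * mel.hop_length) / mel.get_sample_rate()
mel.load_audio(raw_audio=np.sin(2 * np.pi * 440 * t).astype(np.float32))

print(mel.get_number_of_slices())          # 1
image = mel.audio_slice_to_image(0)        # 256 x 256 grayscale PIL image
reconstructed = mel.image_to_audio(image)  # lossy Griffin-Lim inversion
```

Note the inversion is lossy: phase is discarded in the spectrogram, so `reconstructed` only approximates the input.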
diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py new file mode 100644 index 000000000000..1d7722125d9c --- /dev/null +++ b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py @@ -0,0 +1,248 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from math import acos, sin +from typing import List, Tuple, Union + +import numpy as np +import torch + +from PIL import Image + +from ...models import AutoencoderKL, UNet2DConditionModel +from ...pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput +from ...schedulers import DDIMScheduler, DDPMScheduler +from .mel import Mel + + +class AudioDiffusionPipeline(DiffusionPipeline): + """ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Parameters: + vqvae ([`AutoencoderKL`]): Variational AutoEncoder for Latent Audio Diffusion or None + unet ([`UNet2DConditionModel`]): UNet model + mel ([`Mel`]): transform audio <-> spectrogram + scheduler ([`DDIMScheduler` or `DDPMScheduler`]): de-noising scheduler + """ + + _optional_components = ["vqvae"] + + def __init__( + self, + vqvae: AutoencoderKL, + unet: UNet2DConditionModel, + mel: Mel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + ): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) + + def get_input_dims(self) -> Tuple: + """Returns dimensions of input image + + Returns: + `Tuple`: (height, width) + """ + input_module = self.vqvae if self.vqvae is not None else self.unet + # For backwards compatibility + sample_size = ( + (input_module.sample_size, input_module.sample_size) + if type(input_module.sample_size) == int + else input_module.sample_size + ) + return sample_size + + def get_default_steps(self) -> int: + """Returns default number of steps recommended for inference + + Returns: + `int`: number of steps + """ + return 50 if isinstance(self.scheduler, DDIMScheduler) else 1000 + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + audio_file: str = None, + raw_audio: np.ndarray = None, + slice: int = 0, + start_step: int = 0, + steps: int = None, + generator: torch.Generator = None, + mask_start_secs: float = 0, + mask_end_secs: float = 0, + step_generator: torch.Generator = None, + eta: float = 0, + noise: torch.Tensor = None, + return_dict=True, + ) -> Union[ + Union[AudioPipelineOutput, ImagePipelineOutput], Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]] + ]: + """Generate random mel spectrogram from audio input and convert to audio. + + Args: + batch_size (`int`): number of samples to generate + audio_file (`str`): must be a file on disk due to Librosa limitation or + raw_audio (`np.ndarray`): audio as numpy array + slice (`int`): slice number of audio to convert + start_step (int): step to start from + steps (`int`): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM) + generator (`torch.Generator`): random number generator or None + mask_start_secs (`float`): number of seconds of audio to mask (not generate) at start + mask_end_secs (`float`): number of seconds of audio to mask (not generate) at end + step_generator (`torch.Generator`): random number generator used to de-noise or None + eta (`float`): parameter between 0 and 1 used with DDIM scheduler + noise (`torch.Tensor`): noise tensor of shape (batch_size, 1, height, width) or None + return_dict (`bool`): if True return AudioPipelineOutput, ImagePipelineOutput else Tuple + + Returns: + `List[PIL Image]`: mel spectrograms (`float`, `List[np.ndarray]`): sample rate and raw audios + """ + + steps = steps or self.get_default_steps() + self.scheduler.set_timesteps(steps) + step_generator = step_generator or generator + # For backwards compatibility + if type(self.unet.sample_size) == int: + self.unet.sample_size = (self.unet.sample_size, self.unet.sample_size) + input_dims = self.get_input_dims() + self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0]) + if noise is None: + noise = torch.randn( + (batch_size, self.unet.in_channels, self.unet.sample_size[0], self.unet.sample_size[1]), + generator=generator, + device=self.device, + ) + images = noise + mask = None + + if audio_file is not None or raw_audio is not None: + self.mel.load_audio(audio_file, raw_audio) + input_image = 
self.mel.audio_slice_to_image(slice) + input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape( + (input_image.height, input_image.width) + ) + input_image = (input_image / 255) * 2 - 1 + input_images = torch.tensor(input_image[np.newaxis, :, :], dtype=torch.float).to(self.device) + + if self.vqvae is not None: + input_images = self.vqvae.encode(torch.unsqueeze(input_images, 0)).latent_dist.sample( + generator=generator + )[0] + input_images = 0.18215 * input_images + + if start_step > 0: + images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1]) + + pixels_per_second = ( + self.unet.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length + ) + mask_start = int(mask_start_secs * pixels_per_second) + mask_end = int(mask_end_secs * pixels_per_second) + mask = self.scheduler.add_noise(input_images, noise, torch.tensor(self.scheduler.timesteps[start_step:])) + + for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])): + model_output = self.unet(images, t)["sample"] + + if isinstance(self.scheduler, DDIMScheduler): + images = self.scheduler.step( + model_output=model_output, timestep=t, sample=images, eta=eta, generator=step_generator + )["prev_sample"] + else: + images = self.scheduler.step( + model_output=model_output, timestep=t, sample=images, generator=step_generator + )["prev_sample"] + + if mask is not None: + if mask_start > 0: + images[:, :, :, :mask_start] = mask[:, step, :, :mask_start] + if mask_end > 0: + images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:] + + if self.vqvae is not None: + # 0.18215 was scaling factor used in training to ensure unit variance + images = 1 / 0.18215 * images + images = self.vqvae.decode(images)["sample"] + + images = (images / 2 + 0.5).clamp(0, 1) + images = images.cpu().permute(0, 2, 3, 1).numpy() + images = (images * 255).round().astype("uint8") + images = list( + map(lambda _: Image.fromarray(_[:, :, 0]), images) + if images.shape[3] == 1 + else map(lambda _: Image.fromarray(_, mode="RGB").convert("L"), images) + ) + + audios = list(map(lambda _: self.mel.image_to_audio(_), images)) + if not return_dict: + return images, (self.mel.get_sample_rate(), audios) + + return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images)) + + @torch.no_grad() + def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray: + """Reverse step process: recover noisy image from generated image. 
+ + Args: + images (`List[PIL Image]`): list of images to encode + steps (`int`): number of encoding steps to perform (defaults to 50) + + Returns: + `np.ndarray`: noise tensor of shape (batch_size, 1, height, width) + """ + + # Only works with DDIM as this method is deterministic + assert isinstance(self.scheduler, DDIMScheduler) + self.scheduler.set_timesteps(steps) + sample = np.array( + [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images] + ) + sample = (sample / 255) * 2 - 1 + sample = torch.Tensor(sample).to(self.device) + + for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))): + prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps + alpha_prod_t = self.scheduler.alphas_cumprod[t] + alpha_prod_t_prev = ( + self.scheduler.alphas_cumprod[prev_timestep] + if prev_timestep >= 0 + else self.scheduler.final_alpha_cumprod + ) + beta_prod_t = 1 - alpha_prod_t + model_output = self.unet(sample, t)["sample"] + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output + sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5) + sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output + + return sample + + @staticmethod + def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor: + """Spherical Linear intERPolation + + Args: + x0 (`torch.Tensor`): first tensor to interpolate between + x1 (`torch.Tensor`): second tensor to interpolate between + alpha (`float`): interpolation between 0 and 1 + + Returns: + `torch.Tensor`: interpolated tensor + """ + + theta = acos(torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) / torch.norm(x1)) + return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 1c2e2c9abbc6..325c94b0fadd 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -26,6 +26,7 @@ is_accelerate_available, is_flax_available, is_inflect_available, + is_librosa_available, is_modelcards_available, is_onnx_available, is_safetensors_available, diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 23afb51cf30c..a3719a765c96 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -152,6 +152,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class AudioDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DanceDiffusionPipeline(metaclass=DummyObject): _backends = ["torch"] @@ -242,6 +257,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class Mel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class PNDMPipeline(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_librosa_objects.py
b/src/diffusers/utils/dummy_torch_and_librosa_objects.py new file mode 100644 index 000000000000..ff60d4c4393f --- /dev/null +++ b/src/diffusers/utils/dummy_torch_and_librosa_objects.py @@ -0,0 +1,34 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +# flake8: noqa + +from ..utils import DummyObject, requires_backends + + +class AudioDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "librosa"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "librosa"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + +class Mel(metaclass=DummyObject): + _backends = ["torch", "librosa"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "librosa"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 531f9eab2f7a..a1f3bd19340d 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -180,10 +180,17 @@ _scipy_available = importlib.util.find_spec("scipy") is not None try: _scipy_version = importlib_metadata.version("scipy") - logger.debug(f"Successfully imported transformers version {_scipy_version}") + logger.debug(f"Successfully imported scipy version {_scipy_version}") except importlib_metadata.PackageNotFoundError: _scipy_available = False +_librosa_available = importlib.util.find_spec("librosa") is not None +try: + _librosa_version = importlib_metadata.version("librosa") + logger.debug(f"Successfully imported librosa version {_librosa_version}") +except importlib_metadata.PackageNotFoundError: + _librosa_available = False + _accelerate_available = importlib.util.find_spec("accelerate") is not None try: _accelerate_version = importlib_metadata.version("accelerate") @@ -244,6 +251,10 @@ def is_scipy_available(): return _scipy_available +def is_librosa_available(): + return _librosa_available + + def is_xformers_available(): return _xformers_available @@ -282,6 +293,12 @@ def is_accelerate_available(): scipy` """ +# docstyle-ignore +LIBROSA_IMPORT_ERROR = """ +{0} requires the librosa library but it was not found in your environment. Checkout the instructions on the +installation page: https://librosa.org/doc/latest/install.html and follow the ones that match your environment. +""" + # docstyle-ignore TENSORFLOW_IMPORT_ERROR = """ {0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the @@ -311,6 +328,7 @@ def is_accelerate_available(): ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)), ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), ] ) diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py new file mode 100644 index 000000000000..8289441d2192 --- /dev/null +++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import numpy as np +import torch + +from diffusers import ( + AudioDiffusionPipeline, + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + DiffusionPipeline, + Mel, + UNet2DModel, +) +from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class PipelineFastTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + @property + def dummy_unet(self): + torch.manual_seed(0) + model = UNet2DModel( + sample_size=(32, 64), + in_channels=1, + out_channels=1, + layers_per_block=2, + block_out_channels=(128, 128), + down_block_types=("AttnDownBlock2D", "DownBlock2D"), + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) + return model + + @property + def dummy_vqvae_and_unet(self): + torch.manual_seed(0) + vqvae = AutoencoderKL( + sample_size=(128, 64), + in_channels=1, + out_channels=1, + latent_channels=1, + layers_per_block=2, + block_out_channels=(128, 128), + down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), + up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), + ) + unet = UNet2DModel( + sample_size=(64, 32), + in_channels=1, + out_channels=1, + layers_per_block=2, + block_out_channels=(128, 128), + down_block_types=("AttnDownBlock2D", "DownBlock2D"), + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) + return vqvae, unet + + def test_audio_diffusion(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + mel = Mel() + + scheduler = DDPMScheduler() + pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + output = pipe(generator=generator, steps=4) + audio = output.audios[0] + image = output.images[0] + + generator = torch.Generator(device=device).manual_seed(42) + output = pipe(generator=generator, steps=4, return_dict=False) + image_from_tuple = output[0][0] + + assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length) + assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1] + image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] + image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] + expected_slice = np.array([255, 255, 255, 0, 181, 0, 124, 0, 15, 255]) + assert np.abs(image_slice.flatten() - expected_slice).max() == 0 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0 + + scheduler = DDIMScheduler() + dummy_vqvae_and_unet = self.dummy_vqvae_and_unet + pipe = AudioDiffusionPipeline( + vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler + ) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + 
+ np.random.seed(0) + raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,)) + generator = torch.Generator(device=device).manual_seed(42) + output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) + image = output.images[0] + + assert ( + image.height == self.dummy_vqvae_and_unet[0].sample_size[0] + and image.width == self.dummy_vqvae_and_unet[0].sample_size[1] + ) + image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] + expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121]) + assert np.abs(image_slice.flatten() - expected_slice).max() == 0 + + +@slow +@require_torch_gpu +class PipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_audio_diffusion(self): + device = torch_device + + pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + output = pipe(generator=generator) + audio = output.audios[0] + image = output.images[0] + + assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length) + assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1] + image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] + expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) + assert np.abs(image_slice.flatten() - expected_slice).max() == 0
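The shape assertions in these tests follow from arithmetic established earlier in the PR: each generated clip is `(sample_size[1] - 1) * hop_length` samples long, and `__call__` converts `mask_start_secs`/`mask_end_secs` into spectrogram columns via `pixels_per_second`. A standalone check using the default `Mel` configuration (numbers shown for illustration):

```python
# defaults from Mel and the 256x256 checkpoints exercised above
sample_rate, hop_length, x_res = 22050, 512, 256
sample_width = 256  # unet.sample_size[1]

# audio length asserted by the tests above
audio_len = (sample_width - 1) * hop_length
print(audio_len)  # 130560 samples, about 5.9 s at 22050 Hz

# seconds -> spectrogram columns, as computed in AudioDiffusionPipeline.__call__
pixels_per_second = sample_width * sample_rate / x_res / hop_length
print(int(1.0 * pixels_per_second))  # mask_start_secs=1 masks 43 columns
```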