# jbdiff-sample-v1.yaml

# Sampling-time settings. Entries under `diffusion` are keyed by `dd` plus the
# integer levels 0-2; the integer keys match the level keys used under
# model.diffusion.
sampling:
      diffusion:

            # `dd` is the only entry that carries cross-fade settings
            # (xfade_*) and the only one without embedding_strength.
            # NOTE(review): presumably the Dance Diffusion stage -- confirm.
            dd:
                  num_steps: 150
                  init_strength: 0.28
                  # Checkpoint paths in this section are relative ("./...").
                  ckpt_loc: './epoch=2125-step=218000.ckpt'
                  xfade_samples: 1536
                  xfade_style: 'constant-power'

            0:
                  num_steps: 20
                  embedding_strength: 1.3
                  init_strength: 0.5
                  ckpt_loc: './epoch=543-step=705000.ckpt'

            1:
                  num_steps: 70
                  embedding_strength: 2.0
                  init_strength: 0.67
                  ckpt_loc: './epoch=1404-step=455000.ckpt'

            # NOTE(review): unlike levels 0 and 1, this entry has no
            # init_strength key -- confirm the consumer tolerates its absence.
            2:
                  num_steps: 250
                  embedding_strength: 4.0
                  ckpt_loc: './epoch=4938-step=400000_vqvae_add.ckpt'
# Model hyperparameters: VQ-VAE settings plus one diffusion network config per
# VQ-VAE level (keys 2, 1, 0).
model:
      vqvae:
            batch_size: 32
            aug_shift: true
            base_tokens: 768
            context_mult: 2
      # The three level configs below are identical except for
      # resnet_dropout_rate (0.05 at level 2, 0.0 at levels 1 and 0).
      # net_t, diffusion_t and sampler_t are dotted names inside
      # audio_diffusion_pytorch. Each per-layer list (channels, factors,
      # items, attentions, cross_attentions) has nine entries.
      # embedding_max_length is not set in this file; the commented-out line in
      # each level records the expression context_mult*base_tokens
      # (2 * 768 = 1536 with the vqvae values above).
      # NOTE(review): presumably computed by the consumer -- confirm.
      diffusion:
            2:  # Level 2 is bottom level of vqvae
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length: context_mult*base_tokens
                  resnet_dilation_factor: 3
                  resnet_dropout_rate: 0.05
            1:  # Level 1 is middle level of vqvae
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length: context_mult*base_tokens
                  resnet_dilation_factor: 3
                  resnet_dropout_rate: 0.0
            0:  # Level 0 is top level of vqvae
                  net_t: audio_diffusion_pytorch.UNetV0
                  in_channels: 64
                  channels: [64, 128, 128, 256, 256, 512, 512, 1024, 1024]
                  factors: [1, 2, 1, 2, 1, 2, 1, 2, 1]
                  items: [1, 3, 3, 3, 3, 3, 3, 3, 1]
                  attentions: [0, 0, 0, 0, 1, 0, 1, 0, 0]
                  cross_attentions: [0, 1, 0, 1, 0, 1, 0, 1, 1]
                  attention_heads: 8
                  attention_features: 64
                  diffusion_t: audio_diffusion_pytorch.VDiffusion
                  sampler_t: audio_diffusion_pytorch.VSampler
                  embedding_features: 64
                  use_embedding_cfg: true
                  # embedding_max_length: context_mult*base_tokens
                  resnet_dilation_factor: 3
                  resnet_dropout_rate: 0.0