open-mmlab · plyfager · Dec 2, 2022 · Sep 14, 2022 · Sep 16, 2022 · Sep 16, 2022
diff --git a/configs/_base_/datasets/imagenet_512.py b/configs/_base_/datasets/imagenet_512.py
@@ -0,0 +1,45 @@
+# Dataset settings: ImageNet at 512x512.
+dataset_type = 'ImageNet'
+
+# Unlike mmcls, we follow the BigGAN setting: `RandomCropLongEdge` in training
+# and `CenterCropLongEdge` in testing, then resize to 512x512 with Pillow.
+train_pipeline = [
+    dict(type='LoadImageFromFile', key='img'),
+    dict(type='RandomCropLongEdge', keys=['img']),
+    dict(type='Resize', scale=(512, 512), keys=['img'], backend='pillow'),
+    dict(type='Flip', flip_ratio=0.5, direction='horizontal'),  # train only
+    dict(type='PackEditInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', key='img'),
+    dict(type='CenterCropLongEdge', keys=['img']),
+    dict(type='Resize', scale=(512, 512), backend='pillow'),
+    dict(type='PackEditInputs')
+]
+
+train_dataloader = dict(
+    batch_size=None,  # NOTE(review): presumably set by the inheriting config
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='./data/imagenet/',
+        ann_file='meta/train.txt',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True)
+
+val_dataloader = dict(
+    batch_size=None,  # NOTE(review): imagenet_64.py uses 64 here; confirm
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='./data/imagenet/',
+        ann_file='meta/train.txt',  # same annotation file as train_dataloader
+        data_prefix='train',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True)
+
+test_dataloader = val_dataloader  # test reuses the validation loader
diff --git a/configs/_base_/datasets/imagenet_64.py b/configs/_base_/datasets/imagenet_64.py
@@ -0,0 +1,45 @@
+# Dataset settings: ImageNet at 64x64.
+dataset_type = 'ImageNet'
+
+# Unlike mmcls, we follow the BigGAN setting: `RandomCropLongEdge` in training
+# and `CenterCropLongEdge` in testing, then resize to 64x64 with Pillow.
+train_pipeline = [
+    dict(type='LoadImageFromFile', key='img'),
+    dict(type='RandomCropLongEdge', keys=['img']),
+    dict(type='Resize', scale=(64, 64), keys=['img'], backend='pillow'),
+    dict(type='Flip', flip_ratio=0.5, direction='horizontal'),  # train only
+    dict(type='PackEditInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', key='img'),
+    dict(type='CenterCropLongEdge', keys=['img']),
+    dict(type='Resize', scale=(64, 64), backend='pillow'),
+    dict(type='PackEditInputs')
+]
+
+train_dataloader = dict(
+    batch_size=None,  # NOTE(review): presumably set by the inheriting config
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='./data/imagenet/',
+        ann_file='meta/train.txt',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True)
+
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='./data/imagenet/',
+        ann_file='meta/train.txt',  # same annotation file as train_dataloader
+        data_prefix='train',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True)
+
+test_dataloader = val_dataloader  # test reuses the validation loader
diff --git a/configs/disco/README.md b/configs/disco/README.md
@@ -0,0 +1,149 @@
+# Disco Diffusion(Google Colab)
+
+> [Disco Diffusion](https://github.com/alembics/disco-diffusion)
+
+> **Task**: Text2Image
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+<!-- [ABSTRACT] -->
+
+Disco Diffusion (DD) is a Google Colab Notebook which leverages an AI Image generating technique called CLIP-Guided Diffusion to allow you to create compelling and beautiful images from just text inputs. Created by Somnai, augmented by Gandamu, and building on the work of RiversHaveWings, nshepperd, and many others.
+
+<!-- [IMAGE] -->
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/22982797/201001789-7ef108a0-f607-401e-98dc-4e16d6be384f.png"/>
+</div>
+
+## Models Card
+
+## Quick Start
+
+To get started, here is the simplest way to generate an image in 6 lines of code.
+
+```python
+from mmengine import Config, MODELS
+from mmedit.utils import register_all_modules
+register_all_modules()
+
+disco = MODELS.build(Config.fromfile('configs/disco/disco-baseline.py').model).cuda().eval()
+text_prompts = {
+    0: ["A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation.", "yellow color scheme"]
+}
+image = disco.infer(height=768, width=1280, text_prompts=text_prompts, show_progress=True, num_inference_steps=250, eta=0.8)['samples']
+```
+
+## Advanced Tutorials
+
+This section provides a detailed description and advanced usage.
+
+### Overall Architecture(In Construction)
+
+### Infer Settings
+
+For a fixed Disco Diffusion model, there are several runtime settings.
+
+1. Image Resolution.
+   Within the limits of your device, you can set the height and width of the image as you like.
+
+Running the following code,
+
+```python
+from mmengine import Config, MODELS
+from mmedit.utils import register_all_modules
+register_all_modules()
+
+disco = MODELS.build(Config.fromfile('configs/disco/disco-baseline.py').model).cuda().eval()
+text_prompts = {
+    0: ["A beautiful painting of a singular lighthouse, shining its light across a tumultuous sea of blood by greg rutkowski and thomas kinkade, Trending on artstation.", "yellow color scheme"]
+}
+image = disco.infer(height=512, width=1024, text_prompts=text_prompts, show_progress=True, num_inference_steps=250, eta=0.8)['samples']
+```
+
+we get
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/22982797/201041058-b47a897c-852e-4b78-9627-48706dade1d5.png"/>
+</div>
+
+2. Initial image.
+   You can set an initial image for your artwork by simply setting `init_image` to your image path. By setting `init_scale`, you can adjust how similar the result is to the initial image.
+
+**Note**: Make sure you set `skip_steps` to ~50% of your steps if you want to use an init image.
+
+For example, take this picture as the initial image:
+
+<div align="center">
+  <br/>
+  <img src="https://user-images.githubusercontent.com/22982797/201272831-81f2b1f4-3e28-4468-8e84-b7c52ad74e11.jpg" width="800"/>
+</div>
+
+Note that `init_scale` needs to be set in the config; this field is contained in `loss_cfg`.
+
+```python
+from mmengine import Config, MODELS
+from mmedit.utils import register_all_modules
+
+register_all_modules()
+config = 'configs/disco/disco-init_scale20.py'
+disco = MODELS.build(Config.fromfile(config).model).cuda().eval()
+text_prompts = {
+    0: ["a huge dragon, human like, flying with flame, and two big wings"]
+}
+image_path = 'PATH/TO/INIT_IMAGE'
+image = disco.infer(width=1280, height=768, init_image=image_path, text_prompts=text_prompts, show_progress=True, num_inference_steps=250, skip_steps=150, eta=0.8)['samples']
+```
+
+and get
+
+<div align="center">
+  <br/>
+  <img src="https://user-images.githubusercontent.com/22982797/201273268-ce775eeb-fb9d-4997-a3f6-b93835593f36.png" width="800"/>
+</div>
+
+Then we use the default `init_scale=1000`:
+
+```python
+from mmengine import Config, MODELS
+from mmedit.utils import register_all_modules
+
+register_all_modules()
+config = 'configs/disco/disco-baseline.py'
+disco = MODELS.build(Config.fromfile(config).model).cuda().eval()
+text_prompts = {
+    0: ["a huge dragon, human like, flying with flame, and two big wings"]
+}
+image_path = 'PATH/TO/INIT_IMAGE'
+image = disco.infer(width=1280, height=768, init_image=image_path, text_prompts=text_prompts, show_progress=True, num_inference_steps=250, skip_steps=150, eta=0.8)['samples']
+```
+
+and get
+
+<div align="center">
+  <br/>
+  <img src="https://user-images.githubusercontent.com/22982797/201273252-3e9d1293-5a83-4ca1-a177-b9fa2639ba14.png" width="800"/>
+</div>
+
+### Unet Settings(In Construction)
+
+### Clip Models Settings(In Construction)
+
+### Cutter Settings(In Construction)
+
+### Diffuser Settings(In Construction)
+
+### Loss Settings(In Construction)
+
+## Citation
+
+```bibtex
+@misc{github,
+  author={alembics},
+  title={disco-diffusion},
+  year={2022},
+  url={https://github.com/alembics/disco-diffusion},
+}
+```
diff --git a/configs/disco/disco-baseline.py b/configs/disco/disco-baseline.py
@@ -0,0 +1,59 @@
+unet = dict(  # denoising U-Net config; passed to `model` as `unet`
+    type='DenoisingUnet',
+    image_size=512,
+    in_channels=3,
+    base_channels=256,
+    resblocks_per_downsample=2,
+    attention_res=(32, 16, 8),
+    norm_cfg=dict(type='GN32', num_groups=32),
+    dropout=0.0,
+    num_classes=0,
+    use_fp16=True,  # `model` below also sets use_fp16=True
+    resblock_updown=True,
+    attention_cfg=dict(
+        type='MultiHeadAttentionBlock',
+        num_heads=4,
+        num_head_channels=64,
+        use_new_attention_order=False),
+    use_scale_shift_norm=True)
+
+unet_ckpt_path = 'https://download.openmmlab.com/mmediting/synthesizers/disco/adm-u_finetuned_imagenet-512x512-ab471d70.pth'  # noqa
+secondary_model_ckpt_path = 'https://download.openmmlab.com/mmediting/synthesizers/disco/secondary_model_imagenet_2.pth'  # noqa
+pretrained_cfgs = dict(  # one entry per component: checkpoint URL + prefix
+    unet=dict(ckpt_path=unet_ckpt_path, prefix='unet'),
+    secondary_model=dict(ckpt_path=secondary_model_ckpt_path, prefix=''))
+
+secondary_model = dict(type='SecondaryDiffusionImageNet2')
+
+diffuser = dict(  # DDIM scheduler; passed to `model` as `diffuser`
+    type='DDIMScheduler',
+    variance_type='learned_range',
+    beta_schedule='linear',
+    clip_sample=False)
+
+clip_models_cfg = [  # three CLIP variants: ViT-B/32, ViT-B/16 and RN50
+    dict(type='ClipWrapper', clip_type='clip', name='ViT-B/32', jit=False),
+    dict(type='ClipWrapper', clip_type='clip', name='ViT-B/16', jit=False),
+    dict(type='ClipWrapper', clip_type='clip', name='RN50', jit=False)
+]
+
+# pretrained_cfgs = None
+cutter_cfg = dict(
+    cut_overview=eval('[12]*400+[4]*600'),
+    cut_innercut=eval('[4]*400+[12]*600'),
+    cut_ic_pow=eval('[1]*1000'),
+    cut_icgray_p=eval('[0.2]*400+[0]*600'),
+    cutn_batches=4)
+
+loss_cfg = dict(tv_scale=0, range_scale=150, sat_scale=0, init_scale=1000)
+
+model = dict(  # assembles the component configs defined above
+    type='DiscoDiffusion',
+    unet=unet,
+    diffuser=diffuser,
+    secondary_model=secondary_model,
+    cutter_cfg=cutter_cfg,
+    loss_cfg=loss_cfg,
+    clip_models_cfg=clip_models_cfg,
+    use_fp16=True,
+    pretrained_cfgs=pretrained_cfgs)
diff --git a/configs/disco/disco-init_scale20.py b/configs/disco/disco-init_scale20.py
@@ -0,0 +1,59 @@
+unet = dict(  # denoising U-Net config; passed to `model` as `unet`
+    type='DenoisingUnet',
+    image_size=512,
+    in_channels=3,
+    base_channels=256,
+    resblocks_per_downsample=2,
+    attention_res=(32, 16, 8),
+    norm_cfg=dict(type='GN32', num_groups=32),
+    dropout=0.0,
+    num_classes=0,
+    use_fp16=True,  # `model` below also sets use_fp16=True
+    resblock_updown=True,
+    attention_cfg=dict(
+        type='MultiHeadAttentionBlock',
+        num_heads=4,
+        num_head_channels=64,
+        use_new_attention_order=False),
+    use_scale_shift_norm=True)
+
+unet_ckpt_path = 'https://download.openmmlab.com/mmediting/synthesizers/disco/adm-u_finetuned_imagenet-512x512-ab471d70.pth'  # noqa
+secondary_model_ckpt_path = 'https://download.openmmlab.com/mmediting/synthesizers/disco/secondary_model_imagenet_2.pth'  # noqa
+pretrained_cfgs = dict(  # one entry per component: checkpoint URL + prefix
+    unet=dict(ckpt_path=unet_ckpt_path, prefix='unet'),
+    secondary_model=dict(ckpt_path=secondary_model_ckpt_path, prefix=''))
+
+secondary_model = dict(type='SecondaryDiffusionImageNet2')
+
+diffuser = dict(  # DDIM scheduler; passed to `model` as `diffuser`
+    type='DDIMScheduler',
+    variance_type='learned_range',
+    beta_schedule='linear',
+    clip_sample=False)
+
+clip_models_cfg = [  # three CLIP variants: ViT-B/32, ViT-B/16 and RN50
+    dict(type='ClipWrapper', clip_type='clip', name='ViT-B/32', jit=False),
+    dict(type='ClipWrapper', clip_type='clip', name='ViT-B/16', jit=False),
+    dict(type='ClipWrapper', clip_type='clip', name='RN50', jit=False)
+]
+
+# pretrained_cfgs = None
+cutter_cfg = dict(
+    cut_overview=eval('[12]*400+[4]*600'),
+    cut_innercut=eval('[4]*400+[12]*600'),
+    cut_ic_pow=eval('[1]*1000'),
+    cut_icgray_p=eval('[0.2]*400+[0]*600'),
+    cutn_batches=4)
+
+loss_cfg = dict(tv_scale=0, range_scale=150, sat_scale=0, init_scale=20)
+
+model = dict(  # same as disco-baseline.py except loss_cfg init_scale=20
+    type='DiscoDiffusion',
+    unet=unet,
+    diffuser=diffuser,
+    secondary_model=secondary_model,
+    cutter_cfg=cutter_cfg,
+    loss_cfg=loss_cfg,
+    clip_models_cfg=clip_models_cfg,
+    use_fp16=True,
+    pretrained_cfgs=pretrained_cfgs)
diff --git a/configs/disco/metafile.yml b/configs/disco/metafile.yml
@@ -0,0 +1,9 @@
+Collections:
+- Metadata:
+    Architecture:
+    - Disco Diffusion
+  Name: Disco Diffusion
+  Paper:
+  - https://github.com/alembics/disco-diffusion
+  README: configs/disco/README.md
+Models: []