
It's working. Training LoRA with the latest version of kohya_ss on AMD GPU, Ubuntu 22.04.2 LTS, tested on RX 6800, SD1.5 & SDXL #1484

Closed
tornado73 opened this issue Sep 4, 2023 · 38 comments

Comments

@tornado73

tornado73 commented Sep 4, 2023

ROCm 5.6.0 and 5.7.1
Dependencies fixed.

Changes to requirements.txt

Details

accelerate==0.23.0
albumentations==1.3.0
aiofiles==23.2.1
altair==4.2.2
dadaptation==3.1
diffusers[torch]==0.18.2
easygui==0.98.3
einops==0.6.0
fairscale==0.4.13
ftfy==6.1.1
gradio==3.36.1
huggingface-hub==0.15.1
keras==2.12.0
invisible-watermark==0.2.0
lion-pytorch==0.0.6
lycoris_lora==1.8.3

open-clip-torch==2.20.0
opencv-python==4.7.0.68
prodigyopt==1.0
pytorch-lightning==1.9.0
tensorflow-rocm==2.12.0.560
tensorboard==2.12.0 ; sys_platform != 'darwin'
tensorboard==2.12.0 ; sys_platform == 'darwin'
tensorflow==2.12.0; sys_platform != 'darwin'
rich==13.4.1
safetensors==0.3.1
timm==0.6.12
tk==0.1.0
toml==0.10.2
transformers==4.30.2
voluptuous==0.13.1
wandb==0.15.0

-e . # no_verify  leave this in place so this entry skips the verification stage


Install
Tested on Ubuntu 22.04.2 LTS + RX 6800

git clone https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
python -m venv venv
source venv/bin/activate
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.6
pip install --use-pep517 --upgrade -r requirements.txt
accelerate config

  • This machine
  • No distributed training
  • no
  • no
  • all
  • fp16

sudo apt install python3-tk
export HSA_OVERRIDE_GFX_VERSION=10.3.0
source venv/bin/activate
python kohya_gui.py "$@"
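Before relying on the GUI, it is worth checking that the ROCm build of PyTorch actually sees the card. ROCm wheels expose the GPU through the regular torch.cuda API and set torch.version.hip (it is None on CPU/CUDA builds), so a quick sanity check from inside the venv looks like this:

export HSA_OVERRIDE_GFX_VERSION=10.3.0
source venv/bin/activate
python -c "import torch; print(torch.__version__, torch.version.hip, torch.cuda.is_available())"
python -c "import torch; print(torch.cuda.get_device_name(0))"

If is_available() prints False or the device name call fails, training will end up on the CPU, which is what several comments below describe.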


Changes to gui.sh

#!/usr/bin/env bash
export HSA_OVERRIDE_GFX_VERSION=10.3.0
source venv/bin/activate
python kohya_gui.py "$@"


Configuration files from previous versions are not compatible.
This is my working sample; adjust it for your own setup.

LoRA training on SD 1.5

.json

Details

"LoRA_type": "Standard",
"adaptive_noise_scale": 0,
"additional_parameters": "",
"block_alphas": "",
"block_dims": "",
"block_lr_zero_threshold": "",
"bucket_no_upscale": true,
"bucket_reso_steps": 64,
"cache_latents": true,
"cache_latents_to_disk": false,
"caption_dropout_every_n_epochs": 0.0,
"caption_dropout_rate": 0,
"caption_extension": ".txt",
"clip_skip": 2,
"color_aug": false,
"conv_alpha": 1,
"conv_block_alphas": "",
"conv_block_dims": "",
"conv_dim": 1,
"decompose_both": false,
"dim_from_weights": false,
"down_lr_weight": "",
"enable_bucket": false,
"epoch": 1,
"factor": -1,
"flip_aug": false,
"full_bf16": false,
"full_fp16": false,
"gradient_accumulation_steps": "1",
"gradient_checkpointing": false,
"keep_tokens": "0",
"learning_rate": 0.0001,
"logging_dir": "/home/tor/kohya_ss/LORA/log",
"lora_network_weights": "",
"lr_scheduler": "constant",
"lr_scheduler_args": "",
"lr_scheduler_num_cycles": "",
"lr_scheduler_power": "",
"lr_warmup": 0,
"max_bucket_reso": 2048,
"max_data_loader_n_workers": "1",
"max_resolution": "768,768",
"max_timestep": 1000,
"max_token_length": "75",
"max_train_epochs": "",
"max_train_steps": "",
"mem_eff_attn": true,
"mid_lr_weight": "",
"min_bucket_reso": 256,
"min_snr_gamma": 0,
"min_timestep": 0,
"mixed_precision": "fp16",
"model_list": "custom",
"module_dropout": 0,
"multires_noise_discount": 0,
"multires_noise_iterations": 0,
"network_alpha": 128,
"network_dim": 256,
"network_dropout": 0,
"no_token_padding": false,
"noise_offset": 0,
"noise_offset_type": "Original",
"num_cpu_threads_per_process": 2,
"optimizer": "AdamW",
"optimizer_args": "",
"output_dir": "/home/tor/kohya_ss/LORA/model",
"output_name": "dzetaA4_80_",
"persistent_data_loader_workers": false,
"pretrained_model_name_or_path": "/home/tor/kohya_ss/model/absolutereality_v181.safetensors",
"prior_loss_weight": 1.0,
"random_crop": false,
"rank_dropout": 0,
"reg_data_dir": "",
"resume": "",
"sample_every_n_epochs": 0,
"sample_every_n_steps": 0,
"sample_prompts": "",
"sample_sampler": "euler_a",
"save_every_n_epochs": 1,
"save_every_n_steps": 0,
"save_last_n_steps": 0,
"save_last_n_steps_state": 0,
"save_model_as": "safetensors",
"save_precision": "fp16",
"save_state": false,
"scale_v_pred_loss_like_noise_pred": false,
"scale_weight_norms": 0,
"sdxl": false,
"sdxl_cache_text_encoder_outputs": false,
"sdxl_no_half_vae": true,
"seed": "",
"shuffle_caption": false,
"stop_text_encoder_training": 0,
"text_encoder_lr": 0.0004,
"train_batch_size": 1,
"train_data_dir": "/home/tor/kohya_ss/LORA/img",
"train_on_input": true,
"training_comment": "",
"unet_lr": 0.0001,
"unit": 1,
"up_lr_weight": "",
"use_cp": false,
"use_wandb": false,
"v2": false,
"v_parameterization": false,
"v_pred_like_loss": 0,
"vae_batch_size": 0,
"wandb_api_key": "",
"weighted_captions": false,
"xformers": "none"


LoRA training on SDXL
.json

Details

"LoRA_type": "Standard",
"adaptive_noise_scale": 0,
"additional_parameters": "",
"block_alphas": "",
"block_dims": "",
"block_lr_zero_threshold": "",
"bucket_no_upscale": false,
"bucket_reso_steps": 64,
"cache_latents": true,
"cache_latents_to_disk": true,
"caption_dropout_every_n_epochs": 0.0,
"caption_dropout_rate": 0.05,
"caption_extension": ".txt",
"clip_skip": "1",
"color_aug": false,
"conv_alpha": 1,
"conv_block_alphas": "",
"conv_block_dims": "",
"conv_dim": 1,
"decompose_both": false,
"dim_from_weights": false,
"down_lr_weight": "",
"enable_bucket": false,
"epoch": 50,
"factor": -1,
"flip_aug": false,
"full_bf16": false,
"full_fp16": false,
"gradient_accumulation_steps": "1",
"gradient_checkpointing": true,
"keep_tokens": "0",
"learning_rate": 3e-05,
"logging_dir": "/home/tor/kohya_ss/LORA/log",
"lora_network_weights": "",
"lr_scheduler": "constant",
"lr_scheduler_args": "",
"lr_scheduler_num_cycles": "",
"lr_scheduler_power": "",
"lr_warmup": 0,
"max_bucket_reso": 2048,
"max_data_loader_n_workers": "0",
"max_resolution": "1024,1024",
"max_timestep": 1000,
"max_token_length": "75",
"max_train_epochs": "50",
"max_train_steps": "",
"mem_eff_attn": true,
"mid_lr_weight": "",
"min_bucket_reso": 256,
"min_snr_gamma": 5,
"min_timestep": 0,
"mixed_precision": "fp16",
"model_list": "custom",
"module_dropout": 0,
"multires_noise_discount": 0,
"multires_noise_iterations": 0,
"network_alpha": 32,
"network_dim": 32,
"network_dropout": 0,
"no_token_padding": false,
"noise_offset": 0,
"noise_offset_type": "Original",
"num_cpu_threads_per_process": 2,
"optimizer": "AdamW",
"optimizer_args": "",
"output_dir": "/home/tor/kohya_ss/LORA/model",
"output_name": "dzetaA4xl",
"persistent_data_loader_workers": false,
"pretrained_model_name_or_path": "/home/tor/kohya_ss/model/sdXL_v10VAEFix.safetensors",
"prior_loss_weight": 1.0,
"random_crop": false,
"rank_dropout": 0,
"reg_data_dir": "",
"resume": "",
"sample_every_n_epochs": 0,
"sample_every_n_steps": 0,
"sample_prompts": "",
"sample_sampler": "euler_a",
"save_every_n_epochs": 1,
"save_every_n_steps": 0,
"save_last_n_steps": 0,
"save_last_n_steps_state": 0,
"save_model_as": "safetensors",
"save_precision": "fp16",
"save_state": false,
"scale_v_pred_loss_like_noise_pred": false,
"scale_weight_norms": 0,
"sdxl": true,
"sdxl_cache_text_encoder_outputs": false,
"sdxl_no_half_vae": true,
"seed": "",
"shuffle_caption": false,
"stop_text_encoder_training": 0,
"text_encoder_lr": 3e-05,
"train_batch_size": 3,
"train_data_dir": "/home/tor/kohya_ss/LORA/img",
"train_on_input": true,
"training_comment": "3 repeats. More info: https://civitai.com/articles/1771",
"unet_lr": 3e-05,
"unit": 1,
"up_lr_weight": "",
"use_cp": false,
"use_wandb": false,
"v2": false,
"v_parameterization": false,
"v_pred_like_loss": 0,
"vae_batch_size": 0,
"wandb_api_key": "",
"weighted_captions": false,
"xformers": "none"

Good luck

@tornado73
Author

sd1.5
Screenshot from 2023-09-04 21-16-02
sdxl
Screenshot from 2023-09-04 23-13-46

@tornado73 tornado73 changed the title Work training LORA of the latest version of kohya_ss on AMD GPU,Ubuntu 22.04.2 LTS ,test on RX6800 ,sd1.5&sdxl It's working. Training LORA of the latest version of kohya_ss on AMD GPU,Ubuntu 22.04.2 LTS ,test on RX6800 ,sd1.5&sdxl Sep 4, 2023
@shssoichiro

shssoichiro commented Sep 5, 2023

Thanks. This has been driving me nuts, as I get a segfault when trying to run LoRA training. I'm on a 7900 XTX and went through the full setup as you and many others have shown, yet I still get this segfault:

EDIT: I got past the original segfault by changing to export HSA_OVERRIDE_GFX_VERSION=11.0.0; this part seems very important for the 7000 series. However, now I'm getting a segfault further down in the process 🤦 Below is the new segfault location. What's interesting is that a1111's UI works out of the box with no special PyTorch builds or variable exports (for generating images; I haven't tried training with it), but kohya requires all these extra steps and is still segfaulting.

create LoRA network. base dim (rank): 128, alpha: 64.0
neuron dropout: p=None, rank dropout: p=None, module dropout: p=None
create LoRA for Text Encoder 1:
create LoRA for Text Encoder 2:
create LoRA for Text Encoder: 264 modules.
create LoRA for U-Net: 722 modules.
enable LoRA for text encoder
enable LoRA for U-Net
prepare optimizer, data loader etc.
use AdamW optimizer | {}
running training / 学習開始  num train images * repeats / 学習画像の数×繰り返し回数: 408
  num reg images / 正則化画像の数: 408
  num batches per epoch / 1epochのバッチ数: 816
  num epochs / epoch数: 5
  batch size per device / バッチサイズ: 1
  gradient accumulation steps / 勾配を合計するステップ数 = 1
  total optimization steps / 学習ステップ数: 4080
steps:   0%|                                                                                        | 0/4080 [00:00<?, ?it/s]
epoch 1/5
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/soichiro/build/kohya_ss/venv/bin/accelerate:8 in <module>                                  │
│                                                                                                  │
│   5 from accelerate.commands.accelerate_cli import main                                          │
│   6 if __name__ == '__main__':                                                                   │
│   7 │   sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])                         │
│ ❱ 8 │   sys.exit(main())                                                                         │
│   9                                                                                              │
│                                                                                                  │
│ /home/soichiro/build/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/accelerate_c │
│ li.py:45 in main                                                                                 │
│                                                                                                  │
│   42 │   │   exit(1)                                                                             │
│   43 │                                                                                           │
│   44 │   # Run                                                                                   │
│ ❱ 45 │   args.func(args)                                                                         │
│   46                                                                                             │
│   47                                                                                             │
│   48 if __name__ == "__main__":                                                                  │
│                                                                                                  │
│ /home/soichiro/build/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/launch.py:91 │
│ 8 in launch_command                                                                              │
│                                                                                                  │
│   915 │   elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMA   │
│   916 │   │   sagemaker_launcher(defaults, args)                                                 │
│   917 │   else:                                                                                  │
│ ❱ 918 │   │   simple_launcher(args)                                                              │
│   919                                                                                            │
│   920                                                                                            │
│   921 def main():                                                                                │
│                                                                                                  │
│ /home/soichiro/build/kohya_ss/venv/lib/python3.10/site-packages/accelerate/commands/launch.py:58 │
│ 0 in simple_launcher                                                                             │
│                                                                                                  │
│   577 │   process.wait()                                                                         │
│   578 │   if process.returncode != 0:                                                            │
│   579 │   │   if not args.quiet:                                                                 │
│ ❱ 580 │   │   │   raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)    │
│   581 │   │   else:                                                                              │
│   582 │   │   │   sys.exit(1)                                                                    │
│   583                                                                                            │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
CalledProcessError: Command '['/home/soichiro/build/kohya_ss/venv/bin/python3.10', './sdxl_train_network.py', 
'--enable_bucket', '--min_bucket_reso=256', '--max_bucket_reso=2048', 
'--pretrained_model_name_or_path=/home/soichiro/build/stable-diffusion-webui/models/Stable-diffusion/SD_XL/RealitiesEdgeXL_20
.safetensors', '--train_data_dir=/home/soichiro/Pictures/training/asdf/img/', 
'--reg_data_dir=/home/soichiro/Pictures/training/asdf/reg/', '--resolution=1024,1024', 
'--output_dir=/home/soichiro/Pictures/training/asdf/model/', 
'--logging_dir=/home/soichiro/Pictures/training/asdf/log/', '--network_alpha=64', '--save_model_as=safetensors', 
'--network_module=networks.lora', '--text_encoder_lr=5e-05', '--unet_lr=0.0001', '--network_dim=128', 
'--output_name=TESTING_v1', '--lr_scheduler_num_cycles=5', '--no_half_vae', '--learning_rate=0.0001', 
'--lr_scheduler=cosine_with_restarts', '--lr_warmup_steps=408', '--train_batch_size=1', '--max_train_steps=4080', 
'--save_every_n_epochs=1', '--mixed_precision=fp16', '--save_precision=fp16', '--caption_extension=.txt', 
'--optimizer_type=AdamW', '--max_data_loader_n_workers=0', '--caption_dropout_rate=0.1', '--bucket_reso_steps=64', 
'--shuffle_caption', '--gradient_checkpointing', '--sdpa', '--bucket_no_upscale', '--noise_offset=0.1', 
'--sample_sampler=euler_a', '--sample_prompts=/home/soichiro/Pictures/training/asdf/model/sample/prompt.txt', 
'--sample_every_n_steps=25']' died with <Signals.SIGSEGV: 11>.
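Which HSA_OVERRIDE_GFX_VERSION to export depends on the card's actual gfx target: RDNA2 (6000 series, gfx103x) wants 10.3.0 and RDNA3 (7000 series, gfx110x) wants 11.0.0. If in doubt, rocminfo reports the real target; a quick check, assuming the ROCm runtime is installed:

rocminfo | grep -i gfx
# gfx1030 / gfx1031 / gfx1032  ->  export HSA_OVERRIDE_GFX_VERSION=10.3.0
# gfx1100 / gfx1101 / gfx1102  ->  export HSA_OVERRIDE_GFX_VERSION=11.0.0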

@tornado73
Author

tornado73 commented Sep 5, 2023

I attached the contents of the requirements.txt file; you have the CUDA build installed from your earlier setup.
Do it step by step as I wrote.
DO NOT run gui.sh immediately.

Unfortunately, the resulting models come out broken :-(((
I didn't have much time to check, but three SD 1.5 models trained on the latest version give a completely inadequate result when generating.
On top of that, training runs three times slower.

Unfortunately, the latest version that trains normally, in our case for 1.5 models, is 21.7.10.

@shssoichiro

Thanks, I went back and re-did the full process and it is working now; the only change was setting HSA_OVERRIDE_GFX_VERSION to 11.0.0 for the 7000 series. For SDXL, though, it uses a very large amount of VRAM even with cache latents disabled, all 24 GB, to the point of locking up my machine. Other users seem to have reported that as little as 12 GB works with SDXL, but that is with NVIDIA + xformers, sadly.

@tornado73
Author

Strange, but it worked for me on 16 GB without crashes.

@royallife88

Hi,
@tornado73 I did everything as you wrote in the instructions above, but training still runs on the CPU, which takes a huge amount of time. So what is the problem here, why is it not using the GPU (RX 5500 XT)?
Thanks.
Screenshot from 2023-09-16 11-11-13

@2blackbar

I think it installs the CPU version of torch. The install needs to be checked very carefully by the dev of this repo; he has had all of this installed since the start, so he doesn't test whether it works when installing from zero.
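A quick way to tell whether the CPU-only wheel slipped in: the ROCm nightlies carry a +rocm suffix in the torch version string and set torch.version.hip, while the CPU wheel reports +cpu and a HIP value of None. Inside the venv:

source venv/bin/activate
python -c "import torch; print(torch.__version__); print('HIP:', torch.version.hip)"
# +cpu and HIP: None      ->  the CPU wheel is installed; re-run the install with the ROCm --index-url
# +rocm5.x and a version  ->  the ROCm build is in place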

@ngovandang

Linux only?

@tornado73
Author

Linux only?
Yes, only Linux.

Does it use the GPU (RX 5500 XT)?
The AMD Radeon RX 5500 XT is based on the RDNA 1.0 architecture:

https://rocm.docs.amd.com/en/latest/release/gpu_os_support.html

Unfortunately, it is not supported :-(
Train on Google Colab instead.

@FrostyForest

FrostyForest commented Sep 25, 2023

6700 XT, Ubuntu, ROCm 5.3: successfully trained an SDXL LoRA too.

@Charmandrigo

Has anyone tested ROCm 5.7?

@tornado73
Author

tornado73 commented Nov 24, 2023

Yes, it works:
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7

Screenshot from 2023-11-24 22-54-02

@danielaixer

danielaixer commented Dec 19, 2023

When I try to train a Dreambooth LoRA I get ModuleNotFoundError: No module named 'xformers'

I'm on Ubuntu 22.04, 7900XTX and I can generate images on Stable Diffusion using the GPU.

Install commands:

sudo apt-get install -y python3-tk
rm -rf kohya_ss
git clone https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
python -m venv venv
source venv/bin/activate
# This is the same I installed for SD
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6
pip install --use-pep517 --upgrade -r requirements.txt
accelerate config

requirements.txt contains exactly what was shown here: #1484 (comment)

Run commands:

export HSA_OVERRIDE_GFX_VERSION=11.0.0
source venv/bin/activate
python kohya_gui.py --server_port 7862 --listen 0.0.0.0

I'm pretty sure xFormers for kohya_ss is mandatory, but xFormers doesn't support AMD GPUs... facebookresearch/xformers#807 (comment)

Edit: I got rid of the xFormers error by setting "Parameters > Basic > CrossAttention" to "none", but it still doesn't work and it looks like it's not detecting or using the GPU, as I get messages like:

2023-12-20 19:18:42.944527: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.

And also:
ModuleNotFoundError: No module named 'bitsandbytes'

Which I fixed with:

source venv/bin/activate
pip install bitsandbytes

And now I get
AttributeError: 'NoneType' object has no attribute 'split'

But even if I fixed that, it's still not using the GPU

Edit 2: Using the optimizer AdamW (which doesn't require bitsandbytes) instead of AdamW8bit gives this other error:
RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'

Edit 3: the above issue is fixed with "Mixed precision" set to "no" on the UI parameters. I can train LoRAs now.

@AndreGamaliel

AndreGamaliel commented Dec 31, 2023

When I start training, the GPU doesn't get utilized at all. I installed ROCm correctly, since I use the same environment that I use for Stable Diffusion.
Any ideas, or does it just take this long on an AMD GPU?
My card is an RX 6700 XT.

Screenshot from 2023-12-31 09-09-25

@AnimalEater

Is this still working? I tried it, but I never do this type of stuff; it took so much effort with ChatGPT and I still didn't get it to work. If someone can verify that it does in fact still work, my brother and I will give ChatGPT another chance. (6800 XT)

@danielaixer

Is this still working? I tried it, but I never do this type of stuff; it took so much effort with ChatGPT and I still didn't get it to work. If someone can verify that it does in fact still work, my brother and I will give ChatGPT another chance. (6800 XT)

This should still be valid, but apparently it only uses the CPU, not the GPU.

@GUUser91

GUUser91 commented Feb 15, 2024

I followed tornado73's instructions and when I start to train a model, I get these error messages.

RuntimeError: Error(s) in loading state_dict for CLIPTextModel:

TypeError: StableDiffusionPipeline.init() got an unexpected keyword argument 'image_encoder'

So I updated transformers and diffusers and now these error messages are gone.
I'm using the nightly rocm 6.0 builds of pytorch on my 7900 xtx

@fallfro

fallfro commented Feb 16, 2024

I have attempted installing and reinstalling multiple times using the explicit instructions provided, but it refuses to recognize my AMD GPU.

I still get the "tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used." error.

I have even copied the steps exactly, using the version of requirements.txt that corresponds to the version of Kohya available at the time these instructions were written; it makes no difference. Regardless of what I do, it simply will not see my GPU.

@bmaltais
Owner

bmaltais commented Feb 16, 2024 via email

@zacvine

zacvine commented Mar 3, 2024

This does work as of the latest version downloaded march 2nd:

git clone https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
python3.10 -m venv venv
# this is where people may be going wrong: if the machine has a more recent Python and they don't specify the correct version, the venv ends up on the wrong interpreter
source venv/bin/activate

this also works with the nightly preview rocm6.0
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.0

pip install --use-pep517 --upgrade -r requirements.txt (with requirements.txt updated to reflect the list at the top of this thread; you can disregard the darwin entries for Mac if you are on a PC, and remove tensorflow while keeping tensorflow-rocm, which means fewer steps later)
update diffusers
update transformers
update accelerate
ensure that ONLY tensorflow-rocm is installed and not tensorflow; the output should look similar to this:

(venv) (base) :/kohya_ss$ pip show tensorflow
WARNING: Package(s) not found: tensorflow
(venv) (base) :/kohya_ss$ pip show tensorflow-rocm
Name: tensorflow-rocm
Version: 2.14.0.600
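If the plain tensorflow wheel did get pulled in, removing it inside the venv is enough; tensorflow-rocm can stay:

pip uninstall -y tensorflow
pip show tensorflow-rocm   # should still report a version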

accelerate config

This machine
No distributed training
no
no
no 
all
fp16

sudo apt install python3-tk

BEFORE RUNNING:
open gui.sh in text editor or whatever, delete everything and put the following:

#!/usr/bin/env bash
export PYTHONPATH=$HOME/kohya_ss
export HSA_OVERRIDE_GFX_VERSION=10.3.0
source venv/bin/activate
python kohya_gui.py "$@"

adjust the kohya_ss directory as necessary
assumes 6000 series graphics cards, if using 7000 series then:
HSA_OVERRIDE_GFX_VERSION=11.0.0

Now here is the bit that needs a little tweaking, and I am still trying to balance the packages just right. Start the GUI AFTER editing the script:
./gui.sh

Ignore the gradio warnings, then do your thing in the GUI. But before training, you need to open another terminal and upgrade gradio within the virtual environment (there is probably a way to automate this, but I haven't done it yet):
pip uninstall gradio
pip install gradio==4.0.0

then start your training:

image

The above speeds are on a 6900 XT, SDXL training at resolution 1024,1024.
I have tested the resulting LoRAs as well and they do work. I train on Prodigy with the arguments:
decouple=True weight_decay=0.01 betas=[0.9,0.999] d_coef=0.8 use_bias_correction=True safeguard_warmup=True
I would also quickly note that if you use Prodigy, keep your batch size at 1: it adjusts the learning rate each iteration, so this gives a more adaptive approach. The above speeds let me train an SDXL LoRA in around an hour and a half (running through all 3000 steps usually ends up highly overtrained near the end).

I know this has been closed, but I hope this helps some people.

Edit: you will get API warnings when you first launch the GUI, but you can just leave gradio 4.0.0 installed and it works fine.

@danielaixer

@zacvine

  • Which optimizer are you using for training?
  • Does this message appear somewhere in the console when you start the GUI?: Could not find cuda drivers on your machine, GPU will not be used.

@danielaixer

I have attempted installing and reinstalling multiple times using the explicit instructions provided, but it refuses to recognize my AMD GPU.

I still get the "tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used." error.

I have even copied the steps exactly, using the version of requirements.txt that corresponds to the version of Kohya available at the time these instructions were written; it makes no difference. Regardless of what I do, it simply will not see my GPU.

I think that's... expected with AMD GPU's

Yeah, even if the issue title says "It's working", that doesn't seem to include the GPU...

@zacvine

zacvine commented Mar 3, 2024

@zacvine

* Which optimizer are you using for training?

* Does this message appear somewhere in the console when you start the GUI?: `Could not find cuda drivers on your machine, GPU will not be used. `

No, I only get the API error messages for gradio, but they seem to have no impact so far, and ChatGPT thinks they can be overlooked. It is 100% using the GPU: I used to get nowhere near those speeds, and now I can train an SDXL LoRA in around an hour and a half, whereas on the CPU (which I have used) I would literally have to leave it running overnight. I am training on Prodigy, using the extra optimizer settings decouple=True weight_decay=0.01 betas=[0.9,0.999] d_coef=0.8 use_bias_correction=True safeguard_warmup=True. I am still tinkering, but the training is definitely working; concepts have no issues, but I am getting some artifacts from overtraining in character LoRAs, which I need to muck around with. There are a few different steps I took beyond the original post to get it working, and I cooked it a few times, but basically there are a couple of key points. I don't know whether you overlooked them or not, but here is where I went wrong:

  1. python3.10 -m venv venv - you NEED to include the version number, because otherwise it will default to the system-installed Python, which in my case was 3.11-something, so it won't work.
  2. use the nightly of the latest ROCm: pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.0
  3. DO NOT INSTALL TENSORFLOW. It is in the requirements text in different versions; install ONLY the ROCm version.
  4. update diffusers, transformers, and accelerate; upgrade gradio to 4.0.0 specifically: pip uninstall gradio, then pip install gradio==4.0.0
  5. run accelerate config as per the original post
  6. run sudo apt install python3-tk as per the post
  7. DO NOT RUN THE GUI YET... before running anything, you need to change the gui.sh file: delete everything in it and replace it with:
    #!/usr/bin/env bash
    export PYTHONPATH=$HOME/kohya_ss
    export HSA_OVERRIDE_GFX_VERSION=10.3.0
    source venv/bin/activate
    python kohya_gui.py "$@"

Then start the GUI from the terminal: ./gui.sh
This is exactly what I did, literally 12 hours ago, to get it to work with the latest version.

Best of luck, hopefully something there helped.

@larssn

larssn commented Mar 4, 2024

What's the minimum required memory for training SDXL? I have an RX 6750 with 12 GB, but keep getting OOM, even with AdamW8Bit, gradient checkpointing, and memory-efficient attention.

@bmaltais
Owner

bmaltais commented Mar 4, 2024

I can barely train on my 3090 with 24 GB of VRAM. For LoRA, 16 GB will probably work, but it might be tough.

@zacvine

zacvine commented Mar 5, 2024

What's the minimum required memory for training SDXL? I have an RX 6750 with 12 GB, but keep getting OOM, even with AdamW8Bit, gradient checkpointing, and memory-efficient attention.

You should be able to get away with it on 12 GB. I would try manually resizing the training data down to 1024 before training and disabling buckets. Cache latents to disk: make sure "to disk" is checked, NOT just "cache latents" (it does mean you can't use any data augmentation, like random crop, flip, etc.). Don't go over a batch size of 1. In my experience Adam 8-bit won't work on an AMD card; try Prodigy, which has an adaptive learning rate. I use these settings for it: decouple=True weight_decay=0.01 betas=[0.9,0.999] d_coef=0.8 use_bias_correction=True safeguard_warmup=True, then I use cosine and set all learning rates to 1. You could lower the resolution further if need be, but it might impact the quality of the LoRA, especially if going for a photo style; concepts may turn out just as good, I don't know. There is also a way you can train on a diffusers model, I'm not sure if it is as simple as downloading it from hugging face and selecting it from the script as a custom model, but apparently it uses significantly less resources.

@larssn

larssn commented Mar 6, 2024

@zacvine Thanks for the tips, but I still get OOM even with buckets disabled, and caching to disk. I've tried ROCM 5.7 and the nightly 6.0: no difference there.

Any other memory tips?

There is also a way you can train on a diffusers model, I'm not sure if it is as simple as downloading it from hugging face and selecting it from the script as a custom model, but apparently it uses significantly less resources.

Might look into that later, if this is a lost cause. 😄

My current settings
accelerate launch --num_cpu_threads_per_process=2 "./sdxl_train_network.py" --bucket_no_upscale --bucket_reso_steps=32         
                         --cache_latents --cache_latents_to_disk --caption_extension=".txt" --gradient_checkpointing --learning_rate="1.0"              
                         --lr_scheduler="cosine" --lr_scheduler_num_cycles="1"   
                         --max_data_loader_n_workers="0" --max_grad_norm="1" --resolution="1024,1024" --max_token_length=225 --max_train_steps="10560"  
                         --mem_eff_attn --min_snr_gamma=5 --mixed_precision="no" --network_alpha="8" --network_args "preset=unet-transformer-only"      
                         "conv_dim=4" "conv_alpha=1" "rank_dropout=0" "module_dropout=0" "use_tucker=False" "use_scalar=False"                          
                         "rank_dropout_scale=False" "algo=locon" "train_norm=False" --network_dim=8 --network_module=lycoris.kohya --no_half_vae        
                         --multires_noise_iterations="6" --multires_noise_discount="0.3" --optimizer_args decouple=True weight_decay=0.01               
                         betas=[0.9,0.999] d_coef=0.8 use_bias_correction=True safeguard_warmup=True --optimizer_type="Prodigy"                         
                         --save_every_n_epochs="1"                  
                         --save_model_as=safetensors --save_precision="fp16" --scale_weight_norms="5" --text_encoder_lr=1.0 --train_batch_size="1"      
                         --unet_lr=1.0

Crashes with: torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 11.98 GiB of which 0 bytes is free
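One more knob that may be worth trying before lowering the resolution: PyTorch's ROCm builds read PYTORCH_HIP_ALLOC_CONF (the HIP counterpart of PYTORCH_CUDA_ALLOC_CONF), and capping the split size sometimes gets past fragmentation-driven OOMs. Not verified on this exact card, just a sketch:

export PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.8,max_split_size_mb:128
./gui.sh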

@hqnicolas

hqnicolas commented Mar 16, 2024

@zacvine Thank You!
kohya_ss --branch v22.2.2
7800xt Training LORA of the latest version of kohya_ss on AMD GPU Ubuntu 22.04.2 LTS sd1.5&sdxl

@zacvine

zacvine commented Mar 17, 2024

@zacvine Thanks for the tips, but I still get OOM even with buckets disabled, and caching to disk. I've tried ROCM 5.7 and the nightly 6.0: no difference there.

Any other memory tips?

There is also a way you can train on a diffusers model, I'm not sure if it is as simple as downloading it from hugging face and selecting it from the script as a custom model, but apparently it uses significantly less resources.

Might look into that later, if this is a lost cause. 😄
My current settings

accelerate launch --num_cpu_threads_per_process=2 "./sdxl_train_network.py" --bucket_no_upscale --bucket_reso_steps=32         
                         --cache_latents --cache_latents_to_disk --caption_extension=".txt" --gradient_checkpointing --learning_rate="1.0"              
                         --lr_scheduler="cosine" --lr_scheduler_num_cycles="1"   
                         --max_data_loader_n_workers="0" --max_grad_norm="1" --resolution="1024,1024" --max_token_length=225 --max_train_steps="10560"  
                         --mem_eff_attn --min_snr_gamma=5 --mixed_precision="no" --network_alpha="8" --network_args "preset=unet-transformer-only"      
                         "conv_dim=4" "conv_alpha=1" "rank_dropout=0" "module_dropout=0" "use_tucker=False" "use_scalar=False"                          
                         "rank_dropout_scale=False" "algo=locon" "train_norm=False" --network_dim=8 --network_module=lycoris.kohya --no_half_vae        
                         --multires_noise_iterations="6" --multires_noise_discount="0.3" --optimizer_args decouple=True weight_decay=0.01               
                         betas=[0.9,0.999] d_coef=0.8 use_bias_correction=True safeguard_warmup=True --optimizer_type="Prodigy"                         
                         --save_every_n_epochs="1"                  
                         --save_model_as=safetensors --save_precision="fp16" --scale_weight_norms="5" --text_encoder_lr=1.0 --train_batch_size="1"      
                         --unet_lr=1.0

Crashes with: torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 11.98 GiB of which 0 bytes is free

Sorry for the late response.

Also, try swapping Prodigy for AdamW (not Adam8bit, as that won't work). AdamW should use less memory than Prodigy, and you are only off by 20 MiB, so this might be enough to push it over without lowering the resolution. Change the optimizer arguments to weight_decay=0.01 betas=[0.9,0.999] (the others won't be needed, unless you are using a warmup phase, in which case leave in safeguard_warmup=True, but it doesn't look like you are).

If this isn't enough, I think for the sake of 20 MiB you could try lowering the resolution a tiny bit. If it's a concept, style, or anime character this might not be noticeable; it could be a concern if it is a realistic character or something that requires a lot of detail. I know some people who train SDXL at the SD 2.0 size, which off the top of my head is 768x768. You may not even need to go that low.

Aside from that, I can only suggest using a third-party tool before training to make sure GPU memory is completely empty, for example in case you have a model loaded in Stable Diffusion or something at the same time.

best of luck.

@JohnDoe02

JohnDoe02 commented Mar 17, 2024

@danielaixer

Regarding the bitsandbytes issue, I was able to make progress by installing this fork for rocm into my venv:
https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6

Do not be concerned by the version number; this works fine and was actually tested by the author himself for ROCm versions >5.6, cf. bitsandbytes-foundation/bitsandbytes#756. Personally, I can confirm that it works fine on Arch Linux with ROCm 6.0 installed from the repos. Compilation and installation are straightforward:

source kohya_ss/venv/bin/activate
git clone https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6.git
cd bitsandbytes-rocm-5.6
# Use gfx1030 for 69xx cards, gfx1100 is for 79xx
ROCM_TARGET=gfx1100 make hip
pip install -e ./

With the rocm-enabled bitsandbytes, I am able to use all kind of optimizers such as AdamW8Bit or Adafactor. Also mixed precision using, e.g., bf16 works when configured with accelerate config. Regarding cross attention, I can select sdpa instead of xformers without issues.
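Before kicking off a long run, a quick import check confirms the fork actually built and loads against the installed ROCm (the exact version string depends on the fork):

source kohya_ss/venv/bin/activate
python -c "import bitsandbytes as bnb; print(bnb.__version__)"
# if the import fails with a libbitsandbytes error, the ROCM_TARGET used for the build
# probably does not match the card's gfx target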

@hqnicolas

hqnicolas commented Mar 18, 2024

@JohnDoe02 your Bitsandbytes works here! Thank you!

@zacvine

zacvine commented Mar 18, 2024

@JohnDoe02 Yeah, this is perfect; I trained a LoRA with Adam8bit using this and it actually worked, instead of giving me empty black boxes. I would ask, though: when I use bf16 it seems to increase the training time by roughly 400% (about 8.7 it/s), whereas fp16 with the 8-bit optimizer works well, so I don't think it's specific to the bitsandbytes setup. Is this a regular thing in your experience? I am using an older 6000-series card (6900 XT), if you think that could be the issue.

@larssn you should definitely try installing this if you haven't, because Adam8bit uses significantly less GPU memory.

@larssn

larssn commented Mar 18, 2024

Nice, thanks for the heads up. I definitely will.

Did you guys end up downgrading to ROCM 5.6 to be compatible with bitsandbytes, or isn't that necessary?

@zacvine

zacvine commented Mar 18, 2024

@larssn No, I still run the nightly with no problems; I just did it as per the post. I'm trying the same/similar process now to see if it works with EveryDream, because it has an option to find the best learning rate: it trains with validation enabled for a small number of steps on your dataset, using a learning rate that increases over time. This produces a distinctively shaped curve in the validation graph, illustrating the steps where the model was training best.

@Charmandrigo

I think it would be more useful if this whole setup were just a fork you could install quickly.

@Insistentelk

Insistentelk commented Jul 21, 2024

There's a new error with kohya_ss/kohya_gui/class_gui_config.py", line 1, in
import toml
ModuleNotFoundError: No module named 'toml'

using
pip install toml
leading to
pip install psutil
leading to
pip install transformers

This does make the program and system open... but any training fails with:

can't open file sdxl_train_network.py [Errno 2] No such file or directory

Yes, the sd-scripts folder, where it should be, is completely empty.

I'm no coder, so I have no idea what's going on; I'm just following the instructions.

Edit 9-24-24

Also, apparently all python commands need to be replaced with python3 for some reason.
And now the process fails at: Obtaining file:///home/smith/kohya_ss/sd-scripts (from -r requirements.txt (line 35))
ERROR: file:///home/smith/kohya_ss/sd-scripts (from -r requirements.txt (line 35)) does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.
accelerate: command not found
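The empty sd-scripts folder and the "does not appear to be a Python project" error usually mean the git submodule was never initialized; kohya_ss pulls sd-scripts in as a submodule. A likely fix, assuming a plain git clone was used:

cd kohya_ss
git submodule update --init --recursive
source venv/bin/activate
pip install --use-pep517 --upgrade -r requirements.txt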

@nxtkofi

nxtkofi commented Sep 24, 2024

This does work as of the latest version downloaded march 2nd:

Arch Linux + AMD RX 6800 XT confirmed to work with this setup. Lifesaver, man; I've been wandering around for about 4-5 hours trying to figure out how to do this.

@Charmandrigo

Charmandrigo commented Sep 28, 2024

imagen

This is something I achieved: Kohya running on a 7900 XTX, doing SDXL 1024,1024 training with a batch size of 3.

I had to use custom torch versions (the --pre ROCm 6.2 variants) plus multiple custom libraries published on the ROCm documentation page.
I believe 6.2 comes with FlashAttention-2 implemented; at least, that's what I'm assuming given the speeds I'm getting.

The only thing I haven't managed to run yet is xformers, despite it being properly built and compiled for gfx1100.

Also it's running on WSL2 :b (which I don't recommend tbh)
