diff --git a/Readme.md b/Readme.md
index 2d7c237..107d2d1 100755
--- a/Readme.md
+++ b/Readme.md
@@ -53,10 +53,13 @@ cd hqq/kernels && python setup_cuda.py install;
 
 The ```HQQBackend.ATEN_BACKPROP``` backend with ```setup_cuda``` uses CUDA kernels for the dequantization step. This leads to a significant speed-up compared to ```PYTORCH_COMPILE``` and can be combined with ```model = torch.compile(model)``` for even faster runtime:
 
-[figure: HQQ Aten CUDA - Titan RTX]
+[figure: Titan RTX]
+[figure: A100]
 
 ### Supported Models
 #### LLMs
@@ -235,12 +238,12 @@ from hqq.core.peft import PeftUtils
 
 base_lora_params = {'lora_type':'default', 'r':32, 'lora_alpha':64, 'dropout':0.05, 'train_dtype':torch.bfloat16}
 lora_params = {'self_attn.q_proj': base_lora_params,
-    'self_attn.k_proj': base_lora_params,
-    'self_attn.v_proj': base_lora_params,
-    'self_attn.o_proj': base_lora_params,
-    'mlp.gate_proj' : None,
-    'mlp.up_proj' : None,
-    'mlp.down_proj' : None}
+               'self_attn.k_proj': base_lora_params,
+               'self_attn.v_proj': base_lora_params,
+               'self_attn.o_proj': base_lora_params,
+               'mlp.gate_proj'   : None,
+               'mlp.up_proj'     : None,
+               'mlp.down_proj'   : None}
 
 PeftUtils.add_lora(model, lora_params)
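
For readers skimming the patch, the change documented in the first hunk boils down to a one-line backend switch. The snippet below is a minimal sketch, not part of the patch: it assumes ```HQQLinear``` and ```HQQBackend``` are imported from ```hqq.core.quantize``` (as elsewhere in the Readme), and uses ```model``` as a placeholder for a model that has already been quantized with HQQ after building the CUDA kernels via ```setup_cuda.py```:

```Python
# Minimal sketch (assumptions noted above): switch dequantization to the CUDA
# kernels, then optionally compile the whole model for an additional speed-up.
import torch
from hqq.core.quantize import HQQLinear, HQQBackend

# Class-level switch: applies to every HQQ-quantized linear layer.
HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP)

# `model` is a placeholder for an already HQQ-quantized model (see assumptions above).
model = torch.compile(model)
```

Since the ```ATEN_BACKPROP``` variant also supports backpropagation through the dequantized weights, it is the kind of backend you would combine with the LoRA setup touched by the second hunk.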