From ce5bcff750c95530cc2dab2e0fe5aa0d547af923 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 2 Dec 2024 17:28:58 -0500
Subject: [PATCH] various test fixes for flaky tests (#2110)

* add mhenrichsen/alpaca_2k_test with revision dataset download fixture for flaky tests
* log slowest tests
* pin pynvml==11.5.3
* fix load local hub path
* optimize for speed with smaller models and val_set_size
* replace pynvml
* make the resume from checkpoint e2e faster
* make tests smaller
---
 cicd/cicd.sh                            |  6 ++---
 requirements.txt                        |  2 +-
 src/axolotl/utils/bench.py              | 23 +++++++++++++++-----
 src/axolotl/utils/data/sft.py           |  2 +-
 tests/conftest.py                       | 20 ++++++++++++++---
 tests/core/test_trainer_builder.py      |  7 +++---
 tests/e2e/patched/test_fa_xentropy.py   |  6 ++---
 tests/e2e/patched/test_resume.py        | 29 +++++++++++++------------
 tests/e2e/patched/test_unsloth_qlora.py |  6 ++---
 tests/e2e/test_optimizers.py            |  6 +++--
 tests/e2e/test_relora_llama.py          |  1 +
 tests/test_datasets.py                  |  4 ++--
 tests/test_perplexity.py                | 10 +++++----
 13 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index e199e112ff..7a4a315044 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -e

-pytest -n8 --ignore=tests/e2e/ /workspace/axolotl/tests/
-pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
-pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
+pytest --durations=10 -n8 --ignore=tests/e2e/ /workspace/axolotl/tests/
+pytest --durations=10 -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
+pytest --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
diff --git a/requirements.txt b/requirements.txt
index f2086d4427..456c63ca51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ numpy>=1.24.4,<=2.0.1
 evaluate==0.4.1
 scipy
 scikit-learn==1.4.2
-pynvml
+nvidia-ml-py==12.560.30
 art
 gradio==3.50.2
 tensorboard
diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py
index 57471ae0d8..3d338aff10 100644
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -1,13 +1,24 @@
 """Benchmarking and measurement utilities"""

 import functools

-import pynvml
 import torch
-from pynvml.nvml import NVMLError
 from transformers.utils.import_utils import is_torch_npu_available

 from axolotl.utils.distributed import get_device_type

+try:
+    from pynvml import (
+        NVMLError,
+        nvmlDeviceGetHandleByIndex,
+        nvmlDeviceGetMemoryInfo,
+        nvmlInit,
+    )
+except ImportError:
+    NVMLError = None
+    nvmlDeviceGetHandleByIndex = None
+    nvmlDeviceGetMemoryInfo = None
+    nvmlInit = None
+

 def check_cuda_device(default_value):
@@ -68,10 +79,12 @@ def gpu_memory_usage_smi(device=0):
         device = device.index
     if isinstance(device, str) and device.startswith("cuda:"):
         device = int(device[5:])
+    if not nvmlInit:
+        return 0.0
     try:
-        pynvml.nvmlInit()
-        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
-        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        nvmlInit()
+        handle = nvmlDeviceGetHandleByIndex(device)
+        info = nvmlDeviceGetMemoryInfo(handle)
         return info.used / 1024.0**3
     except NVMLError:
         return 0.0
diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
index 0bee4dd5cf..4ed16e3582 100644
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -179,7 +179,7 @@ def load_tokenized_prepared_datasets(
                 + "|".join(
                     sorted(
                         [
-                            f"{d.path}: {d.type}: {d.shards}: {d.conversation}{d.split}"
+                            f"{d.path}:{d.type}:{d.shards}:{d.conversation}{d.split}"
                             for d in cfg_datasets
                         ]
                     )
diff --git a/tests/conftest.py b/tests/conftest.py
index a8bf03ac01..4479e676f4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -14,6 +14,12 @@ def download_smollm2_135m_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="session", autouse=True)
+def download_llama_68m_random_model():
+    # download the model
+    snapshot_download("JackFram/llama-68m")
+
+
 @pytest.fixture(scope="session", autouse=True)
 def download_qwen_2_5_half_billion_model():
     # download the model
@@ -22,18 +28,26 @@ def download_qwen_2_5_half_billion_model():

 @pytest.fixture(scope="session", autouse=True)
 def download_tatsu_lab_alpaca_dataset():
-    # download the model
+    # download the dataset
     snapshot_download("tatsu-lab/alpaca", repo_type="dataset")


 @pytest.fixture(scope="session", autouse=True)
 def download_mhenrichsen_alpaca_2k_dataset():
-    # download the model
+    # download the dataset
     snapshot_download("mhenrichsen/alpaca_2k_test", repo_type="dataset")


+@pytest.fixture(scope="session", autouse=True)
+def download_mhenrichsen_alpaca_2k_w_revision_dataset():
+    # download the dataset
+    snapshot_download(
+        "mhenrichsen/alpaca_2k_test", repo_type="dataset", revision="d05c1cb"
+    )
+
+
 def download_mlabonne_finetome_100k_dataset():
-    # download the model
+    # download the dataset
     snapshot_download("mlabonne/FineTome-100k", repo_type="dataset")


diff --git a/tests/core/test_trainer_builder.py b/tests/core/test_trainer_builder.py
index 82455922ef..558d3cb956 100644
--- a/tests/core/test_trainer_builder.py
+++ b/tests/core/test_trainer_builder.py
@@ -14,9 +14,7 @@
 def fixture_cfg():
     cfg = DictDefault(
         {
-            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
-            "model_type": "AutoModelForCausalLM",
-            "tokenizer_type": "LlamaTokenizer",
+            "base_model": "HuggingFaceTB/SmolLM2-135M",
             "micro_batch_size": 1,
             "gradient_accumulation_steps": 1,
             "learning_rate": 0.00005,
@@ -33,6 +31,9 @@ def fixture_cfg():
             "dataloader_num_workers": 1,
             "dataloader_pin_memory": True,
             "model_config_type": "llama",
+            "special_tokens": {
+                "pad_token": "<|endoftext|>",
+            },
         }
     )

diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
index 7ca1c08365..76ea1a9348 100644
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -51,11 +51,11 @@ def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_ste
                 "flash_attn_cross_entropy": True,
                 "load_in_8bit": True,
                 "adapter": "lora",
-                "lora_r": 32,
-                "lora_alpha": 64,
+                "lora_r": 8,
+                "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.2,
+                "val_set_size": 0.05,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py
index c0e791f38a..44d3d9e837 100644
--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -29,23 +29,24 @@ class TestResumeLlama(unittest.TestCase):
     """

     @with_temp_dir
-    def test_resume_qlora_packed(self, temp_dir):
+    def test_resume_lora_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sequence_len": 1024,
                 "sample_packing": True,
                 "flash_attention": True,
-                "load_in_4bit": True,
"qlora", - "lora_r": 32, - "lora_alpha": 64, + "load_in_8bit": True, + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.1, - "special_tokens": {}, + "val_set_size": 0.01, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, "datasets": [ { "path": "vicgalle/alpaca-gpt4", @@ -57,11 +58,11 @@ def test_resume_qlora_packed(self, temp_dir): "gradient_accumulation_steps": 1, "output_dir": temp_dir, "learning_rate": 0.00001, - "optimizer": "adamw_torch", + "optimizer": "adamw_8bit", "lr_scheduler": "cosine", - "save_steps": 10, + "save_steps": 3, "save_total_limit": 5, - "max_steps": 40, + "max_steps": 15, "use_tensorboard": True, } ) @@ -77,7 +78,7 @@ def test_resume_qlora_packed(self, temp_dir): resume_cfg = cfg | DictDefault( { - "resume_from_checkpoint": f"{temp_dir}/checkpoint-30/", + "resume_from_checkpoint": f"{temp_dir}/checkpoint-9/", } ) normalize_config(resume_cfg) @@ -93,4 +94,4 @@ def test_resume_qlora_packed(self, temp_dir): ) pattern = r"first_step\s+(\d+)" first_steps = int(re.findall(pattern, res.stdout)[0]) - assert first_steps == 31 + assert first_steps == 10 diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 805b150037..3d7e794f1c 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -42,7 +42,7 @@ def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing): "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.2, + "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -92,7 +92,7 @@ def test_unsloth_llama_qlora_unpacked(self, temp_dir): "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.2, + "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -146,7 +146,7 @@ def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention) "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.2, + "val_set_size": 0.05, "special_tokens": { "pad_token": "<|endoftext|>", }, diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py index af5445461c..63c46c2a2d 100644 --- a/tests/e2e/test_optimizers.py +++ b/tests/e2e/test_optimizers.py @@ -94,6 +94,7 @@ def test_adopt_adamw(self, temp_dir): }, ], "num_epochs": 1, + "max_steps": 5, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, @@ -115,7 +116,7 @@ def test_fft_schedule_free_adamw(self, temp_dir): { "base_model": "HuggingFaceTB/SmolLM2-135M", "sequence_len": 1024, - "val_set_size": 0.1, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -126,13 +127,14 @@ def test_fft_schedule_free_adamw(self, temp_dir): }, ], "num_epochs": 1, - "micro_batch_size": 4, + "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "schedule_free_adamw", "lr_scheduler": "constant", "save_safetensors": True, + "max_steps": 10, } ) # pylint: disable=duplicate-code diff --git a/tests/e2e/test_relora_llama.py b/tests/e2e/test_relora_llama.py index 4ba130c9dc..5de5db11b7 100644 --- a/tests/e2e/test_relora_llama.py +++ b/tests/e2e/test_relora_llama.py @@ -52,6 +52,7 @@ def test_relora(self, temp_dir): ], "warmup_steps": 15, "num_epochs": 2, + "max_steps": 51, # at least 2x relora_steps "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, diff --git 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f3bed00fd0..b1ecfd6d52 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -67,7 +67,7 @@ def test_load_hub(self):
     def test_load_local_hub(self):
         """Niche use case. Verify that a local copy of a hub dataset can be loaded"""
         with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
+            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
             tmp_ds_path.mkdir(parents=True, exist_ok=True)
             snapshot_download(
                 repo_id="mhenrichsen/alpaca_2k_test",
@@ -89,7 +89,7 @@ def test_load_local_hub(self):
                         "ds_type": "parquet",
                         "type": "alpaca",
                         "data_files": [
-                            "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
+                            f"{tmp_ds_path}/alpaca_2000.parquet",
                         ],
                     },
                 ],
diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py
index 8688827cec..b32cd52835 100644
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -7,7 +7,7 @@

 from axolotl.utils.callbacks.perplexity import Perplexity

-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"


 @fixture()
@@ -22,7 +22,9 @@ def model():

 @fixture()
 def tokenizer():
-    return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    tokenizer_ = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    tokenizer_.add_special_tokens({"pad_token": "<|endoftext|>"})
+    return tokenizer_


 def test_perplexity_longer_than_stride(model, metric):
@@ -33,7 +35,7 @@ def test_perplexity_longer_than_stride(model, metric):
     """
     result = metric.compute(model, [sample_text])
     ppl = result["score"]
-    assert round(ppl, 2) == 5.37
+    assert round(ppl, 2) == 7.41


 def test_perplexity_short(model, metric):
@@ -41,4 +43,4 @@ def test_perplexity_short(model, metric):
     sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
     result = metric.compute(model, [sample_text])
     ppl = result["score"]
-    assert round(ppl, 2) == 10.02
+    assert round(ppl, 2) == 10.33
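
A note on the pynvml swap above: nvidia-ml-py is NVIDIA's officially maintained NVML binding, and it installs under the same top-level `pynvml` module name as the older community package, which is why requirements.txt changes while the imports in bench.py do not. The guarded import then keeps environments without NVML (e.g. CPU-only CI runners) working. A minimal self-contained sketch of that pattern, with the helper name `gpu_memory_used_gib` chosen here purely for illustration:

```python
# Guarded NVML import: fall back to a 0.0 reading instead of crashing
# when pynvml (provided by nvidia-ml-py) is not installed.
try:
    from pynvml import (
        NVMLError,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlInit,
    )
except ImportError:
    NVMLError = nvmlDeviceGetHandleByIndex = nvmlDeviceGetMemoryInfo = nvmlInit = None


def gpu_memory_used_gib(index: int = 0) -> float:
    """Used GPU memory in GiB, or 0.0 when NVML is unavailable."""
    if nvmlInit is None:  # import failed: no NVML bindings present
        return 0.0
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(index)
        return nvmlDeviceGetMemoryInfo(handle).used / 1024.0**3
    except NVMLError:  # driver missing or bad device index
        return 0.0
```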
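The tightened numbers in test_resume.py stay mutually consistent: with `save_steps: 3` and `max_steps: 15`, checkpoints are written at steps 3, 6, 9, 12, and 15, and resuming from `checkpoint-9` makes the trainer report step 10 as its first step — the same `N + 1` relation the old config expressed as checkpoint-30 and `first_steps == 31`. A sketch of that bookkeeping (assuming the standard HF Trainer behavior of continuing on the step after the saved one):

```python
# Checkpoint schedule implied by the new resume-test config.
save_steps, max_steps = 3, 15
checkpoints = list(range(save_steps, max_steps + 1, save_steps))
assert checkpoints == [3, 6, 9, 12, 15]

resume_from = 9                        # the test resumes from checkpoint-9
first_step_after_resume = resume_from + 1
assert first_step_after_resume == 10   # matches `assert first_steps == 10`
```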