
Commit 2c948c6

Expose packed: False, set log_peak_memory_stats: True, set compile: False (#1872)

Co-authored-by: krammnic <[email protected]>
krammnic and krammnic authored Oct 26, 2024
1 parent 33b8143 commit 2c948c6
Showing 93 changed files with 275 additions and 117 deletions.
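The same three edits repeat across the recipe configs below: a `packed: False` flag is exposed under each `dataset` section, `log_peak_memory_stats` is switched from `False` to `True`, and the full/LoRA recipes that lacked it gain an explicit `compile: False`. As a rough sketch of the resulting pattern (assembled from the hunks below for illustration rather than copied from any single file), the touched parts of a config end up looking like this:

# Dataset
dataset:
  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset

# ... unrelated keys omitted ...

gradient_accumulation_steps: 1
compile: False

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: True

Because these keys are now explicit in the configs, they can also be overridden on the command line in the usual torchtune way, for example `tune run full_finetune_single_device --config llama2/7B_full_low_memory dataset.packed=True` (the recipe and config names here are illustrative, not part of this commit).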
recipes/configs/code_llama2/7B_full_low_memory.yaml (3 additions, 1 deletion)

@@ -45,7 +45,9 @@ resume_from_checkpoint: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset

seed: null
shuffle: True

@@ -75,4 +77,4 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/code_llama2/7B_lora_single_device.yaml (3 additions, 1 deletion)

@@ -49,7 +49,9 @@ save_adapter_weights_only: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset

seed: null
shuffle: True

@@ -84,7 +86,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Showcase the usage of PyTorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/code_llama2/7B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -49,6 +49,7 @@ save_adapter_weights_only: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -84,7 +85,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/dev/8B_full_experimental.yaml (3 additions, 1 deletion)

@@ -26,6 +26,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -57,7 +58,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
-
+compile: False

# Training env
device: cuda

@@ -78,3 +79,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama3-finetune
log_every_n_steps: null
+log_peak_memory_stats: True
recipes/configs/gemma/2B_full.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -54,6 +55,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -70,4 +72,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/2B_lora.yaml (3 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -66,6 +67,7 @@ batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -82,4 +84,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/2B_lora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -83,7 +84,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/2B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -83,7 +84,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/7B_full.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -56,6 +57,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -72,4 +74,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/7B_lora.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -68,6 +69,7 @@ batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -84,4 +86,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/7B_lora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -85,7 +86,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/7B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -85,7 +86,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/llama2/13B_full.yaml (3 additions, 1 deletion)

@@ -43,6 +43,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -58,6 +59,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -74,4 +76,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/13B_lora.yaml (3 additions, 1 deletion)

@@ -52,6 +52,7 @@ tokenizer:

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -74,14 +75,15 @@ loss:
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 16
+compile: False

# Logging
output_dir: /tmp/lora_finetune_output
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/13B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -47,6 +47,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -77,7 +78,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/70B_lora.yaml (2 additions, 1 deletion)

@@ -52,6 +52,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -81,7 +82,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/70B_qlora.yaml (2 additions, 1 deletion)

@@ -57,6 +57,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
  train_on_input: True
seed: null

@@ -91,7 +92,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/7B_full.yaml (3 additions, 2 deletions)

@@ -26,6 +26,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -57,7 +58,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
-
+compile: False

# Training env
device: cuda

@@ -74,4 +75,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/7B_full_low_memory.yaml (2 additions, 1 deletion)

@@ -28,6 +28,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -79,4 +80,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/7B_lora.yaml (4 additions, 3 deletions)

@@ -49,6 +49,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -78,7 +79,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda

@@ -92,14 +93,14 @@ profiler:

  enabled: False

-  #Output directory of trace artifacts
+  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  #`torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

-  #trace options passed to `torch.profiler.profile`
+  # trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
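The final hunk above only reflows two comments inside the profiler block. For context, that block in torchtune single-device and LoRA configs of this vintage typically looks like the sketch below; the `_component_` path and the keys not shown in the diff (`with_flops`, `wait_steps`, `warmup_steps`, `active_steps`, `num_cycles`) are assumptions based on sibling torchtune configs rather than lines from this commit:

# Showcase the usage of PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.training.setup_torch_profiler  # assumed component path
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False  # assumed default

  # `torch.profiler.schedule` options (assumed values)
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1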