
Commit 2c948c6

Expose packed: False, set log_peak_memory_stats: True, set compile: False (#1872)

Co-authored-by: krammnic <[email protected]>
krammnic and krammnic authored Oct 26, 2024
1 parent 33b8143 commit 2c948c6
Showing 93 changed files with 275 additions and 117 deletions.
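The same three edits repeat across the recipe configs below: a `packed: False` flag is exposed under each `dataset` section, `log_peak_memory_stats` is switched from `False` to `True`, and the full/LoRA recipes that lacked it gain an explicit `compile: False`. As a rough sketch of the resulting pattern (assembled from the hunks below for illustration rather than copied from any single file), the touched parts of a config end up looking like this:

# Dataset
dataset:
  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset

# ... unrelated keys omitted ...

gradient_accumulation_steps: 1
compile: False

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: True

Because these keys are now explicit in the configs, they can also be overridden on the command line in the usual torchtune way, for example `tune run full_finetune_single_device --config llama2/7B_full_low_memory dataset.packed=True` (the recipe and config names here are illustrative, not part of this commit).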
recipes/configs/code_llama2/7B_full_low_memory.yaml (3 additions, 1 deletion)

@@ -45,7 +45,9 @@ resume_from_checkpoint: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset

seed: null
shuffle: True

@@ -75,4 +77,4 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/code_llama2/7B_lora_single_device.yaml (3 additions, 1 deletion)

@@ -49,7 +49,9 @@ save_adapter_weights_only: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset

seed: null
shuffle: True

@@ -84,7 +86,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Showcase the usage of PyTorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/code_llama2/7B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -49,6 +49,7 @@ save_adapter_weights_only: False

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -84,7 +85,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/dev/8B_full_experimental.yaml (3 additions, 1 deletion)

@@ -26,6 +26,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -57,7 +58,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
-
+compile: False

# Training env
device: cuda

@@ -78,3 +79,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama3-finetune
log_every_n_steps: null
+log_peak_memory_stats: True
recipes/configs/gemma/2B_full.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -54,6 +55,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -70,4 +72,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/2B_lora.yaml (3 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -66,6 +67,7 @@ batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -82,4 +84,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/2B_lora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -83,7 +84,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/2B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -83,7 +84,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/7B_full.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -56,6 +57,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -72,4 +74,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/7B_lora.yaml (3 additions, 1 deletion)

@@ -23,6 +23,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -68,6 +69,7 @@ batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -84,4 +86,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/gemma/7B_lora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -85,7 +86,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/gemma/7B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -22,6 +22,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -85,7 +86,7 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
recipes/configs/llama2/13B_full.yaml (3 additions, 1 deletion)

@@ -43,6 +43,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -58,6 +59,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
+compile: False

# Training env
device: cuda

@@ -74,4 +76,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/13B_lora.yaml (3 additions, 1 deletion)

@@ -52,6 +52,7 @@ tokenizer:

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -74,14 +75,15 @@ loss:
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 16
+compile: False

# Logging
output_dir: /tmp/lora_finetune_output
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/13B_qlora_single_device.yaml (2 additions, 1 deletion)

@@ -47,6 +47,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -77,7 +78,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/70B_lora.yaml (2 additions, 1 deletion)

@@ -52,6 +52,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -81,7 +82,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/70B_qlora.yaml (2 additions, 1 deletion)

@@ -57,6 +57,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
  train_on_input: True
seed: null

@@ -91,7 +92,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda
recipes/configs/llama2/7B_full.yaml (3 additions, 2 deletions)

@@ -26,6 +26,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -57,7 +58,7 @@ loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
-
+compile: False

# Training env
device: cuda

@@ -74,4 +75,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/7B_full_low_memory.yaml (2 additions, 1 deletion)

@@ -28,6 +28,7 @@ tokenizer:

# Dataset
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

@@ -79,4 +80,4 @@ metric_logger:
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
recipes/configs/llama2/7B_lora.yaml (4 additions, 3 deletions)

@@ -49,6 +49,7 @@ save_adapter_weights_only: False

# Dataset and Sampler
dataset:
+  packed: False # Set to true for great speed ups
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

@@ -78,7 +79,7 @@ metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True

# Environment
device: cuda

@@ -92,14 +93,14 @@ profiler:

  enabled: False

-  #Output directory of trace artifacts
+  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  #`torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

-  #trace options passed to `torch.profiler.profile`
+  # trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
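The final hunk above only reflows two comments inside the profiler block. For context, that block in torchtune single-device and LoRA configs of this vintage typically looks like the sketch below; the `_component_` path and the keys not shown in the diff (`with_flops`, `wait_steps`, `warmup_steps`, `active_steps`, `num_cycles`) are assumptions based on sibling torchtune configs rather than lines from this commit:

# Showcase the usage of PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.training.setup_torch_profiler  # assumed component path
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False  # assumed default

  # `torch.profiler.schedule` options (assumed values)
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1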