Preliminary MoE extension #541

Closed
wants to merge 18 commits
107 changes: 107 additions & 0 deletions configs/OLMoE-200m-80m.yml
@@ -0,0 +1,107 @@
run_name: OLMoE
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: olmoe
  group: null

model:
  d_model: 512
  n_heads: 8
  n_layers: 10
  mlp_ratio: 4 # 4 vs 8 (for swiglu)
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: moe
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: gelu # gelu vs swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: normal # mitchell vs normal

compile: null # causes instability on AMD GPUs

optimizer:
  name: adamw
  learning_rate: 4.0e-4
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: cosine_with_warmup
  t_warmup: 2000
  alpha_f: 0.1

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

save_folder: /data/niklas/olmoe
save_overwrite: false
# Sharded checkpoints (best for restarts)
save_interval: 1000
save_num_checkpoints_to_keep: 9
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

max_duration: 10e9T # 10B tokens
global_train_batch_size: 2048
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}

data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 16
  persistent_workers: true
  timeout: 0
  paths:
    - /data/niklas/llm/data/part-000-00000.npy
    - /data/niklas/llm/data/part-000-00001.npy
    - /data/niklas/llm/data/part-001-00000.npy
    - /data/niklas/llm/data/part-002-00000.npy
    - /data/niklas/llm/data/part-003-00000.npy
    - /data/niklas/llm/data/part-004-00000.npy
    - /data/niklas/llm/data/part-004-00001.npy
    - /data/niklas/llm/data/part-005-00000.npy
    - /data/niklas/llm/data/part-005-00001.npy
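
The only architectural switch between this config and the dense baseline below is block_type: moe. As a rough illustration of what such a block typically computes, here is a minimal, hypothetical sketch of a token-level top-k routed mixture-of-experts MLP at this config's shape; the expert count, k, and all class and argument names are illustrative assumptions and are not taken from this PR's implementation.

# Hypothetical sketch of the kind of FFN that block_type: moe could select:
# a token-level top-k routed mixture-of-experts MLP. n_experts and top_k are
# assumptions for illustration, not values defined by this PR.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoEMLP(nn.Module):
    def __init__(self, d_model=512, mlp_ratio=4, n_experts=8, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(d_model, n_experts, bias=False)
        self.experts = nn.ModuleList(
            nn.Sequential(
                nn.Linear(d_model, mlp_ratio * d_model, bias=False),
                nn.GELU(),  # activation_type: gelu
                nn.Linear(mlp_ratio * d_model, d_model, bias=False),
            )
            for _ in range(n_experts)
        )

    def forward(self, x):                          # x: (batch, seq, d_model)
        scores = self.router(x)                    # (batch, seq, n_experts)
        weights, idx = scores.topk(self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)       # normalize over the selected experts
        out = torch.zeros_like(x)
        for slot in range(self.top_k):             # naive loops; real kernels batch tokens by expert
            for e, expert in enumerate(self.experts):
                mask = idx[..., slot] == e
                if mask.any():
                    out[mask] += weights[..., slot][mask].unsqueeze(-1) * expert(x[mask])
        return out

A production implementation would dispatch tokens to experts with batched, expert-parallel kernels (e.g. via a library such as megablocks) rather than looping over experts, but the routing logic is the same idea.
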
107 changes: 107 additions & 0 deletions configs/OLMoE-200m.yml
@@ -0,0 +1,107 @@
run_name: OLMoE
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: olmoe
  group: null

model:
  d_model: 896
  n_heads: 14
  n_layers: 16
  mlp_ratio: 4 # 4 vs 8 (for swiglu)
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: gelu # gelu vs swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: normal # mitchell vs normal

compile: null # causes instability on AMD GPUs

optimizer:
  name: adamw
  learning_rate: 4.0e-4
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: cosine_with_warmup
  t_warmup: 2000
  alpha_f: 0.1

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

save_folder: /data/niklas/olmoe
save_overwrite: false
# Sharded checkpoints (best for restarts)
save_interval: 1000
save_num_checkpoints_to_keep: 9
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

max_duration: 10e9T # 10B tokens
global_train_batch_size: 2048
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}

data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 16
  persistent_workers: true
  timeout: 0
  paths:
    - /data/niklas/llm/data/part-000-00000.npy
    - /data/niklas/llm/data/part-000-00001.npy
    - /data/niklas/llm/data/part-001-00000.npy
    - /data/niklas/llm/data/part-002-00000.npy
    - /data/niklas/llm/data/part-003-00000.npy
    - /data/niklas/llm/data/part-004-00000.npy
    - /data/niklas/llm/data/part-004-00001.npy
    - /data/niklas/llm/data/part-005-00000.npy
    - /data/niklas/llm/data/part-005-00001.npy
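
For this dense baseline (block_type: sequential), a back-of-the-envelope count from the config values lands close to the 200M in the filename. The sketch below assumes standard Q/K/V/output attention projections (4 * d_model^2 weights per layer), a GELU MLP with hidden size mlp_ratio * d_model (2 * mlp_ratio * d_model^2 weights per layer), tied input/output embeddings counted once, no biases, and no affine layer-norm parameters, all consistent with the flags above.

# Rough parameter count for configs/OLMoE-200m.yml (dense baseline).
# Assumptions: standard attention (Q, K, V, output projections), GELU MLP with
# hidden size mlp_ratio * d_model, tied embeddings, no biases, and no affine
# layer-norm weights (matching include_bias / *_with_affine: false above).
d_model, n_layers, mlp_ratio = 896, 16, 4
embedding_size = 50304  # padded vocab size actually allocated for the embedding

attn_per_layer = 4 * d_model ** 2             # Q, K, V, output projections
mlp_per_layer = 2 * mlp_ratio * d_model ** 2  # up projection + down projection
embedding = embedding_size * d_model          # counted once (weight_tying: true)

total = n_layers * (attn_per_layer + mlp_per_layer) + embedding
print(f"{total / 1e6:.0f}M parameters")       # ~199M, i.e. the "200m" in the filename
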
107 changes: 107 additions & 0 deletions configs/OLMoE-600m-200m.yml
@@ -0,0 +1,107 @@
run_name: OLMoE
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: olmoe
  group: null

model:
  d_model: 768
  n_heads: 12
  n_layers: 14
  mlp_ratio: 4 # 4 vs 8 (for swiglu)
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: moe
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: gelu # gelu vs swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: normal # mitchell vs normal

compile: null # causes instability on AMD GPUs

optimizer:
  name: adamw
  learning_rate: 4.0e-4
  weight_decay: 0.1
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: cosine_with_warmup
  t_warmup: 2000
  alpha_f: 0.1

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

save_folder: /data/niklas/olmoe
save_overwrite: false
# Sharded checkpoints (best for restarts)
save_interval: 1000
save_num_checkpoints_to_keep: 9
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

max_duration: 10e9T # 10B tokens
global_train_batch_size: 2048
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}

data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 16
  persistent_workers: true
  timeout: 0
  paths:
    - /data/niklas/llm/data/part-000-00000.npy
    - /data/niklas/llm/data/part-000-00001.npy
    - /data/niklas/llm/data/part-001-00000.npy
    - /data/niklas/llm/data/part-002-00000.npy
    - /data/niklas/llm/data/part-003-00000.npy
    - /data/niklas/llm/data/part-004-00000.npy
    - /data/niklas/llm/data/part-004-00001.npy
    - /data/niklas/llm/data/part-005-00000.npy
    - /data/niklas/llm/data/part-005-00001.npy
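
All three configs share the same token budget and batch settings, so the implied number of optimizer steps and the per-device gradient accumulation follow directly from max_duration, max_sequence_length, global_train_batch_size, and device_train_microbatch_size. The GPU count in the sketch below is a hypothetical example; the configs do not specify it.

# Step-count and gradient-accumulation arithmetic shared by all three configs.
tokens_budget = 10e9   # max_duration: 10e9T
seq_len = 2048         # max_sequence_length
global_batch = 2048    # global_train_batch_size (sequences per optimizer step)
microbatch = 8         # device_train_microbatch_size (sequences per device per forward pass)

tokens_per_step = global_batch * seq_len
total_steps = tokens_budget / tokens_per_step
print(f"{tokens_per_step / 1e6:.1f}M tokens per step, ~{total_steps:.0f} optimizer steps")
# -> 4.2M tokens per step, ~2384 optimizer steps

n_gpus = 64            # hypothetical cluster size, not part of the config
accumulation = global_batch // (n_gpus * microbatch)
print(f"{accumulation} gradient-accumulation microbatches per device per step")
# -> 4
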