cleanup bert/squad #22

Merged: 3 commits, May 19, 2020
6 changes: 5 additions & 1 deletion BingBertSquad/run_squad_deepspeed.sh
@@ -12,10 +12,14 @@ OUTPUT_DIR=$4
LR=${5:-0.00003}
SEED=${6:-12345}
MASTER_PORT=${7:-29500}
echo "lr is ${LR}"
echo "seed is $SEED"
echo "master port is $MASTER_PORT"

# Force deepspeed to run with only local node
NUM_NODES=1
HOSTFILE=/dev/null

NGPU=$((NGPU_PER_NODE*NUM_NODES))
EFFECTIVE_BATCH_SIZE=24
MAX_GPU_BATCH_SIZE=3
@@ -29,6 +33,7 @@ JOB_NAME="deepspeed_${NGPU}GPUs_${EFFECTIVE_BATCH_SIZE}batch_size"
config_json=deepspeed_bsz24_config.json
run_cmd="deepspeed --num_nodes ${NUM_NODES} --num_gpus ${NGPU_PER_NODE} \
--master_port=${MASTER_PORT} \
--hostfile ${HOSTFILE} \
nvidia_run_squad_deepspeed.py \
--bert_model bert-large-uncased \
--do_train \
@@ -48,7 +53,6 @@ run_cmd="deepspeed --num_nodes ${NUM_NODES} --num_gpus ${NGPU_PER_NODE} \
--fp16 \
--deepspeed \
--deepspeed_config ${config_json} \
--deepspeed_transformer_kernel \
--model_file $MODEL_FILE \
--seed ${SEED} \
--preln \
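The constants visible above (EFFECTIVE_BATCH_SIZE=24, MAX_GPU_BATCH_SIZE=3) interact with the new single-node setting. As an illustration only of how such constants are typically reconciled in DeepSpeed-style launch scripts (the script's own derivation is in the portion not shown here, and NGPU_PER_NODE below is a hypothetical value of its first argument):

# Illustration only: effective batch = micro-batch per GPU
#                     * gradient-accumulation steps * number of GPUs.
NGPU_PER_NODE = 4          # hypothetical; the real value comes from the script's $1
NUM_NODES = 1              # forced to 1 by this change
EFFECTIVE_BATCH_SIZE = 24
MAX_GPU_BATCH_SIZE = 3

ngpu = NGPU_PER_NODE * NUM_NODES                        # 4
per_gpu_batch = EFFECTIVE_BATCH_SIZE // ngpu            # 6 examples per GPU per step
grad_accum_steps = per_gpu_batch // MAX_GPU_BATCH_SIZE  # 2 micro-batches of 3
print(ngpu, per_gpu_batch, grad_accum_steps)            # 4 6 2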
42 changes: 3 additions & 39 deletions BingBertSquad/turing/nvidia_modeling.py
@@ -499,46 +499,10 @@ class BertEncoder(nn.Module):
def __init__(self, config, args):
super(BertEncoder, self).__init__()

if args.deepspeed_transformer_kernel:
from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig
layer = BertLayer(config)
self.layer = nn.ModuleList(
[copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
ds_config = DeepSpeedConfig(args.deepspeed_config)
else:
raise RuntimeError('deepspeed_config is not found in args.')

cuda_config = DeepSpeedTransformerConfig(
batch_size=ds_config.train_micro_batch_size_per_gpu,
max_seq_length=args.max_seq_length,
hidden_size=config.hidden_size,
heads=config.num_attention_heads,
attn_dropout_ratio=config.attention_probs_dropout_prob,
hidden_dropout_ratio=config.hidden_dropout_prob,
num_hidden_layers=config.num_hidden_layers,
initializer_range=config.initializer_range,
seed=args.seed,
fp16=ds_config.fp16_enabled,
pre_layer_norm=False)

self.layer = nn.ModuleList([
copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config))
for i in range(config.num_hidden_layers)
])
else:
layer = BertLayer(config)
self.layer = nn.ModuleList([
copy.deepcopy(layer) for _ in range(config.num_hidden_layers)
])

# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
def forward(self,
hidden_states,
attention_mask,
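Net effect of this hunk, reconstructed from the diff rather than quoted from the post-merge file: BertEncoder.__init__ no longer imports or configures the DeepSpeed transformer kernel and always builds the stock layer stack (BertLayer is defined earlier in nvidia_modeling.py):

import copy
import torch.nn as nn

class BertEncoder(nn.Module):
    def __init__(self, config, args):
        super(BertEncoder, self).__init__()
        # The DeepSpeedTransformerLayer branch is gone; every layer is the
        # plain PyTorch BertLayer defined earlier in this module.
        layer = BertLayer(config)
        self.layer = nn.ModuleList(
            [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

The two modelingpreln.py variants below receive the same simplification; their removed branches additionally carried the kernel's pre_layer_norm flag and, in bing_bert/nvidia/modelingpreln.py, the attention-dropout, invertible-normalization, and GELU checkpointing options.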
42 changes: 3 additions & 39 deletions BingBertSquad/turing/nvidia_modelingpreln.py
@@ -519,46 +519,10 @@ def __init__(self, config, args):
#Added later to make it similar to GPT-2
self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

if args.deepspeed_transformer_kernel:
from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig
layer = BertLayer(config)
self.layer = nn.ModuleList(
[copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
ds_config = DeepSpeedConfig(args.deepspeed_config)
else:
raise RuntimeError('deepspeed_config is not found in args.')

cuda_config = DeepSpeedTransformerConfig(
batch_size=ds_config.train_micro_batch_size_per_gpu,
max_seq_length=args.max_seq_length,
hidden_size=config.hidden_size,
heads=config.num_attention_heads,
attn_dropout_ratio=config.attention_probs_dropout_prob,
hidden_dropout_ratio=config.hidden_dropout_prob,
num_hidden_layers=config.num_hidden_layers,
initializer_range=config.initializer_range,
seed=args.seed,
fp16=ds_config.fp16_enabled,
pre_layer_norm=True)

self.layer = nn.ModuleList([
copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config))
for i in range(config.num_hidden_layers)
])
else:
layer = BertLayer(config)
self.layer = nn.ModuleList([
copy.deepcopy(layer) for _ in range(config.num_hidden_layers)
])

# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
def forward(self,
hidden_states,
attention_mask,
5 changes: 0 additions & 5 deletions BingBertSquad/utils.py
@@ -203,11 +203,6 @@ def get_argument_parser():
default=100,
help='Interval to print training details.')

parser.add_argument('--deepspeed_transformer_kernel',
default=False,
action='store_true',
help='Use DeepSpeed transformer kernel to accelerate.')

return parser


60 changes: 30 additions & 30 deletions bing_bert/bert_large_lamb.json
@@ -20,38 +20,38 @@
"pretrain_dataset": true,
"pretrain_type": "wiki_bc"
},
"mixed_seq_datasets": {
"128": {
"wiki_pretrain_dataset": "bnorick_format/128/wiki_pretrain",
"bc_pretrain_dataset": "bnorick_format/128/bookcorpus_pretrain"
},
"512": {
"wiki_pretrain_dataset": "bnorick_format/512/wiki_pretrain",
"bc_pretrain_dataset": "bnorick_format/512/bookcorpus_pretrain"
}
}
"mixed_seq_datasets": {
"128": {
"wiki_pretrain_dataset": "bnorick_format/128/wiki_pretrain",
"bc_pretrain_dataset": "bnorick_format/128/bookcorpus_pretrain"
},
"512": {
"wiki_pretrain_dataset": "bnorick_format/512/wiki_pretrain",
"bc_pretrain_dataset": "bnorick_format/512/bookcorpus_pretrain"
}
}
},
"mixed_seq_training": {
"128": {
"num_epochs": 150,
"warmup_proportion": 0.06,
"learning_rate": 11e-3,
"num_workers": 0,
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 250,
"total_training_steps": 7500
},
"512": {
"num_epochs": 160,
"warmup_proportion": 0.02,
"learning_rate": 2e-3,
"num_workers": 0,
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 150,
"total_training_steps": 7500
}
"128": {
"num_epochs": 150,
"warmup_proportion": 0.06,
"learning_rate": 11e-3,
"num_workers": 0,
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 250,
"total_training_steps": 7500
},
"512": {
"num_epochs": 160,
"warmup_proportion": 0.02,
"learning_rate": 2e-3,
"num_workers": 0,
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 150,
"total_training_steps": 7500
}
},
"validation": {
"path": "validation_set/"
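The JSON hunk above is a pure re-indentation; the values are unchanged. For context, a minimal sketch of reading the per-sequence-length settings from this config (illustrative only, not the repo's actual deepspeed_train.py loading code; the path is assumed relative to the repo root):

import json

# Illustrative only: pick the settings block for the current pretraining phase.
with open("bing_bert/bert_large_lamb.json") as f:
    cfg = json.load(f)

phase = cfg["mixed_seq_training"]["128"]   # or "512" for the second phase
print(phase["learning_rate"], phase["num_epochs"], phase["total_training_steps"])
# 0.011 150 7500 for the seq-128 phase shown above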
5 changes: 2 additions & 3 deletions bing_bert/ds_train_bert_bsz32k_seq512.sh
@@ -13,18 +13,17 @@ echo "checkpoint id: $CHECKPOINT_EPOCH150_NAME"

mkdir -p $OUTPUT_DIR

NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \
deepspeed ${base_dir}/deepspeed_train.py \
--cf ${base_dir}/bert_large_lamb.json \
--max_seq_length 512 \
--output_dir $OUTPUT_DIR \
--print_steps 100 \
--deepspeed \
--job_name $JOB_NAME \
--deepspeed_config ${base_dir}/deepspeed_bsz32K_lamb_config_seq512.json \
--deepspeed_config ${base_dir}/deepspeed_bsz32k_lamb_config_seq512.json \
--data_path_prefix /data/bert \
--rewarmup \
--lr_schedule "EE" \
--attention_dropout_checkpoint \
--lr_offset 0.0 \
--load_training_checkpoint ${CHECKPOINT_BASE_PATH} \
--load_checkpoint_id ${CHECKPOINT_EPOCH150_NAME} \
@@ -17,6 +17,6 @@ NCCL_TREE_THRESHOLD=0 deepspeed ${base_dir}/deepspeed_train.py \
--lr_schedule "EE" \
--lr_offset 10e-4 \
--job_name $JOB_NAME \
--deepspeed_config ${base_dir}/deepspeed_bsz64K_lamb_config_seq128.json \
--deepspeed_config ${base_dir}/deepspeed_bsz64k_lamb_config_seq128.json \
--data_path_prefix /data/bert \
&> ${JOB_NAME}.log
47 changes: 3 additions & 44 deletions bing_bert/nvidia/modelingpreln.py
@@ -516,51 +516,10 @@ def __init__(self, config, args):
#Added later to make it similar to GPT-2
self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

if args.deepspeed_transformer_kernel:
from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig
layer = BertLayer(config)
self.layer = nn.ModuleList(
[copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
ds_config = DeepSpeedConfig(args.deepspeed_config)
else:
raise RuntimeError('deepspeed_config is not found in args.')

cuda_config = DeepSpeedTransformerConfig(
batch_size=ds_config.train_micro_batch_size_per_gpu,
max_seq_length=args.max_seq_length,
hidden_size=config.hidden_size,
heads=config.num_attention_heads,
attn_dropout_ratio=config.attention_probs_dropout_prob,
hidden_dropout_ratio=config.hidden_dropout_prob,
num_hidden_layers=config.num_hidden_layers,
initializer_range=config.initializer_range,
local_rank=args.local_rank
if hasattr(args, 'local_rank') else -1,
seed=args.seed,
fp16=ds_config.fp16_enabled,
pre_layer_norm=True,
attn_dropout_checkpoint=args.attention_dropout_checkpoint,
normalize_invertible=args.normalize_invertible,
gelu_checkpoint=args.gelu_checkpoint)

self.layer = nn.ModuleList([
copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config))
for i in range(config.num_hidden_layers)
])
else:
layer = BertLayer(config)
self.layer = nn.ModuleList([
copy.deepcopy(layer) for _ in range(config.num_hidden_layers)
])

# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
def forward(self,
hidden_states,
attention_mask,
27 changes: 0 additions & 27 deletions bing_bert/utils.py
@@ -138,11 +138,6 @@ def get_argument_parser():
help=
"Path to prefix data loading, helpful for AML and other environments")

parser.add_argument('--deepspeed_transformer_kernel',
default=False,
action='store_true',
help='Use DeepSpeed transformer kernel to accelerate.')

parser.add_argument(
'--ckpt_to_save',
nargs='+',
@@ -151,28 +146,6 @@
'Indicates which checkpoints to save, e.g. --ckpt_to_save 160 161, by default all checkpoints are saved.'
)

parser.add_argument(
'--attention_dropout_checkpoint',
default=False,
action='store_true',
help=
'Use DeepSpeed transformer kernel memory optimization to checkpoint dropout output.'
)
parser.add_argument(
'--normalize_invertible',
default=False,
action='store_true',
help=
'Use DeepSpeed transformer kernel memory optimization to perform invertible normalize backpropagation.'
)
parser.add_argument(
'--gelu_checkpoint',
default=False,
action='store_true',
help=
'Use DeepSpeed transformer kernel memory optimization to checkpoint GELU activation.'
)

return parser


Expand Down