Merge pull request #2 from microsoft/eltonz/apply-cuda-ext
apply transformer kernel
RezaYazdaniAminabadi authored May 8, 2020
2 parents fdec44b + 8955cc3 commit 7acc3ce
Showing 17 changed files with 1,538 additions and 1,788 deletions.
2 changes: 1 addition & 1 deletion BingBertSquad/deepspeed_bsz24_config.json
@@ -5,7 +5,7 @@
"optimizer": {
"type": "Adam",
"params": {
"lr": 3e-5,
"lr": 1e-5,
"weight_decay": 0.0,
"bias_correction": false
}
2 changes: 1 addition & 1 deletion BingBertSquad/nvidia_run_squad_deepspeed.py
@@ -806,7 +806,7 @@ def main():
# Padding for divisibility by 8
if bert_config.vocab_size % 8 != 0:
bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
model = BertForQuestionAnswering(bert_config)
model = BertForQuestionAnswering(bert_config, args)
print("VOCAB SIZE:", bert_config.vocab_size)
if args.model_file is not "0":
logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")
1 change: 1 addition & 0 deletions BingBertSquad/run_squad_deepspeed.sh
@@ -42,6 +42,7 @@ run_cmd="deepspeed --num_nodes ${NUM_NODES} --num_gpus ${NGPU_PER_NODE} \
--fp16 \
--deepspeed \
--deepspeed_config ${config_json} \
--deepspeed_transformer_kernel \
--model_file $MODEL_FILE
"
echo ${run_cmd}
37 changes: 30 additions & 7 deletions BingBertSquad/turing/nvidia_modeling.py
@@ -459,10 +459,33 @@ def forward(self, hidden_states, attention_mask):
return layer_output

class BertEncoder(nn.Module):
def __init__(self, config):
def __init__(self, config, args):
super(BertEncoder, self).__init__()
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

if args.deepspeed_transformer_kernel:
from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig

if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
ds_config = DeepSpeedConfig(args.deepspeed_config)
else:
raise RuntimeError('deepspeed_config is not found in args.')

cuda_config = DeepSpeedTransformerConfig(batch_size = ds_config.train_micro_batch_size_per_gpu,
max_seq_length = args.max_seq_length,
hidden_size = config.hidden_size,
heads = config.num_attention_heads,
attn_dropout_ratio = config.attention_probs_dropout_prob,
hidden_dropout_ratio = config.hidden_dropout_prob,
num_hidden_layers = config.num_hidden_layers,
initializer_range = config.initializer_range,
seed = args.seed,
fp16 = ds_config.fp16_enabled,
pre_layer_norm=False)

self.layer = nn.ModuleList([copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config)) for i in range(config.num_hidden_layers)])
else:
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
@@ -794,10 +817,10 @@ class BertModel(BertPreTrainedModel):
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, args):
super(BertModel, self).__init__(config)
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.encoder = BertEncoder(config, args)
self.pooler = BertPooler(config)
self.apply(self.init_bert_weights)

@@ -1306,9 +1329,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
def __init__(self, config, args):
super(BertForQuestionAnswering, self).__init__(config)
self.bert = BertModel(config)
self.bert = BertModel(config, args)
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.qa_outputs = nn.Linear(config.hidden_size, 2)
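
The core of this commit is the hunk above: when --deepspeed_transformer_kernel is set, BertEncoder builds its layer stack from DeepSpeed's fused transformer kernel instead of the stock BertLayer. The sketch below isolates that construction; the keyword arguments and layer indexing come from the hunk itself, while the concrete values (BERT-large shapes, batch size 64, sequence length 128) are illustrative assumptions rather than numbers from the commit.

```python
# A minimal sketch of the kernel construction used in BertEncoder above.
# The shape/batch values are illustrative assumptions; only the keyword
# arguments and the per-layer construction come from the hunk itself.
import copy
import torch.nn as nn
from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig

num_layers = 24  # e.g. BERT-large

cuda_config = DeepSpeedTransformerConfig(batch_size=64,
                                         max_seq_length=128,
                                         hidden_size=1024,
                                         heads=16,
                                         attn_dropout_ratio=0.1,
                                         hidden_dropout_ratio=0.1,
                                         num_hidden_layers=num_layers,
                                         initializer_range=0.02,
                                         seed=42,
                                         fp16=True,
                                         pre_layer_norm=False)

# One fused layer per transformer block, indexed by layer id as in the hunk.
layers = nn.ModuleList([copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config))
                        for i in range(num_layers)])
```

Setting pre_layer_norm=False keeps the post-LayerNorm layout of the original BERT architecture, matching the BertLayer stack the kernel replaces.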
5 changes: 5 additions & 0 deletions BingBertSquad/utils.py
@@ -112,6 +112,11 @@ def get_argument_parser():
default=100,
help='Interval to print training details.')

parser.add_argument('--deepspeed_transformer_kernel',
default=False,
action='store_true',
help='Use DeepSpeed transformer kernel to accelerate.')

return parser

def get_summary_writer(name, base=".."):
9 changes: 5 additions & 4 deletions bing_bert/bert_large_lamb_seq128.json
@@ -37,12 +37,13 @@
},
"training": {
"num_epochs": 150,
"warmup_proportion": 0.02,
"learning_rate": 4e-3,
"warmup_proportion": 0.06,
"learning_rate": 11e-3,
"num_workers": 0,
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 1000,
"total_training_steps": 187000
"decay_step": 250,
"total_training_steps": 7500
},
"validation": {
"path": "validation_set/"
13 changes: 7 additions & 6 deletions bing_bert/bert_large_lamb_seq512.json
@@ -36,13 +36,14 @@
]
},
"training": {
"num_epochs": 162,
"warmup_proportion": 0.01,
"learning_rate": 4e-3,
"num_epochs": 160,
"warmup_proportion": 0.02,
"learning_rate": 2e-3,
"num_workers": 0,
"decay_rate": 0.70,
"decay_step": 1000,
"total_training_steps": 187000
"async_worker": true,
"decay_rate": 0.90,
"decay_step": 150,
"total_training_steps": 7500
},
"validation": {
"path": "/data/bert/validation_set/"
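
Both pre-training configs move from a long 187,000-step schedule to a much shorter 7,500-step one with more aggressive decay. Assuming the usual pattern of linear warmup followed by an exponential step decay of decay_rate every decay_step steps (an assumption about how these fields are consumed; the scheduler code is not part of this diff), the new schedules can be sanity-checked in a few lines:

```python
# Hedged sketch: assumes linear warmup then an exponential step decay of
# decay_rate every decay_step steps. This is an assumption about how
# decay_rate/decay_step are consumed, not code from the repository.
def lr_at(step, base_lr, decay_rate, decay_step, warmup_proportion, total_steps):
    warmup_steps = int(warmup_proportion * total_steps)
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr * decay_rate ** ((step - warmup_steps) // decay_step)

# seq128: peaks at 1.1e-2, ends around 5.8e-4 after ~28 decay intervals.
print(lr_at(7500, base_lr=11e-3, decay_rate=0.90, decay_step=250,
            warmup_proportion=0.06, total_steps=7500))
# seq512: peaks at 2e-3, ends around 1.1e-5 after ~49 decay intervals.
print(lr_at(7500, base_lr=2e-3, decay_rate=0.90, decay_step=150,
            warmup_proportion=0.02, total_steps=7500))
```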
36 changes: 36 additions & 0 deletions bing_bert/data_worker.py
@@ -0,0 +1,36 @@
import threading
import queue
import time

class AsyncWorker(threading.Thread):
def __init__(self, dataloaders, dataset_picker):
threading.Thread.__init__(self)
self.req_queue = queue.Queue()
self.ret_queue = queue.Queue()
self.dataloaders = dataloaders
self.dataset_picker = dataset_picker
self.prefetch_idx = 3
for i in range(self.prefetch_idx):
self.req_queue.put(dataset_picker[i])

def run(self):
while True:
dataset_type = self.req_queue.get(block=True)
if dataset_type is None:
break
batch = next(self.dataloaders[dataset_type])
self.req_queue.task_done()
self.ret_queue.put(batch)

def get(self):
batch = self.ret_queue.get()
self.ret_queue.task_done()
return batch

def prefetch(self):
if self.prefetch_idx < len(self.dataset_picker):
self.req_queue.put(self.dataset_picker[self.prefetch_idx])
self.prefetch_idx += 1

def stop(self):
self.req_queue.put(None)
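
data_worker.py is new in this commit. AsyncWorker is a small prefetching thread: the constructor primes three requests, run() serves each request by pulling the next batch from whichever dataloader the picker names, and a training loop alternates get() and prefetch() until it calls stop(). A hypothetical usage sketch follows; the toy dataloaders and picker are stand-ins, not code from the repo.

```python
# Hypothetical usage sketch for AsyncWorker (names below are illustrative).
import itertools
from data_worker import AsyncWorker

# Dummy infinite iterators standing in for the real seq128/seq512 loaders.
dataloaders = {
    'seq128': itertools.cycle([{'len': 128}]),
    'seq512': itertools.cycle([{'len': 512}]),
}
# Order in which datasets should be drawn, one entry per training step.
dataset_picker = ['seq128', 'seq128', 'seq512', 'seq128']

worker = AsyncWorker(dataloaders, dataset_picker)
worker.start()              # begins serving the 3 requests primed in __init__

for _ in dataset_picker:
    batch = worker.get()    # blocks until a prefetched batch is ready
    worker.prefetch()       # queue the next dataset in the schedule, if any
    # ... run forward/backward on `batch` here ...

worker.stop()               # sends the sentinel so run() exits
```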
16 changes: 9 additions & 7 deletions bing_bert/deepspeed_bsz16K_lamb_config.json
@@ -1,20 +1,22 @@
{
"train_batch_size": 16384,
"train_micro_batch_size_per_gpu": 16,
"train_batch_size": 65536,
"train_micro_batch_size_per_gpu": 64,
"steps_per_print": 1000,
"prescale_gradients": true,
"prescale_gradients": false,
"optimizer": {
"type": "Lamb",
"params": {
"lr": 4e-3,
"lr": 11e-3,
"max_grad_norm": 1.0,
"weight_decay": 0.01,
"bias_correction": false,
"max_coeff": 0.5,
"min_coeff": 0.08
"max_coeff": 0.3,
"min_coeff": 0.01
}
},
"wall_clock_breakdown": true,

"wall_clock_breakdown": false,

"fp16": {
"enabled": true,
"loss_scale": 0
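
The seq128 global batch size quadruples to 64K with a 64-per-GPU micro batch. DeepSpeed requires train_batch_size to equal train_micro_batch_size_per_gpu × gradient accumulation steps × the number of GPUs, so the implied accumulation depends on cluster size; the 1024-GPU figure below is an assumption used only to illustrate the arithmetic, not something stated in the commit.

```python
# train_batch_size == micro_batch_per_gpu * grad_accum_steps * num_gpus
# (DeepSpeed enforces this relation). num_gpus=1024 is an assumed cluster size.
def accum_steps(train_batch_size, micro_batch_per_gpu, num_gpus):
    assert train_batch_size % (micro_batch_per_gpu * num_gpus) == 0
    return train_batch_size // (micro_batch_per_gpu * num_gpus)

print(accum_steps(65536, 64, 1024))  # seq128 config above  -> 1
print(accum_steps(32768, 8, 1024))   # seq512 config below  -> 4
```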
16 changes: 9 additions & 7 deletions bing_bert/deepspeed_bsz16K_lamb_config_seq512.json
@@ -1,20 +1,22 @@
{
"train_batch_size": 16384,
"train_micro_batch_size_per_gpu": 4,
"train_batch_size": 32768,
"train_micro_batch_size_per_gpu": 8,
"steps_per_print": 1000,
"prescale_gradients": true,
"prescale_gradients": false,
"optimizer": {
"type": "Lamb",
"params": {
"lr": 1e-3,
"lr": 4e-3,
"max_grad_norm": 1.0,
"weight_decay": 0.01,
"bias_correction": false,
"max_coeff": 0.5,
"min_coeff": 0.08
"max_coeff": 0.3,
"min_coeff": 0.01
}
},
"wall_clock_breakdown": true,

"wall_clock_breakdown": false,

"fp16": {
"enabled": true,
"loss_scale": 0
(Diffs for the remaining 7 changed files are not shown.)

0 comments on commit 7acc3ce
