Dev/linky data #341

Open
wants to merge 2 commits into base: main
54 changes: 54 additions & 0 deletions README_YC.md
@@ -0,0 +1,54 @@

## Step 1: Format the data
python /opt/NeMo-Aligner/examples/nlp/data/steerlm/preprocess_openassistant_data.py --output_directory=data/oasst
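
Before moving on, you can confirm that the preprocessing wrote `train.jsonl` and `val.jsonl` into the output directory. A minimal sanity-check sketch (it assumes the `data/oasst` path from the command above, and only reports counts and field names without asserting a schema):

```python
# Sanity-check the preprocessed OASST files (assumes --output_directory=data/oasst).
import json

for split in ("train.jsonl", "val.jsonl"):
    path = f"data/oasst/{split}"
    with open(path, encoding="utf-8") as f:
        first = json.loads(f.readline())   # each line is one JSON record
        count = 1 + sum(1 for _ in f)      # count the remaining lines
    print(f"{path}: {count} records, first record keys: {sorted(first)}")
```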

## Step 2: Run SFT training

export WANDB_DISABLED=true
export NCCL_IB_DISABLE=1 # disable InfiniBand; try this if communication errors occur
export NCCL_P2P_DISABLE=1 # disable P2P transfers; useful when troubleshooting

export NCCL_DEBUG=INFO
export TMPDIR=/mnt/workspace/yangchao.zhou/opt/models/tmp
MODEL="/mnt/workspace/yangchao.zhou/opt/models/Mistral-NeMo-12B-Instruct/Mistral-NeMo-12B-Instruct.nemo"
TRAIN_DS="/mnt/workspace/yangchao.zhou/opt/data/oasst/train.jsonl"
VALID_DS="/mnt/workspace/yangchao.zhou/opt/data/oasst/val.jsonl"
RESULTS="/mnt/workspace/yangchao.zhou/opt/RESULTS/7B"


python examples/nlp/gpt/train_gpt_sft4linky.py \
trainer.precision=bf16 \
trainer.num_nodes=1 \
trainer.devices=8 \
trainer.sft.max_steps=-1 \
trainer.sft.limit_val_batches=40 \
trainer.sft.val_check_interval=1000 \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=8 \
model.megatron_amp_O2=True \
model.activations_checkpoint_granularity=selective \
model.restore_from_path=${MODEL} \
model.optim.lr=5e-6 \
model.data.chat=True \
model.data.num_workers=0 \
model.data.train_ds.micro_batch_size=1 \
model.data.train_ds.global_batch_size=8 \
model.data.train_ds.max_seq_length=1024 \
model.data.train_ds.file_path=${TRAIN_DS} \
model.data.validation_ds.micro_batch_size=1 \
model.data.validation_ds.global_batch_size=8 \
model.data.validation_ds.file_path=${VALID_DS} \
model.data.validation_ds.max_seq_length=1024 \
exp_manager.create_wandb_logger=False \
exp_manager.explicit_log_dir=${RESULTS} \
exp_manager.wandb_logger_kwargs.project=sft_run \
exp_manager.wandb_logger_kwargs.name=chat_sft_run \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.monitor=validation_loss
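
With `trainer.devices=8`, `tensor_model_parallel_size=1`, and `pipeline_model_parallel_size=8`, the eight GPUs hold a single copy of the model, so the data-parallel size is 1 and `global_batch_size=8` is reached through gradient accumulation. A quick sketch of the usual Megatron-style arithmetic (illustrative only, not code from this repo):

```python
# How the parallelism and batch-size flags above fit together (illustrative sketch).
num_nodes, devices = 1, 8     # trainer.num_nodes, trainer.devices
tp, pp = 1, 8                 # tensor / pipeline model parallel sizes
micro_bs, global_bs = 1, 8    # train_ds micro / global batch sizes

world_size = num_nodes * devices              # 8 GPUs in total
data_parallel = world_size // (tp * pp)       # 8 // (1 * 8) = 1 model replica
assert global_bs % (micro_bs * data_parallel) == 0
grad_accum = global_bs // (micro_bs * data_parallel)  # 8 micro-batches per optimizer step

print(f"data_parallel={data_parallel}, gradient_accumulation={grad_accum}")
```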

### Kill the training process
ps -ef | grep train_gpt_sft4linky
pkill -f train_gpt_sft4linky.py
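
The training script presumably picks up its defaults from `examples/nlp/gpt/conf/gpt_sft4linky.yaml` (added in this PR, see below). In that file, values written as `${...}` are OmegaConf interpolations that resolve at load time, and `???` marks required fields that must be supplied on the command line, as in Step 2. A small sketch of how they resolve (run from the repo root; the config path is this PR's layout):

```python
# Inspect the SFT config with OmegaConf (the library NeMo/Hydra configs are built on).
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/nlp/gpt/conf/gpt_sft4linky.yaml")

print(cfg.trainer.sft.save_interval)                # 100, copied from ${.val_check_interval}
print(cfg.model.data.validation_ds.max_seq_length)  # 4096, via train_ds -> model.encoder_seq_length
print(OmegaConf.is_missing(cfg.model, "restore_from_path"))  # True: '???' must be overridden
```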
204 changes: 204 additions & 0 deletions examples/nlp/gpt/conf/gpt_sft4linky.yaml
@@ -0,0 +1,204 @@
name: megatron_gpt_sft

trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  precision: bf16

  sft:
    max_epochs: 1
    max_steps: -1 # -1 means no step cap; training stops at max_epochs

    val_check_interval: 100 # run validation every N training steps
    save_interval: ${.val_check_interval} # checkpoint on the same cadence as validation
    limit_train_batches: 1.0

    limit_val_batches: 1.0
    gradient_clip_val: 1.0

    # can be used to register any custom metrics that require token-by-token generation
    # inference_metrics:
    #   my_metric_name1:
    #     _target_: <metric class>
    #   my_metric_name2:
    #     _target_: <metric class>
    #     <any required arguments>

  # do not change these
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_time: null
  max_epochs: ${.sft.max_epochs}
  max_steps: ${.sft.max_steps}

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: False
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}-{epoch}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: False # need to keep this False, otherwise it will create multiple last.ckpt files because restore resets the previous best model

model:
  seed: 1234
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  restore_from_path: ??? # Path to the base .nemo model to fine-tune or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: False
  encoder_seq_length: 4096 # the sequence length of the encoder model; it will be overwritten by the loaded GPT model

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: False

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  activations_checkpoint_layers_per_pipeline: null
  # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
  answer_only_loss: False # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value
  use_flash_attention: null # if not None, will match the base model's value

  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0

  steerlm2:
    forward_micro_batch_size: 1 # the micro batch size for the forward pass, used to compute the weights
    micro_batch_size: 1 # the steerlm2 training micro batch size

  # can be used to customize behavior of model.generate for inference metrics
  # note that you have to specify all parameters explicitly even if they match defaults
  # as long as you change at least one parameter
  #
  # inference:
  #   sampling_params:
  #     use_greedy: False
  #     temperature: 0.7
  #     top_k: 0
  #     top_p: 0.95
  #     repetition_penalty: 1.0
  #     add_BOS: True
  #     all_probs: False
  #     compute_logprob: False
  #     end_strings: ["<|endoftext|>", "<extra_id_1>"]
  #   length_params:
  #     min_length: 0
  #     max_length: 512
  #   strategy:
  #     _target_: <custom strategy class>
  #     <any required arguments>


  peft:
    peft_scheme: "none" # ["lora", "none"]
    restore_from_path: null

    lora_tuning:
      target_modules: ['attention_qkv'] # this can either be 'attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), or 'all'
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
      layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True


  data:
    chat: False # whether to use chat-style (chatbot) data or not
    chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. Note that some tokenizers may merge the characters at the junction between {end_of_turn}{turn_start}, e.g. '<im end><im start>', where '><' can become a single token. This is not supported, so try to avoid it.
      system_turn_start: "\x00"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\x0A" # \x0A is '\n'
      end_of_name: "\x0A" # \x0A is '\n'
    sample: False # create the index mapping files for the sample data, so max_steps * global_batch_size can be larger than the dataset size
    num_workers: 0
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds.
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: True
      memmap_workers: null
      max_seq_length: ${model.encoder_seq_length}
      min_seq_length: 1
      drop_last: True # note that `False` is not currently supported
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      truncation_field: "input" # Can be multiple keys separated with ','. Options: keys in prompt_template
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file is loaded with JSONLMemMapDataset.
      truncation_method: 'right' # Truncation from which position. Options: ['left', 'right']

    validation_ds:
      file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to train_ds.
      global_batch_size: ${model.data.train_ds.global_batch_size}
      micro_batch_size: ${model.data.train_ds.micro_batch_size}
      shuffle: False
      memmap_workers: ${model.data.train_ds.memmap_workers}
      max_seq_length: ${model.data.train_ds.max_seq_length}
      min_seq_length: 1
      drop_last: True # note that `False` is not currently supported
      label_key: ${model.data.train_ds.label_key}
      add_eos: ${model.data.train_ds.add_eos}
      add_sep: ${model.data.train_ds.add_sep}
      add_bos: ${model.data.train_ds.add_bos}
      truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file is loaded with JSONLMemMapDataset.
      truncation_method: 'right' # Truncation from which position. Options: ['left', 'right']
      output_original_text: True # needed for the proper metrics support

  optim:
    name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 3e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9e-7