From d36dc9dd6647306def88456d9a4033625e0f4486 Mon Sep 17 00:00:00 2001
From: Steve Farrell
Date: Wed, 8 Sep 2021 12:04:16 -0700
Subject: [PATCH] Squashed 'open_catalyst/' changes from 89b0f1d..2281b92

2281b92 Merge pull request #4 from sparticlesteve/fix-opt-name-logging
897dc5d Add opt_name to mlperf logging
ac5f5a5 rearrange config file lines for clarity

git-subtree-dir: open_catalyst
git-subtree-split: 2281b9242e6a0f775d812d2956128d7b1b2da8e2
---
 configs/mlperf_hpc.yml                       | 12 +++++-------
 ocpmodels/trainers/mlperf_forces_trainer.py  |  1 +
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/configs/mlperf_hpc.yml b/configs/mlperf_hpc.yml
index 4856a60..543d135 100755
--- a/configs/mlperf_hpc.yml
+++ b/configs/mlperf_hpc.yml
@@ -52,19 +52,17 @@ optim:
   batch_size: 8
   eval_batch_size: 8
   num_workers: 8
-  # Evaluate every epoch
-  #eval_every: -1000
-  lr_initial: 0.0004
-  lr_gamma: 0.1
-  # New LR schedule based on DistributedSampler partitioning
+  optimizer: AdamW
+  lr_initial: 0.0004
+  warmup_steps: 31252 # 4 epochs
+  warmup_factor: 0.2
   lr_milestones:
     - 125008 # 16 epochs
     - 187512 # 24 epochs
     - 250016 # 32 epochs
-  warmup_steps: 31252 # 4 epochs
-  warmup_factor: 0.2
+  lr_gamma: 0.1
   max_epochs: 30
   energy_coefficient: 0
   force_coefficient: 50
diff --git a/ocpmodels/trainers/mlperf_forces_trainer.py b/ocpmodels/trainers/mlperf_forces_trainer.py
index 0b934f7..b9f12f3 100644
--- a/ocpmodels/trainers/mlperf_forces_trainer.py
+++ b/ocpmodels/trainers/mlperf_forces_trainer.py
@@ -369,6 +369,7 @@ def train(self):
                        value=self.config["optim"]["batch_size"] * self.config["gpus"])
         mllogger.event(key=mllog.constants.TRAIN_SAMPLES, value=len(self.train_loader.dataset))
         mllogger.event(key=mllog.constants.EVAL_SAMPLES, value=len(self.val_loader.dataset))
+        mllogger.event(key=mllog.constants.OPT_NAME, value=self.config["optim"].get("optimizer", "AdamW"))
         mllogger.event(key=mllog.constants.OPT_BASE_LR, value=self.config["optim"]["lr_initial"])
         mllogger.event(key=mllog.constants.OPT_LR_WARMUP_STEPS, value=self.config["optim"]["warmup_steps"])
         mllogger.event(key=mllog.constants.OPT_LR_WARMUP_FACTOR, value=self.config["optim"]["warmup_factor"])
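
The reordered optim block groups the warmup settings with lr_initial and places lr_gamma next to the milestones it applies to. As a minimal sketch (not the repository's code) of how a linear-warmup plus milestone-decay schedule of this shape is commonly evaluated: the function name lr_at_step and the exact interpolation are assumptions; only the config keys and values (lr_initial, warmup_steps, warmup_factor, lr_milestones, lr_gamma) come from the patch above.

    from bisect import bisect

    def lr_at_step(step, optim):
        """Hypothetical helper: learning rate at a given optimizer step."""
        if step <= optim["warmup_steps"]:
            # Linear ramp from warmup_factor * lr_initial up to lr_initial.
            alpha = step / float(optim["warmup_steps"])
            scale = optim["warmup_factor"] * (1.0 - alpha) + alpha
        else:
            # Multiply by lr_gamma once for every milestone already passed.
            scale = optim["lr_gamma"] ** bisect(optim["lr_milestones"], step)
        return optim["lr_initial"] * scale

    optim = {
        "lr_initial": 0.0004,
        "warmup_steps": 31252,   # 4 epochs
        "warmup_factor": 0.2,
        "lr_milestones": [125008, 187512, 250016],
        "lr_gamma": 0.1,
    }
    print(lr_at_step(0, optim))       # 0.2 * 0.0004 = 8e-05
    print(lr_at_step(31252, optim))   # end of warmup: 0.0004
    print(lr_at_step(150000, optim))  # past the first milestone: 4e-05

Under these assumptions, the new OPT_NAME event and the existing OPT_BASE_LR / OPT_LR_WARMUP_* events logged in mlperf_forces_trainer.py simply report these same config values to the MLPerf logger.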