-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy path: rivanna-shm.yaml
113 lines (95 loc) · 3.23 KB
/
rivanna-shm.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
# Experiment configuration for the MLCommons earthquake benchmark,
# run on UVA Rivanna using /dev/shm as the working filesystem.
name: earthquake-shm

# sbatch job-generation settings (cloudmesh-sbatch).
sbatch:
  mode: h
  dir: shm

# Where the benchmark data comes from and where it is placed locally.
data:
  # NOTE(review): "github.com" in the scraped original is a
  # link-rewriter mirror artifact; the canonical upstream host is
  # github.com — confirm before running.
  git: https://github.com/laszewsk/mlcommons-data-earthquake.git
  destination: "$HOME/mlcommons-work"
  earthquake: mlcommons-data-earthquake/data/EarthquakeDec2020
  images: images

# Parameter sweep: comma-separated values are expanded into one job
# per combination by the sbatch generator.
experiment:
  card_name: "a100,v100,p100,k80,rtx2080"
  gpu_count: 1
  cpu_num: 6
  mem: "64GB"
  repeat: "1"
  # repeat: "1,2,3,4,5"
  ## TFTTransformerepochs = num_epochs
  #
  TFTTransformerepochs: "2,10,20,30,34,40,50,60,70"

# Cluster/scheduler settings for Rivanna.
system:
  allocation: "bii_dsc"
  partition: "gpu"
  host: rivanna
  user: "{user}"
  reservation: ""
  constraint: ""

# MLCommons submission metadata.
submission:
  benchmark: earthquake
  submitter: Gregor von Laszewski
  # Quoted: an unquoted value starting with "[" is parsed by YAML as a
  # flow sequence (a list), not a string. The value itself is a redacted
  # placeholder — restore the real address before submitting.
  email: "[email protected]"
  org: University of Virginia
  division: closed
  version: mlcommons-earthquake-v1.0
  github_commit_version: TBD
  status: completed
  platform: rivanna

# Benchmark descriptor (duplicated metadata in the schema this
# consumer expects; key names differ: e-mail, organisation).
benchmark:
  name: Earthquake
  user: Gregor von Laszewski
  # Quoted for the same reason as submission.email above.
  e-mail: "[email protected]"
  organisation: University of Virginia
  version: mlcommons-earthquake-v1.0
  github_commit_version: TBD
  division: closed
  status: completed
  platform: rivanna

# The notebook that is executed as the benchmark.
code:
  script: FFFFWNPFEARTHQ_newTFTv29-gregor-parameters-fig.ipynb

# Runtime settings and model hyperparameters passed to the notebook.
run:
  allocation: ds6011-sp22-002
  # Work in shared memory for fast I/O; {os.USER} is expanded by the
  # consuming tool, not by YAML.
  filesystem: "/dev/shm/{os.USER}"
  # workdir: /dev/shm/{os.USER}/mlcommons-tmp
  venvpath: "$HOME/mlcommons-work/python"
  datadir: data
  branch: 'main'
  colortheme: "False"
  # Slurm time limit: 3 days.
  time: "3-0"
  # Canonical lowercase booleans (parse identically to the original
  # capitalized "False" under YAML 1.1 loaders such as PyYAML).
  set_soft_device_placement: false
  debugging_set_log_device_placement: false
  DLAnalysisOnly: false
  DLRestorefromcheckpoint: false
  DLinputCheckpointpostfix: ''
  ## TFTTransformerbatch_size = minibatch_size:
  ## splits training data into batches used to calculate model error and update model coefficients
  ##
  ## TFTTransformertestvalbatch_size =
  ## max(128,TFTTransformerbatch_size)
  ## the maxibatch_size is a range between min and max for batch size
  ##
  TFTTransformerbatch_size: 64
  ## TFTd_model = hidden_layer_size : number of hidden layers in model
  TFTd_model: 160
  ## Tseq = num_encoder_steps :
  ## Size of sequence window, number of days included in that section of data.
  ## This is used throughout a large portion of the code.
  Tseq: 26
  ## TFTdropout_rate = dropout_rate
  ## The dropout rate when training models.
  ## It randomly drop nodes from a neural network to prevent overfitting
  TFTdropout_rate: 0.1
  ## learning_rate : how quickly the model adapts to the problem,
  ## * Larger means faster convergence but less optimal solutions,
  ## * Slower means slower convergence but more optimal solutions potentially
  ##   fail if learning rate it too small.
  ## In general a variable learning rate is best. start larger and decrease as you see
  ## less returns or as your solution converges.
  learning_rate: 0.0000005
  ## max_gradient_norm : Gradient Clipping. Not currently used in code
  max_gradient_norm: 0.01
  ## early_stopping_patience : Early stopping param for keras, a way to prevent overfit or various metric decreases
  early_stopping_patience: 60
  ## TFTnum_AttentionLayers = num_stacks | stack_size : number of layers in attention head? , Not currently used in code
  TFTnum_AttentionLayers: 2
  ## TFTnum_heads = num_heads : number of attention heads
  TFTnum_heads: 4