-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy path: rivanna-shm.yaml
113 lines (95 loc) · 3.23 KB
/
rivanna-shm.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
# Experiment configuration for the MLCommons earthquake benchmark,
# run on UVA Rivanna using /dev/shm as the working filesystem.
name: earthquake-shm

# sbatch job-generation settings (cloudmesh-sbatch).
sbatch:
  mode: h
  dir: shm

# Where the benchmark data comes from and where it is placed locally.
data:
  # NOTE(review): "github.com" in the scraped original is a
  # link-rewriter mirror artifact; the canonical upstream host is
  # github.com — confirm before running.
  git: https://github.com/laszewsk/mlcommons-data-earthquake.git
  destination: "$HOME/mlcommons-work"
  earthquake: mlcommons-data-earthquake/data/EarthquakeDec2020
  images: images

# Parameter sweep: comma-separated values are expanded into one job
# per combination by the sbatch generator.
experiment:
  card_name: "a100,v100,p100,k80,rtx2080"
  gpu_count: 1
  cpu_num: 6
  mem: "64GB"
  repeat: "1"
  # repeat: "1,2,3,4,5"
  ## TFTTransformerepochs = num_epochs
  #
  TFTTransformerepochs: "2,10,20,30,34,40,50,60,70"

# Cluster/scheduler settings for Rivanna.
system:
  allocation: "bii_dsc"
  partition: "gpu"
  host: rivanna
  user: "{user}"
  reservation: ""
  constraint: ""

# MLCommons submission metadata.
submission:
  benchmark: earthquake
  submitter: Gregor von Laszewski
  # Quoted: an unquoted value starting with "[" is parsed by YAML as a
  # flow sequence (a list), not a string. The value itself is a redacted
  # placeholder — restore the real address before submitting.
  email: "[email protected]"
  org: University of Virginia
  division: closed
  version: mlcommons-earthquake-v1.0
  github_commit_version: TBD
  status: completed
  platform: rivanna

# Benchmark descriptor (duplicated metadata in the schema this
# consumer expects; key names differ: e-mail, organisation).
benchmark:
  name: Earthquake
  user: Gregor von Laszewski
  # Quoted for the same reason as submission.email above.
  e-mail: "[email protected]"
  organisation: University of Virginia
  version: mlcommons-earthquake-v1.0
  github_commit_version: TBD
  division: closed
  status: completed
  platform: rivanna

# The notebook that is executed as the benchmark.
code:
  script: FFFFWNPFEARTHQ_newTFTv29-gregor-parameters-fig.ipynb

# Runtime settings and model hyperparameters passed to the notebook.
run:
  allocation: ds6011-sp22-002
  # Work in shared memory for fast I/O; {os.USER} is expanded by the
  # consuming tool, not by YAML.
  filesystem: "/dev/shm/{os.USER}"
  # workdir: /dev/shm/{os.USER}/mlcommons-tmp
  venvpath: "$HOME/mlcommons-work/python"
  datadir: data
  branch: 'main'
  colortheme: "False"
  # Slurm time limit: 3 days.
  time: "3-0"
  # Canonical lowercase booleans (parse identically to the original
  # capitalized "False" under YAML 1.1 loaders such as PyYAML).
  set_soft_device_placement: false
  debugging_set_log_device_placement: false
  DLAnalysisOnly: false
  DLRestorefromcheckpoint: false
  DLinputCheckpointpostfix: ''
  ## TFTTransformerbatch_size = minibatch_size:
  ## splits training data into batches used to calculate model error and update model coefficients
  ##
  ## TFTTransformertestvalbatch_size =
  ## max(128,TFTTransformerbatch_size)
  ## the maxibatch_size is a range between min and max for batch size
  ##
  TFTTransformerbatch_size: 64
  ## TFTd_model = hidden_layer_size : number of hidden layers in model
  TFTd_model: 160
  ## Tseq = num_encoder_steps :
  ## Size of sequence window, number of days included in that section of data.
  ## This is used throughout a large portion of the code.
  Tseq: 26
  ## TFTdropout_rate = dropout_rate
  ## The dropout rate when training models.
  ## It randomly drop nodes from a neural network to prevent overfitting
  TFTdropout_rate: 0.1
  ## learning_rate : how quickly the model adapts to the problem,
  ## * Larger means faster convergence but less optimal solutions,
  ## * Slower means slower convergence but more optimal solutions potentially
  ##   fail if learning rate it too small.
  ## In general a variable learning rate is best. start larger and decrease as you see
  ## less returns or as your solution converges.
  learning_rate: 0.0000005
  ## max_gradient_norm : Gradient Clipping. Not currently used in code
  max_gradient_norm: 0.01
  ## early_stopping_patience : Early stopping param for keras, a way to prevent overfit or various metric decreases
  early_stopping_patience: 60
  ## TFTnum_AttentionLayers = num_stacks | stack_size : number of layers in attention head? , Not currently used in code
  TFTnum_AttentionLayers: 2
  ## TFTnum_heads = num_heads : number of attention heads
  TFTnum_heads: 4