diff --git a/reporting/config/benchmarks.yaml b/reporting/config/benchmarks.yaml
new file mode 100644
index 0000000..e019fa9
--- /dev/null
+++ b/reporting/config/benchmarks.yaml
@@ -0,0 +1,865 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+# Configuration for benchmarks to be included in the Benchmark.AI report. This
+# file is of the following format:
+#
+# benchmarks:
+#   - Metric Prefix: the metric prefix without trailing period (required).
+#     Metric Suffix: the metric suffix without leading period (required).
+#     Type: Training CV, Training NLP or Inference (required)
+#     Framework: The framework being tested, e.g. MXNet, Tensorflow, MXNet Import
+#     Model: The model being used to test, e.g. ResNet-50
+#     Benchmark Desc: Optional benchmark description
+#     Instance Type: m4.4xlarge
+#     Num Instances:
+#     CPU Memory:
+#     GPU Memory Max:
+#     GPU Memory Mean:
+#     GPU Memory Std:
+#     ### This section for Inference tasks ###
+#     Latency:
+#     P50 Latency:
+#     P90 Latency:
+#     Throughput:
+#     ### This section for Training CV tasks ###
+#     Model:
+#     Precision:
+#     Perplexity:
+#     GPU Memory Max:
+#     GPU Memory Mean:
+#     GPU Memory Std:
+#     Top 1 Val Acc:
+#     Top 1 Train Acc:
+#     Throughput:
+#     Time to Train:
+#     ### This section for Training NLP tasks ###
+#     Model:
+#     Precision:
+#     Perplexity:
+#     Throughput:
+#     Time to Train:
+# TODO(vishaalk): Use default values to avoid redundancy.
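+#
+# For illustration only (a hypothetical entry; the prefix and metric names
+# below are examples, not metrics that are actually collected), a minimal
+# inference benchmark entry would look like:
+#
+# benchmarks:
+#   - Metric Prefix: mxnet.example_resnet50_inference
+#     Metric Suffix: nightly.c5_18x
+#     Type: Inference
+#     Framework: MXNet
+#     Model: ResNet-50
+#     Instance Type: c5.18xlarge
+#     CPU Memory: cpu_memory_usage
+#     Latency: inference_latency
+#     Throughput: inference_throughput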
+benchmarks:
+# ------------------ Tensorflow MKL --------------------------------------------
+  - Metric Prefix: mxnet.tensorflow_mkl_c5_resnet50_18xlg
+    Metric Suffix: nightly.c5_18x
+    Type: Training CV
+    Framework: Tensorflow
+    Model: ResNet-50
+    Benchmark Desc: Batch size 1
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Throughput: images/sec(training with batch size 1)
+
+  - Metric Prefix: mxnet.tensorflow_mkl_c5_resnet50_18xlg
+    Metric Suffix: nightly.c5_18x
+    Type: Training CV
+    Framework: Tensorflow
+    Model: ResNet-50
+    Benchmark Desc: Batch size 32
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Throughput: images/sec(training with batch size 32)
+
+  - Metric Prefix: mxnet.tensorflow_mkl_c5_resnet50_18xlg
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: Tensorflow
+    Model: ResNet-50
+    Benchmark Desc: Batch size 1
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Throughput: images/sec(Inference with batch size 1)
+
+  - Metric Prefix: mxnet.tensorflow_mkl_c5_resnet50_18xlg
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: Tensorflow
+    Model: ResNet-50
+    Benchmark Desc: Batch size 32
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Throughput: images/sec(Inference with batch size 32)
+# ------------------ Tensorflow Horovod ----------------------------------------
+  - Metric Prefix: mxnet.tensorflow_horovod_resnet50_p3_16xlg_batch_2048
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: Tensorflow Horovod
+    Model: ResNet-50
+    Instance Type: p3.16xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Top 1 Val Acc: Top 1 accuracy
+    Throughput: Images per sec
+    Time to Train: Time-to-train (seconds)
+# ------------------ Resnet-50 -------------------------------------------------
+# TODO(vishaalk): This test doesn't appear in the dashboards, should it be removed?
+  - Metric Prefix: chainer.resnet50_imagenet_sagemaker_ch_docker
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: Chainer Sagemaker
+    Model: ResNet-50
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+# ------------------ DAWNBench CIFAR-10 ----------------------------------------
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon
+    Metric Suffix: _p3_2x_nightly
+    Type: Training CV
+    Framework: MXNet Gluon
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Throughput: training_throughput
+    Time to Train: training_duration
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon
+    Metric Suffix: _p3_2x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon_infer
+    Metric Suffix: _c5_18x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: c5.18xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    Uptime: uptime_in_seconds
+
+# ------------------ DAWNBench CIFAR-10 Gluon Hybrid ---------------------------
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon_hybrid
+    Metric Suffix: _p3_2x_nightly
+    Type: Training CV
+    Framework: MXNet Gluon Hybrid
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Throughput: training_throughput
+    Time to Train: training_duration
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon_hybrid
+    Metric Suffix: _p3_2x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_gluon_hybrid_infer
+    Metric Suffix: _c5_18x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: c5.18xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    Uptime: uptime_in_seconds
+
+# ------------------ DAWNBench CIFAR-10 Module ----------------------------
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_module
+    Metric Suffix: _p3_2x_nightly
+    Type: Training CV
+    Framework: MXNet Module
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Throughput: training_throughput
+    Time to Train: training_duration
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_module
+    Metric Suffix: _p3_2x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: p3.2xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.dawnbench_cifar10_resnet164_basic_module_infer
+    Metric Suffix: _c5_18x_nightly
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-164
+    Benchmark Desc: CIFAR-10
+    Instance Type: c5.18xlarge
+    Latency: inference_latency
+    Throughput: inference_throughput
+    CPU Memory: cpu_memory_usage
+    Uptime: uptime_in_seconds
+
+# ------------------ LSTM PTB --------------------------------------------------
+  - Metric Prefix: mxnet.lstm_ptb_imperative
+    Metric Suffix: nightly.c4_8x
+    Type: Training NLP
+    Framework: MXNet Imperative
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: c4.8xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Loss: test_loss
+    Perplexity: test_perplexity
+    Throughput:
+    Time to Train: total_training_time
+    Validation Loss: validation_loss
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_imperative
+    Metric Suffix: nightly.c5_18x
+    Type: Training NLP
+    Framework: MXNet Imperative
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+    Perplexity: test_perplexity
+    Throughput:
+    Time to Train: total_training_time
+    Validation Loss: validation_loss
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_imperative
+    Metric Suffix: nightly.p2_16x
+    Type: Training NLP
+    Framework: MXNet Imperative
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: p2.16xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+    Perplexity: test_perplexity
+    Throughput:
+    Time to Train: total_training_time
+    Validation Loss: validation_loss
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_imperative
+    Metric Suffix: nightly.p3_x
+    Type: Training NLP
+    Framework: MXNet Imperative
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: p3.8xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+    Perplexity: test_perplexity
+    Throughput:
+    Time to Train: total_training_time
+    Validation Loss: validation_loss
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.lstm_ptb_symbolic
+    Metric Suffix: nightly.c4_8x
+    Type: Training NLP
+    Framework: MXNet Symbolic
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: c4.8xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+# TODO(vishaalk): Train perplexity should be test perplexity.
+    Perplexity: train_perplexity
+# TODO(vishaalk): Speed is likely wrong below. Investigate.
+    Throughput: speed
+    Time to Train: total_training_time
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_symbolic
+    Metric Suffix: nightly.c5_18x
+    Type: Training NLP
+    Framework: MXNet Symbolic
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+# TODO(vishaalk): Train perplexity should be test perplexity.
+    Perplexity: train_perplexity
+# TODO(vishaalk): Speed is likely wrong below. Investigate.
+    Throughput: speed
+    Time to Train: total_training_time
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_symbolic
+    Metric Suffix: nightly.p2_16x
+    Type: Training NLP
+    Framework: MXNet Symbolic
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: p2.16xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+# TODO(vishaalk): Train perplexity should be test perplexity.
+    Perplexity: train_perplexity
+# TODO(vishaalk): Speed is likely wrong below. Investigate.
+    Throughput: speed
+    Time to Train: total_training_time
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.lstm_ptb_symbolic
+    Metric Suffix: nightly.p3_x
+    Type: Training NLP
+    Framework: MXNet Symbolic
+    Model: LSTM
+    Benchmark Desc: PTB
+    Instance Type: p3.8xlarge
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Loss: test_loss
+# TODO(vishaalk): Train perplexity should be test perplexity.
+    Perplexity: train_perplexity
+# TODO(vishaalk): Speed is likely wrong below. Investigate.
+    Throughput: speed
+    Time to Train: total_training_time
+    Validation Perplexity: validation_perplexity
+    Uptime: uptime_in_seconds
+
+# ------------------ Resnet-18 --------------------------------------------------
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 50 req 25
+    Instance Type: p3.8xlarge
+    CPU Memory: cpu_memory_usage
+    Throughput: Throughput_concurrency_50_req_25
+    Error Rate: Error_rate_concurrency_50_req_25
+    Latency: Average_latency_concurrency_50_req_25
+    P50 Latency: Median_latency_concurrency_50_req_25
+    P90 Latency: P90_latency_concurrency_50_req_25
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 50 req 50
+    Instance Type: p3.8xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    Throughput: Throughput_concurrency_50_req_50
+    Error Rate: Error_rate_concurrency_50_req_50
+    Latency: Average_latency_concurrency_50_req_50
+    P50 Latency: Median_latency_concurrency_50_req_50
+    P90 Latency: P90_latency_concurrency_50_req_50
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 100 req 25
+    Instance Type: p3.8xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    Precision:
+    Perplexity:
+    Top 1 Val Acc:
+    Top 1 Train Acc:
+    Throughput: Throughput_concurrency_100_req_25
+    Time to Train:
+    Error Rate: Error_rate_concurrency_100_req_25
+    Latency: Average_latency_concurrency_100_req_25
+    P50 Latency: Median_latency_concurrency_100_req_25
+    P90 Latency: P90_latency_concurrency_100_req_25
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 50 req 100
+    Instance Type: p3.8xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    Precision:
+    Perplexity:
+    Top 1 Val Acc:
+    Top 1 Train Acc:
+    Throughput: Throughput_concurrency_50_req_100
+    Time to Train:
+    Error Rate: Error_rate_concurrency_50_req_100
+    Latency: Average_latency_concurrency_50_req_100
+    P50 Latency: Median_latency_concurrency_50_req_100
+    P90 Latency: P90_latency_concurrency_50_req_100
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 100 req 50
+    Instance Type: p3.8xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    Precision:
+    Perplexity:
+    Top 1 Val Acc:
+    Top 1 Train Acc:
+    Throughput: Throughput_concurrency_100_req_50
+    Time to Train:
+    Error Rate: Error_rate_concurrency_100_req_50
+    Latency: Average_latency_concurrency_100_req_50
+    P50 Latency: Median_latency_concurrency_100_req_50
+    P90 Latency: P90_latency_concurrency_100_req_50
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.mms_resnet18_gpu
+    Metric Suffix: nightly.p3_8x
+    Type: Inference
+    Framework: MXNet MMS
+    Model: ResNet-18
+    Benchmark Desc: 100 req 100
+    Instance Type: p3.8xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    Precision:
+    Perplexity:
+    Top 1 Val Acc:
+    Top 1 Train Acc:
+    Throughput: Throughput_concurrency_100_req_100
+    Time to Train:
+    Error Rate: Error_rate_concurrency_100_req_100
+    Latency: Average_latency_concurrency_100_req_100
+    P50 Latency: Median_latency_concurrency_100_req_100
+    P90 Latency: P90_latency_concurrency_100_req_100
+    Uptime: uptime_in_seconds
+
+# ------------------ Resnet-50 --------------------------------------------------
+  - Metric Prefix: mxnet.resnet50_imagenet_sagemaker_mx_docker
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: Sagemaker
+    Model: ResNet-50
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision:
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+# ------------------ Resnet-50V1 ------------------------------------------------
+  - Metric Prefix: mxnet.mxnet_resnet50v1_imagenet_gluon_fp16
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Gluon
+    Model: ResNet-50V1
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP16
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.mxnet_resnet50v1_imagenet_symbolic_fp16
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Symbolic
+    Model: ResNet-50V1
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP16
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.mxnet_resnet50v1_imagenet_symbolic_fp32
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Symbolic
+    Model: ResNet-50V1
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP32
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.mxnet_resnet50v1_imagenet_symbolic_fp16_p38x
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Symbolic
+    Model: ResNet-50V1
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP16
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.resnet50_imagenet-480px-256px-q95_p3_16x_fp16_docker
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: MXNet
+    Model: ResNet-50
+    Benchmark Desc: ImageNet 480x256-q95
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP16
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: pytorch.resnet50_imagenet_sagemaker_pt_docker
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: PyTorch
+    Model: ResNet-50
+    Benchmark Desc: ImageNet 480x256-q95
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Perplexity:
+    Uptime: uptime_in_seconds
+# Unused:
+# top 1 precision
+# loss
+# top 5 precision
+# seconds
+
+  - Metric Prefix: tensorflow.resnet50_imagenet_sagemaker_tf_docker
+    Metric Suffix: nightly.p3_16x
+    Type: Training CV
+    Framework: Tensorflow
+    Model: ResNet-50
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Perplexity:
+    Uptime: uptime_in_seconds
+# Unused:
+# train-error-top5
+# train-error-top1
+# queue_size
+# xentropy-loss
+# l2_regularize_loss
+# learning_rate
+
+# ------------------ Resnet-50V2 ------------------------------------------------
+  - Metric Prefix: mxnet.mxnet_resnet50v2_imagenet_symbolic_fp16
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Symbolic
+    Model: ResNet-50V2
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP16
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+  - Metric Prefix: mxnet.mxnet_resnet50v2_imagenet_symbolic_fp16
+    Metric Suffix: daily.p3_16x
+    Type: Training CV
+    Framework: MXNet Symbolic
+    Model: ResNet-50V2
+    Benchmark Desc: ImageNet
+    Instance Type: p3.16xlarge
+    Num Instances:
+    CPU Memory: cpu_memory_usage
+    GPU Memory Max: gpu_memory_usage_max
+    GPU Memory Mean: gpu_memory_usage_mean
+    GPU Memory Std: gpu_memory_usage_std
+    Precision: FP32
+    Perplexity:
+    Top 1 Val Acc: validation_acc
+    Top 1 Train Acc: training_acc
+# TODO(vishaalk): Is speed the throughput?
+    Throughput: speed
+    Time to Train: total_training_time
+    Uptime: uptime_in_seconds
+
+# ------------------ ONNX MXNet Import Model ------------------------------------
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: BVLC Alexnet
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_bvlc_alexnet_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: BVLC GoogleNet
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_bvlc_googlenet_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: DenseNet-121
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_densenet121_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: SqueezeNet
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_squeezenet_cpu
+    Uptime: uptime_in_seconds
+# TODO(vishaalk): This is not in the dashboards and appears to be unused.
+#  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+#    Metric Suffix: nightly.c5_18x
+#    Type: Inference
+#    Framework: ONNX CPU
+#    Framework: MXNet
+#    Model: Ref CaffeNet
+#    Benchmark Desc: Import ONNX Model
+#    Instance Type: c5.18xlarge
+#    CPU Memory: cpu_memory_usage
+#    Latency: Average_inference_time_reference_caffenet_cpu
+#    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: BVLC Ref CaffeNet
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_bvlc_reference_caffenet_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: ShuffleNet
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_shufflenet_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: BVLC Ref RCNN Ilsvrc13
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_bvlc_reference_rcnn_ilsvrc13_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: VGG19
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_vgg19_cpu
+    Uptime: uptime_in_seconds
+  - Metric Prefix: mxnet.onnx_mxnet_import_model_inference_test_cpu
+    Metric Suffix: nightly.c5_18x
+    Type: Inference
+    Framework: MXNet
+    Model: ResNet-50
+    Benchmark Desc: Import ONNX Model
+    Instance Type: c5.18xlarge
+    CPU Memory: cpu_memory_usage
+    Latency: Average_inference_time_resnet50_cpu
+    Uptime: uptime_in_seconds
+# -----------------------------------------------------------------------------