Merge pull request #561 from runame/scoring
Fix scoring issues
priyakasimbeg authored Nov 3, 2023
2 parents 931f71f + 4151e09 commit 0943802
Showing 5 changed files with 48 additions and 17 deletions.
3 changes: 1 addition & 2 deletions scoring/score_submission.py
@@ -5,8 +5,7 @@
 from absl import logging
 import scoring_utils
 
-from algorithmic_efficiency import workloads
-import scoring
+from scoring import scoring
 
 flags.DEFINE_string(
     'experiment_path',
15 changes: 11 additions & 4 deletions scoring/scoring.py
@@ -40,6 +40,12 @@
 WORKLOADS = workloads_registry.WORKLOADS
 WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
 BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/'
+# These global variables have to be set according to the current set of
+# workloads and rules for the scoring to be correct.
+# We do not use the workload registry since it contains test and development
+# workloads as well.
+NUM_WORKLOADS = 8
+NUM_TRIALS = 5
 
 MIN_EVAL_METRICS = [
     'ce_loss',
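
Because NUM_WORKLOADS and NUM_TRIALS must be kept in sync with the rules by hand, a drift check along these lines could catch a stale value. This is a hypothetical sketch, not part of this commit; it assumes each base workload has its own subdirectory under BASE_WORKLOADS_DIR, which may overcount if development or test workloads share the tree:

import os
import warnings

from scoring.scoring import BASE_WORKLOADS_DIR, NUM_WORKLOADS

def check_workload_count():
  # Count the subdirectories under the base workloads directory; warn rather
  # than error, since the layout assumption may not hold.
  found = sum(
      os.path.isdir(os.path.join(BASE_WORKLOADS_DIR, d))
      for d in os.listdir(BASE_WORKLOADS_DIR))
  if found != NUM_WORKLOADS:
    warnings.warn(f'NUM_WORKLOADS is {NUM_WORKLOADS} but {found} workload '
                  f'directories exist under {BASE_WORKLOADS_DIR}.')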
@@ -133,9 +139,10 @@ def get_index_that_reaches_target(workload_df,
   # Remove trials that never reach the target
   target_reached = target_reached[target_reached.apply(np.any)]
 
-  # If we have no trials that have reached the target, return -1. Else, return
-  # the eval index of the earliest point the target is reached.
-  if target_reached.empty:
+  # If fewer than 3 trials reach the target, the submission will be scored as
+  # missing the target on this workload; return -1. Else, return the eval index
+  # of the earliest point the target is reached.
+  if len(target_reached) < 3:
     return -1, -1
   else:
     index_reached = target_reached.apply(np.argmax)
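
The effect of the new threshold on toy data (a standalone sketch, not from the repo; the Series of boolean eval histories mirrors the shape of target_reached above):

import numpy as np
import pandas as pd

# One boolean array per trial: did the trial meet the target at each eval?
target_reached = pd.Series({
    'trial_1': np.array([False, False, True, True]),
    'trial_2': np.array([False, True, True, True]),
    'trial_3': np.array([False, False, False, False]),  # never reaches it
})
# Same filter as above: drop trials that never reach the target.
target_reached = target_reached[target_reached.apply(np.any)]
# Old rule: any surviving trial counts (2 survive here). New rule: fewer
# than 3 surviving trials means the workload is scored as missed.
print(len(target_reached) < 3)  # True -> the function returns (-1, -1)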
@@ -287,7 +294,7 @@ def compute_performance_profiles(results,
       np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0)
 
   def rho(r, tau):
-    return (r <= tau).sum(axis=1) / len(r.columns)
+    return (r <= tau).sum(axis=1) / NUM_WORKLOADS
 
   perf_df = pd.concat([rho(df, tau) for tau in points], axis=1)
 
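Why dividing by NUM_WORKLOADS instead of len(r.columns) matters: if a submission produced results for only some workloads, the old denominator silently renormalized over the workloads that are present. A toy illustration (standalone sketch; r stands in for a submission's row of performance ratios, with np.inf marking a missed target):

import numpy as np
import pandas as pd

NUM_WORKLOADS = 8
# Ratios for only 6 of the 8 workloads, e.g. because two runs never finished.
r = pd.DataFrame([[1.0, 1.2, 2.0, 1.1, 3.0, np.inf]], index=['submission_a'])
tau = 2.0
print((r <= tau).sum(axis=1) / len(r.columns))  # old: 4/6 ~ 0.67
print((r <= tau).sum(axis=1) / NUM_WORKLOADS)   # new: 4/8 = 0.50, since the
                                                # two missing workloads now
                                                # count as failures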
21 changes: 17 additions & 4 deletions scoring/scoring_utils.py
@@ -1,10 +1,14 @@
 import json
 import os
 import re
+import warnings
 
 from absl import logging
 import pandas as pd
 
+from scoring.scoring import NUM_TRIALS
+from scoring.scoring import NUM_WORKLOADS
+
 TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---'
 METRICS_LINE_REGEX = '(.*) Metrics: ({.*})'
 TRIAL_DIR_REGEX = 'trial_(\d+)'
@@ -103,8 +107,7 @@ def get_trials_df_dict(logfile):
   """
   trials_dict = get_trials_dict(logfile)
   trials_df_dict = {}
-  for trial in trials_dict.keys():
-    metrics = trials_dict[trial]
+  for trial, metrics in trials_dict.items():
     trials_df_dict[trial] = pd.DataFrame(metrics)
   return trials_df_dict
 
@@ -156,6 +159,10 @@ def get_experiment_df(experiment_dir):
   """
   df = pd.DataFrame()
   workload_dirs = os.listdir(experiment_dir)
+  num_workloads = len(workload_dirs)
+  if num_workloads != NUM_WORKLOADS:
+    warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are '
+                  f'{num_workloads}.')
   for workload in workload_dirs:
     data = {
         'workload': workload,
@@ -164,6 +171,7 @@
         t for t in os.listdir(os.path.join(experiment_dir, workload))
         if re.match(TRIAL_DIR_REGEX, t)
     ]
+    workload_df = pd.DataFrame()
     for trial in trial_dirs:
       eval_measurements_filepath = os.path.join(
           experiment_dir,
@@ -173,13 +181,18 @@
       )
       try:
         trial_df = pd.read_csv(eval_measurements_filepath)
-      except FileNotFoundError as e:
+      except FileNotFoundError:
         logging.info(f'Could not read {eval_measurements_filepath}')
         continue
       data['trial'] = trial
       for column in trial_df.columns:
         values = trial_df[column].to_numpy()
         data[column] = values
       trial_df = pd.DataFrame([data])
-      df = pd.concat([df, trial_df], ignore_index=True)
+      workload_df = pd.concat([workload_df, trial_df], ignore_index=True)
+    num_trials = len(workload_df)
+    if num_trials != NUM_TRIALS:
+      warnings.warn(f'There should be {NUM_TRIALS} trials for workload '
+                    f'{workload} but there are only {num_trials}.')
+    df = pd.concat([df, workload_df], ignore_index=True)
   return df
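
A usage sketch for callers who would rather fail hard on incomplete experiments (assuming the scoring package is importable from the repo root; the experiment path is a placeholder):

import warnings

from scoring import scoring_utils

with warnings.catch_warnings():
  # Promote the completeness warnings emitted above into exceptions.
  warnings.simplefilter('error')
  try:
    df = scoring_utils.get_experiment_df('experiments/my_submission')
  except UserWarning as e:
    raise SystemExit(f'Experiment directory is incomplete: {e}')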
25 changes: 19 additions & 6 deletions scoring/test_scoring_utils.py
@@ -1,8 +1,11 @@
 from absl.testing import absltest
-import scoring_utils
 
-TEST_LOGFILE = 'test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log'
-TEST_DIR = 'test_data/experiment_dir'
+from scoring import scoring_utils
+from scoring.scoring import NUM_TRIALS
+from scoring.scoring import NUM_WORKLOADS
+
+TEST_LOGFILE = 'scoring/test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log'
+TEST_DIR = 'scoring/test_data/experiment_dir'
 NUM_EVALS = 18
 
 
@@ -14,8 +17,7 @@ def test_get_trials_dict(self):
 
   def test_get_trials_df_dict(self):
     trials_dict = scoring_utils.get_trials_df_dict(TEST_LOGFILE)
-    for trial in trials_dict:
-      df = trials_dict[trial]
+    for df in trials_dict.values():
       self.assertEqual(len(df.index), NUM_EVALS)
 
   def test_get_trials_df(self):
@@ -24,7 +26,18 @@ def test_get_trials_df(self):
       self.assertEqual(len(df.at['1', column]), NUM_EVALS)
 
   def test_get_experiment_df(self):
-    df = scoring_utils.get_experiment_df(TEST_DIR)
+    _ = scoring_utils.get_experiment_df(TEST_DIR)
+    self.assertWarnsRegex(
+        Warning,
+        f'There should be {NUM_WORKLOADS} workloads but there are 1.',
+        scoring_utils.get_experiment_df,
+        TEST_DIR)
+    self.assertWarnsRegex(
+        Warning,
+        f'There should be {NUM_TRIALS} trials for workload mnist_jax but there '
+        'are only 1.',
+        scoring_utils.get_experiment_df,
+        TEST_DIR)
 
 
 if __name__ == '__main__':
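
For reference, the assertWarnsRegex pattern used above in miniature (a self-contained sketch, not from the repo; unittest treats the message as a regex and calls the callable with the arguments that follow it):

import unittest
import warnings


def load_workloads(n):
  if n != 8:
    warnings.warn(f'There should be 8 workloads but there are {n}.')


class WarnTest(unittest.TestCase):

  def test_warns_on_missing_workloads(self):
    # (category, regex, callable, *args) -- the call happens inside the assert.
    self.assertWarnsRegex(Warning, 'should be 8 workloads', load_workloads, 1)


if __name__ == '__main__':
  unittest.main()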
1 change: 0 additions & 1 deletion setup.cfg
@@ -37,7 +37,6 @@ install_requires =
     absl-py==1.4.0
     numpy>=1.23
     pandas>=2.0.1
-    tabulate==0.9.0
     tensorflow==2.12.0
     tensorflow-datasets==4.9.2
     tensorflow-probability==0.20.0
