From a19ba044e0ad42c49e95f09298b491db5d8ed05c Mon Sep 17 00:00:00 2001
From: Waymo Research
Date: Wed, 21 Apr 2021 21:09:42 +0000
Subject: [PATCH] Merged commit includes the following changes:

369731584 by Waymo Research:

    Internal changes

--
369724720 by Waymo Research:

    Set minimum score and IOU with flag, tidy up the console outputs

--
369556120 by Waymo Research:

    Add features for frame timestamp and rolling shutter

--
368078025 by Waymo Research:

    Enable motion metrics in get_bp_metrics_handler.cc.

--
365599653 by Waymo Research:

    Fix the matcher to work when there are a different number of latency
    vs submission detections

--
365491346 by Waymo Research:

    Relax the box comparison thresholds

--
PiperOrigin-RevId: 369731584
---
 waymo_open_dataset/latency/README.md          |   5 +
 ...compare_objects_file_to_submission_main.cc | 121 ++++++++++--------
 .../make_objects_file_from_latency_results.py |   1 +
 .../latency/run_latency_evaluation.sh         |   2 +-
 waymo_open_dataset/metrics/motion_metrics.cc  |   4 +-
 waymo_open_dataset/utils/frame_utils.py       |  18 +++
 6 files changed, 94 insertions(+), 57 deletions(-)

diff --git a/waymo_open_dataset/latency/README.md b/waymo_open_dataset/latency/README.md
index 5f355ea..6aac4d3 100644
--- a/waymo_open_dataset/latency/README.md
+++ b/waymo_open_dataset/latency/README.md
@@ -20,6 +20,7 @@ User-submitted models will take the form of a Python module named `wod_latency_submission`
 Converting from Frame protos to usable point clouds/images can be non-trivially expensive (involving various unzippings and transforms) and does not reflect a workflow that would realistically be present in an autonomous driving scenario. Thus, our evaluation of submitted models does not time the conversion from Frame proto to tensor. Instead, we have pre-extracted the dataset into numpy ndarrays. The keys, shapes, and data types are:
 
 * `POSE`: 4x4 float32 array with the vehicle pose.
+* `TIMESTAMP`: int64 scalar with the timestamp of the frame in microseconds.
 * For each lidar:
   * `<LIDAR_NAME>_RANGE_IMAGE_FIRST_RETURN`: HxWx6 float32 array with the range image of the first return for this lidar. The six channels are range, intensity, elongation, x, y, and z. The x, y, and z values are in vehicle frame. Pixels with range 0 are not valid points.
   * `<LIDAR_NAME>_RANGE_IMAGE_SECOND_RETURN`: HxWx6 float32 array with the range image of the second return for this lidar. Same channels as the first return range image.
@@ -34,6 +35,10 @@ Converting from Frame protos to usable point clouds/images can be non-trivially
   * `<CAMERA_NAME>_EXTRINSIC`: 4x4 float32 array with the extrinsic matrix for this camera.
   * `<CAMERA_NAME>_WIDTH`: int64 scalar with the width of this camera image.
   * `<CAMERA_NAME>_HEIGHT`: int64 scalar with the height of this camera image.
+  * `<CAMERA_NAME>_POSE`: 4x4 float32 array with the vehicle pose at the timestamp of this camera image.
+  * `<CAMERA_NAME>_POSE_TIMESTAMP`: float32 scalar with the timestamp in seconds for the image (i.e. the timestamp that `<CAMERA_NAME>_POSE` is valid at).
+  * `<CAMERA_NAME>_ROLLING_SHUTTER_DURATION`: float32 scalar with the duration of the rolling shutter in seconds. See the documentation for `CameraImage.shutter` in [dataset.proto](https://github.com/waymo-research/waymo-open-dataset/blob/eb7d74d1e11f40f5f8485ae8e0dc71f0944e8661/waymo_open_dataset/dataset.proto#L268-L283) for details.
+  * `<CAMERA_NAME>_ROLLING_SHUTTER_DIRECTION`: int64 scalar with the direction of the rolling shutter, expressed as the int value of a `CameraCalibration.RollingShutterReadOutDirection` enum.
 
 See the `LaserName.Name` and `CameraName.Name` enums in [dataset.proto](https://github.com/waymo-research/waymo-open-dataset/blob/eb7d74d1e11f40f5f8485ae8e0dc71f0944e8661/waymo_open_dataset/dataset.proto#L48-L69) for the valid lidar and camera name strings.
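As a quick orientation to the array format above, here is a minimal sketch of loading the arrays and relating the two timestamp conventions. It assumes a hypothetical layout of one `<KEY>.npy` file per key in a per-frame directory, which is illustrative rather than a documented contract; `FRONT` is one of the `CameraName.Name` values.

```python
import os
import numpy as np

def load_frame(frame_dir):
  # Hypothetical layout: each key above stored as '<KEY>.npy' in frame_dir.
  return {fname[:-4]: np.load(os.path.join(frame_dir, fname))
          for fname in os.listdir(frame_dir) if fname.endswith('.npy')}

data = load_frame('/path/to/frame_dir')
pose = data['POSE']  # 4x4 float32 vehicle pose.
# The frame timestamp is int64 microseconds; camera pose timestamps are
# float32 seconds, so convert before comparing the two.
frame_ts_s = int(data['TIMESTAMP']) * 1e-6
front_pose_offset_s = float(data['FRONT_POSE_TIMESTAMP']) - frame_ts_s
```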
diff --git a/waymo_open_dataset/latency/compare_objects_file_to_submission_main.cc b/waymo_open_dataset/latency/compare_objects_file_to_submission_main.cc
index 9e272ab..08035fb 100644
--- a/waymo_open_dataset/latency/compare_objects_file_to_submission_main.cc
+++ b/waymo_open_dataset/latency/compare_objects_file_to_submission_main.cc
@@ -33,12 +33,18 @@ limitations under the License.
 #include "waymo_open_dataset/protos/submission.pb.h"
 
 ABSL_FLAG(std::string, latency_result_filename, {},
-          "Comma separated list of sharded files that contains "
-          "car.open_dataset.Objects proto from the latency evaluation"
-          "scripts..");
+          "File that contains the car.open_dataset.Objects proto from the "
+          "latency evaluation scripts.");
 ABSL_FLAG(std::vector<std::string>, full_result_filenames, {},
           "Comma separated list of sharded files that contain "
           "car.open_dataset.Objects protos from user provided submissions.");
+ABSL_FLAG(double, iou_threshold, 0.9,
+          "IOU threshold to match detections between the latency evaluator "
+          "results and the user submission.");
+ABSL_FLAG(double, minimum_score, 0.0,
+          "Minimum score of detections to consider. Detections with scores "
+          "lower than this will not be checked for equivalence between the "
+          "submission proto and the latency evaluation script.");
 
 namespace waymo {
 namespace open_dataset {
@@ -48,15 +54,15 @@ namespace {
 // proto with ones from the objects file. Uses very high IOU thresholds since
 // the boxes should be nearly identical. If is_3d is true, this binary will do
 // 3D IOU matching; otherwise, it will do 2D axis-aligned IOU matching.
-Config GetConfig(bool is_3d) {
+Config GetConfig(bool is_3d, double iou_threshold) {
   Config config;
   config.set_matcher_type(MatcherProto::TYPE_HUNGARIAN);
-  config.add_iou_thresholds(0.9);
-  config.add_iou_thresholds(0.9);
-  config.add_iou_thresholds(0.9);
-  config.add_iou_thresholds(0.9);
-  config.add_iou_thresholds(0.9);
+  config.add_iou_thresholds(iou_threshold);
+  config.add_iou_thresholds(iou_threshold);
+  config.add_iou_thresholds(iou_threshold);
+  config.add_iou_thresholds(iou_threshold);
+  config.add_iou_thresholds(iou_threshold);
   if (is_3d) {
     config.set_box_type(Label::Box::TYPE_3D);
   } else {
@@ -77,8 +83,6 @@ Objects ReadObjectsFromFile(const std::vector<std::string>& paths) {
     const std::string content((std::istreambuf_iterator<char>(s)),
                               std::istreambuf_iterator<char>());
     if (!objs.ParseFromString(content)) {
-      LOG(ERROR) << "Could not parse " << path
-                 << " as Objects file. Trying as a Submission file.";
       Submission submission;
       if (!submission.ParseFromString(content)) {
         LOG(FATAL) << "Could not parse " << path << " as submission either.";
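For readers working from Python, the same matcher configuration can be expressed through the proto bindings. A minimal sketch, assuming the `metrics_pb2` and `label_pb2` modules from this repository; the 2D branch is inferred from the comment above and is not shown in this hunk:

```python
from waymo_open_dataset import label_pb2
from waymo_open_dataset.protos import metrics_pb2

def get_config(is_3d: bool, iou_threshold: float) -> metrics_pb2.Config:
  """Mirrors GetConfig() above with the Python proto bindings."""
  config = metrics_pb2.Config()
  config.matcher_type = metrics_pb2.MatcherProto.TYPE_HUNGARIAN
  # One threshold per object type, all set to the flag-controlled value.
  config.iou_thresholds.extend([iou_threshold] * 5)
  config.box_type = (label_pb2.Label.Box.TYPE_3D if is_3d
                     else label_pb2.Label.Box.TYPE_AA_2D)
  return config
```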
@@ -100,7 +104,8 @@ Objects ReadObjectsFromFile(const std::vector<std::string>& paths) {
 // dimensions, confidence score, and class name) are nearly identical.
 // Returns 0 if the two sets of results match and 1 otherwise.
 int Compute(const std::string& latency_result_filename,
-            const std::vector<std::string>& full_result_filename) {
+            const std::vector<std::string>& full_result_filename,
+            double iou_threshold, double minimum_score) {
   using KeyTuple = std::tuple<std::string, int64, int>;
   Objects latency_result_objs = ReadObjectsFromFile({latency_result_filename});
   Objects full_result_objs = ReadObjectsFromFile(full_result_filename);
@@ -120,20 +125,22 @@ int Compute(const std::string& latency_result_filename,
   bool is_3d;
   for (auto& o : *latency_result_objs.mutable_objects()) {
-    const KeyTuple key(o.context_name(), o.frame_timestamp_micros(),
-                       o.camera_name());
     is_3d = o.object().box().has_heading();
-    latency_result_map[key].push_back(std::move(o));
+    if (o.score() >= minimum_score) {
+      const KeyTuple key(o.context_name(), o.frame_timestamp_micros(),
+                         o.camera_name());
+      latency_result_map[key].push_back(std::move(o));
+    }
   }
   for (auto& o : *full_result_objs.mutable_objects()) {
-    const KeyTuple key(o.context_name(), o.frame_timestamp_micros(),
-                       o.camera_name());
-    full_result_map[key].push_back(std::move(o));
+    if (o.score() >= minimum_score) {
+      const KeyTuple key(o.context_name(), o.frame_timestamp_micros(),
+                         o.camera_name());
+      full_result_map[key].push_back(std::move(o));
+    }
   }
-  std::cout << latency_result_map.size() << " frames found.\n";
-
-  const Config config = GetConfig(is_3d);
+  const Config config = GetConfig(is_3d, iou_threshold);
   std::unique_ptr<Matcher> matcher = Matcher::Create(config);
 
   // This loop iterates over the key-value pairs in the latency result map
@@ -144,29 +151,34 @@ int Compute(const std::string& latency_result_filename,
     const auto& latency_results = kv.second;
     auto full_result_it = full_result_map.find(example_key);
     if (full_result_it == full_result_map.end()) {
-      std::cerr << print_key(example_key) << " not found in full results"
-                << std::endl;
+      LOG(FATAL) << print_key(example_key)
+                 << " in latency evaluator results but not in submission.";
       return 1;
     }
     const auto& full_results = full_result_it->second;
 
     const size_t num_detections = latency_results.size();
-    if (full_results.size() != num_detections) {
-      std::cerr << "Different number of detections found: " << num_detections
-                << " in latency results, " << full_results.size()
-                << " in full results for frame " << print_key(example_key)
-                << std::endl;
-      return 1;
+
+    // Keep track of the number of detections that do not match, starting by
+    // subtracting the number of detections in the latency results from the
+    // number of detections in the full results since that difference
+    // constitutes detections in the full results that cannot have a match in
+    // the latency results.
+    size_t unmatched_detections = 0;
+    if (full_results.size() > num_detections) {
+      unmatched_detections = full_results.size() - num_detections;
     }
 
     // Run the Hungarian matcher on the two sets of results from this frame.
     matcher->SetPredictions(latency_results);
     matcher->SetGroundTruths(full_results);
-    std::vector<int> subset(num_detections);
-    std::iota(subset.begin(), subset.end(), 0);
-    matcher->SetPredictionSubset(subset);
-    matcher->SetGroundTruthSubset(subset);
+    std::vector<int> pred_subset(num_detections);
+    std::iota(pred_subset.begin(), pred_subset.end(), 0);
+    matcher->SetPredictionSubset(pred_subset);
+    std::vector<int> gt_subset(full_results.size());
+    std::iota(gt_subset.begin(), gt_subset.end(), 0);
+    matcher->SetGroundTruthSubset(gt_subset);
     std::vector<int> matches;
     matcher->Match(&matches, nullptr);
@@ -175,37 +187,33 @@ int Compute(const std::string& latency_result_filename,
       const Object& latency_obj = latency_results[latency_ind];
       const int full_ind = matches[latency_ind];
       if (full_ind < 0) {
-        std::cerr << "No match found for object " << latency_ind
-                  << " for frame " << print_key(example_key) << std::endl;
-        return 1;
+        LOG(INFO) << "No match found for object " << latency_ind
+                  << " for frame " << print_key(example_key) << std::endl
+                  << latency_obj.DebugString();
+        ++unmatched_detections;
+        continue;
       }
       const Object& full_obj = full_results[full_ind];
-      if (std::abs(latency_obj.score() - full_obj.score()) > 1e-3 ||
-          std::abs(latency_obj.object().box().center_x() -
-                   full_obj.object().box().center_x()) > 1e-3 ||
-          std::abs(latency_obj.object().box().center_y() -
-                   full_obj.object().box().center_y()) > 1e-3 ||
-          std::abs(latency_obj.object().box().center_z() -
-                   full_obj.object().box().center_z()) > 1e-3 ||
-          std::abs(latency_obj.object().box().length() -
-                   full_obj.object().box().length()) > 1e-3 ||
-          std::abs(latency_obj.object().box().width() -
-                   full_obj.object().box().width()) > 1e-3 ||
-          std::abs(latency_obj.object().box().height() -
-                   full_obj.object().box().height()) > 1e-3 ||
-          std::abs(latency_obj.object().box().heading() -
-                   full_obj.object().box().heading()) > 1e-3 ||
-          latency_obj.object().type() != full_obj.object().type()) {
-        std::cerr << "Matched objects for frame " << print_key(example_key)
+      if (std::abs(latency_obj.score() - full_obj.score()) > 0.05) {
+        LOG(INFO) << "Matched objects for frame " << print_key(example_key)
                   << " are not identical: " << std::endl
                   << latency_obj.DebugString() << std::endl
                   << "vs" << std::endl << full_obj.DebugString();
-        return 1;
+        ++unmatched_detections;
       }
     }
 
+    if (unmatched_detections > 0.05 * num_detections) {
+      LOG(FATAL) << "Latency evaluator results did not match submission "
+                 << "proto for " << print_key(example_key) << std::endl
+                 << unmatched_detections << " detections out of "
+                 << num_detections << " did not match. This exceeds our "
+                 << "cut-off of 5% of detections being unmatched.";
+      return 1;
+    }
+
     std::cout << "Results matched for " << print_key(example_key) << std::endl;
   }
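Put plainly, the comparison now works like this: detections below `minimum_score` are dropped on both sides, surplus detections on either side start out unmatched, a matched pair whose scores differ by more than 0.05 also counts as unmatched, and the frame fails only when unmatched detections exceed 5% of the latency detections. A self-contained sketch of those rules; the index-order pairing is a stand-in for the Hungarian matcher on IOU that the binary actually uses:

```python
from dataclasses import dataclass

@dataclass
class Detection:
  score: float

def frame_matches(latency_dets, full_dets, minimum_score=0.0,
                  score_tolerance=0.05, unmatched_cutoff=0.05):
  """Returns True when two detection sets agree within tolerance."""
  latency_dets = [d for d in latency_dets if d.score >= minimum_score]
  full_dets = [d for d in full_dets if d.score >= minimum_score]
  # Detections without a counterpart on the other side can never match.
  unmatched = abs(len(full_dets) - len(latency_dets))
  for latency_det, full_det in zip(latency_dets, full_dets):
    if abs(latency_det.score - full_det.score) > score_tolerance:
      unmatched += 1
  return unmatched <= unmatched_cutoff * len(latency_dets)

# Two of 40 detections disagreeing on score sits exactly at the 5% cut-off.
assert frame_matches([Detection(0.5)] * 40,
                     [Detection(0.5)] * 38 + [Detection(0.9)] * 2)
```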
@@ -224,7 +232,10 @@ int main(int argc, char* argv[]) {
       absl::GetFlag(FLAGS_latency_result_filename);
   const std::vector<std::string> full_result_filenames =
       absl::GetFlag(FLAGS_full_result_filenames);
+  const double iou_threshold = absl::GetFlag(FLAGS_iou_threshold);
+  const double minimum_score = absl::GetFlag(FLAGS_minimum_score);
 
   return waymo::open_dataset::Compute(latency_result_filename,
-                                      full_result_filenames);
+                                      full_result_filenames, iou_threshold,
+                                      minimum_score);
 }
diff --git a/waymo_open_dataset/latency/make_objects_file_from_latency_results.py b/waymo_open_dataset/latency/make_objects_file_from_latency_results.py
index 9830721..6591929 100644
--- a/waymo_open_dataset/latency/make_objects_file_from_latency_results.py
+++ b/waymo_open_dataset/latency/make_objects_file_from_latency_results.py
@@ -148,5 +148,6 @@ def make_object_list_from_subdir(np_dir,
       objects.objects.extend(make_object_list_from_subdir(
           timestamp_dir, context_name, int(timestamp_micros)))
 
+  print('Got', len(objects.objects), 'objects')
   with open(args.output_file, 'wb') as f:
     f.write(objects.SerializeToString())
diff --git a/waymo_open_dataset/latency/run_latency_evaluation.sh b/waymo_open_dataset/latency/run_latency_evaluation.sh
index 3734ae8..5fe781d 100644
--- a/waymo_open_dataset/latency/run_latency_evaluation.sh
+++ b/waymo_open_dataset/latency/run_latency_evaluation.sh
@@ -38,7 +38,7 @@ docker run --rm \
 # submission.
 OBJS_FILE=$(mktemp)
 $MAKE_OBJS_CMD --results_dir $DETECTION_OUTPUT_DIR --output_file $OBJS_FILE
-$COMPARE_OBJS_TO_SUBMISSION_CMD $OBJS_FILE $SUBMISSION_PB
+$COMPARE_OBJS_TO_SUBMISSION_CMD --latency_result_filename $OBJS_FILE --full_result_filenames $SUBMISSION_PB
 
 # Clean up the outputs of the accuracy check.
 sudo rm -rf $DETECTION_OUTPUT_DIR $OBJS_FILE
diff --git a/waymo_open_dataset/metrics/motion_metrics.cc b/waymo_open_dataset/metrics/motion_metrics.cc
index e0b17ae..b93c880 100644
--- a/waymo_open_dataset/metrics/motion_metrics.cc
+++ b/waymo_open_dataset/metrics/motion_metrics.cc
@@ -140,7 +140,9 @@ Status ValidatePredictions(const MotionMetricsConfig& config,
                            const Scenario& scenario) {
   // Validate that the scenario IDs match.
   if (scenario_predictions.scenario_id() != scenario.scenario_id()) {
-    return InvalidArgumentError("Scenario IDs do not match.");
+    return InvalidArgumentError(
+        "Scenario IDs do not match: " + scenario_predictions.scenario_id() +
+        " vs. " + scenario.scenario_id());
   }
 
   // Validate the predictions trajectory lengths and construct a set of the
diff --git a/waymo_open_dataset/utils/frame_utils.py b/waymo_open_dataset/utils/frame_utils.py
index 181b06d..1276313 100644
--- a/waymo_open_dataset/utils/frame_utils.py
+++ b/waymo_open_dataset/utils/frame_utils.py
@@ -217,6 +217,7 @@ def convert_frame_to_dict(frame):
   The keys, shapes, and data types are:
     POSE: 4x4 float32 array
+    TIMESTAMP: int64 scalar
     For each lidar:
       <LASER_NAME>_BEAM_INCLINATION: H float32 array
@@ -233,6 +234,11 @@ def convert_frame_to_dict(frame):
       <CAMERA_NAME>_EXTRINSIC: 4x4 float32 array
       <CAMERA_NAME>_WIDTH: int64 scalar
       <CAMERA_NAME>_HEIGHT: int64 scalar
+      <CAMERA_NAME>_SDC_VELOCITY: 6 float32 array
+      <CAMERA_NAME>_POSE: 4x4 float32 array
+      <CAMERA_NAME>_POSE_TIMESTAMP: float32 scalar
+      <CAMERA_NAME>_ROLLING_SHUTTER_DURATION: float32 scalar
+      <CAMERA_NAME>_ROLLING_SHUTTER_DIRECTION: int64 scalar
 
   NOTE: This function only works in eager mode for now.
@@ -291,6 +297,15 @@ def convert_frame_to_dict(frame):
   for im in frame.images:
     cam_name_str = dataset_pb2.CameraName.Name.Name(im.name)
     data_dict[f'{cam_name_str}_IMAGE'] = tf.io.decode_jpeg(im.image).numpy()
+    data_dict[f'{cam_name_str}_SDC_VELOCITY'] = np.array([
+        im.velocity.v_x, im.velocity.v_y, im.velocity.v_z, im.velocity.w_x,
+        im.velocity.w_y, im.velocity.w_z
+    ], np.float32)
+    data_dict[f'{cam_name_str}_POSE'] = np.reshape(
+        np.array(im.pose.transform, np.float32), (4, 4))
+    data_dict[f'{cam_name_str}_POSE_TIMESTAMP'] = np.array(
+        im.pose_timestamp, np.float32)
+    data_dict[f'{cam_name_str}_ROLLING_SHUTTER_DURATION'] = np.array(im.shutter)
 
   # Save the intrinsics, 4x4 extrinsic matrix, width, and height of each camera.
   for c in frame.context.camera_calibrations:
@@ -300,6 +315,8 @@ def convert_frame_to_dict(frame):
         np.array(c.extrinsic.transform, np.float32), [4, 4])
     data_dict[f'{cam_name_str}_WIDTH'] = np.array(c.width)
     data_dict[f'{cam_name_str}_HEIGHT'] = np.array(c.height)
+    data_dict[f'{cam_name_str}_ROLLING_SHUTTER_DIRECTION'] = np.array(
+        c.rolling_shutter_direction)
 
   # Save the range image pixel pose for the top lidar.
   data_dict['TOP_RANGE_IMAGE_POSE'] = np.reshape(
@@ -308,5 +325,6 @@ def convert_frame_to_dict(frame):
   data_dict['POSE'] = np.reshape(
       np.array(frame.pose.transform, np.float32), (4, 4))
+  data_dict['TIMESTAMP'] = np.array(frame.timestamp_micros)
 
   return data_dict
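A short usage sketch for the new `convert_frame_to_dict` keys; the TFRecord path is a placeholder and `FRONT` is one of the `CameraName.Name` values:

```python
import tensorflow as tf
from waymo_open_dataset import dataset_pb2
from waymo_open_dataset.utils import frame_utils

# Read one frame from a segment file and convert it (eager mode only).
dataset = tf.data.TFRecordDataset('/path/to/segment.tfrecord',
                                  compression_type='')
for record in dataset.take(1):
  frame = dataset_pb2.Frame()
  frame.ParseFromString(record.numpy())
  data = frame_utils.convert_frame_to_dict(frame)
  print(data['TIMESTAMP'])         # int64 scalar, microseconds.
  print(data['FRONT_POSE'].shape)  # (4, 4) vehicle pose at image capture time.
  print(data['FRONT_ROLLING_SHUTTER_DIRECTION'])  # ReadOutDirection enum value.
```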