apache · suneet-s · Oct 11, 2020 · Sep 4, 2020 · Sep 8, 2020 · Sep 9, 2020
diff --git a/core/src/main/java/org/apache/druid/java/util/metrics/Monitor.java b/core/src/main/java/org/apache/druid/java/util/metrics/Monitor.java
@@ -29,5 +29,10 @@ public interface Monitor
 
   void stop();
 
+  /**
+   * Emit metrics using the given emitter.
+   *
+   * @return true if this monitor needs to continue monitoring. False otherwise.
+   */
   boolean monitor(ServiceEmitter emitter);
 }
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/TaskToolbox.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/TaskToolbox.java
@@ -43,7 +43,7 @@
 import org.apache.druid.indexing.common.task.IndexTaskClientFactory;
 import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexSupervisorTaskClient;
 import org.apache.druid.indexing.common.task.batch.parallel.ShuffleClient;
-import org.apache.druid.indexing.worker.IntermediaryDataManager;
+import org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager;
 import org.apache.druid.java.util.emitter.service.ServiceEmitter;
 import org.apache.druid.java.util.metrics.Monitor;
 import org.apache.druid.java.util.metrics.MonitorScheduler;

diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/common/TaskToolboxFactory.java b/indexing-service/src/main/java/org/apache/druid/indexing/common/TaskToolboxFactory.java
@@ -42,7 +42,7 @@
 import org.apache.druid.indexing.common.task.Task;
 import org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexSupervisorTaskClient;
 import org.apache.druid.indexing.common.task.batch.parallel.ShuffleClient;
-import org.apache.druid.indexing.worker.IntermediaryDataManager;
+import org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager;
 import org.apache.druid.java.util.emitter.service.ServiceEmitter;
 import org.apache.druid.java.util.metrics.MonitorScheduler;
 import org.apache.druid.query.QueryRunnerFactoryConglomerate;

diff --git a/...ava/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java b/...ava/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexSupervisorTask.java
@@ -58,6 +58,7 @@
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistribution;
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringDistributionMerger;
 import org.apache.druid.indexing.common.task.batch.parallel.distribution.StringSketchMerger;
+import org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager;
 import org.apache.druid.java.util.common.IAE;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.Pair;
@@ -486,7 +487,7 @@ private TaskStatus runSinglePhaseParallel(TaskToolbox toolbox) throws Exception
    * - In the first phase, each task partitions input data and stores those partitions in local storage.
    *   - The partition is created based on the segment granularity (primary partition key) and the partition dimension
    *     values in {@link PartitionsSpec} (secondary partition key).
-   *   - Partitioned data is maintained by {@link org.apache.druid.indexing.worker.IntermediaryDataManager}.
+   *   - Partitioned data is maintained by {@link IntermediaryDataManager}.
    * - In the second phase, each task reads partitioned data from the intermediary data server (middleManager
    *   or indexer) and merges them to create the final segments.
    */

diff --git a/.../org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java b/.../org/apache/druid/indexing/common/task/batch/parallel/PartialHashSegmentGenerateTask.java
@@ -29,6 +29,7 @@
 import org.apache.druid.indexing.common.task.TaskResource;
 import org.apache.druid.indexing.common.task.batch.parallel.iterator.DefaultIndexTaskInputRowIteratorBuilder;
 import org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis;
+import org.apache.druid.indexing.worker.shuffle.ShuffleDataSegmentPusher;
 import org.apache.druid.segment.indexing.granularity.GranularitySpec;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.partition.BucketNumberedShardSpec;
@@ -46,7 +47,7 @@
 /**
  * The worker task of {@link PartialHashSegmentGenerateParallelIndexTaskRunner}. This task partitions input data by
  * hashing the segment granularity and partition dimensions in {@link HashedPartitionsSpec}. Partitioned segments are
- * stored in local storage using {@link org.apache.druid.indexing.worker.ShuffleDataSegmentPusher}.
+ * stored in local storage using {@link ShuffleDataSegmentPusher}.
  */
 public class PartialHashSegmentGenerateTask extends PartialSegmentGenerateTask<GeneratedPartitionsMetadataReport>
 {

diff --git a/...org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java b/...org/apache/druid/indexing/common/task/batch/parallel/PartialRangeSegmentGenerateTask.java
@@ -31,7 +31,7 @@
 import org.apache.druid.indexing.common.task.TaskResource;
 import org.apache.druid.indexing.common.task.batch.parallel.iterator.RangePartitionIndexTaskInputRowIteratorBuilder;
 import org.apache.druid.indexing.common.task.batch.partition.RangePartitionAnalysis;
-import org.apache.druid.indexing.worker.ShuffleDataSegmentPusher;
+import org.apache.druid.indexing.worker.shuffle.ShuffleDataSegmentPusher;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.partition.BucketNumberedShardSpec;
 import org.apache.druid.timeline.partition.PartitionBoundaries;

diff --git a/...java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentGenerateTask.java b/...java/org/apache/druid/indexing/common/task/batch/parallel/PartialSegmentGenerateTask.java
@@ -31,7 +31,7 @@
 import org.apache.druid.indexing.common.task.SequenceNameFunction;
 import org.apache.druid.indexing.common.task.TaskResource;
 import org.apache.druid.indexing.common.task.batch.parallel.iterator.IndexTaskInputRowIteratorBuilder;
-import org.apache.druid.indexing.worker.ShuffleDataSegmentPusher;
+import org.apache.druid.indexing.worker.shuffle.ShuffleDataSegmentPusher;
 import org.apache.druid.query.DruidMetrics;
 import org.apache.druid.segment.indexing.DataSchema;
 import org.apache.druid.segment.indexing.RealtimeIOConfig;

diff --git a/...ice/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ShuffleClient.java b/...ice/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ShuffleClient.java
@@ -19,6 +19,8 @@
 
 package org.apache.druid.indexing.common.task.batch.parallel;
 
+import org.apache.druid.indexing.worker.shuffle.IntermediaryDataManager;
+
 import java.io.File;
 import java.io.IOException;
 
@@ -27,7 +29,7 @@
  * The only available implementation for production code is {@link HttpShuffleClient} and
  * this interface is more for easier testing.
  *
- * @see org.apache.druid.indexing.worker.IntermediaryDataManager
+ * @see IntermediaryDataManager
  * @see PartialSegmentMergeTask
  */
 public interface ShuffleClient

diff --git a/...exing/worker/IntermediaryDataManager.java → ...rker/shuffle/IntermediaryDataManager.java b/...exing/worker/IntermediaryDataManager.java → ...rker/shuffle/IntermediaryDataManager.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.druid.indexing.worker;
+package org.apache.druid.indexing.worker.shuffle;
 
 import com.google.common.collect.Iterators;
 import com.google.common.io.Files;
@@ -41,6 +41,7 @@
 import org.apache.druid.segment.loading.StorageLocation;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.utils.CompressionUtils;
+import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
 import org.joda.time.DateTime;
 import org.joda.time.Interval;
 import org.joda.time.Period;
@@ -67,7 +68,7 @@
 /**
  * This class manages intermediary segments for data shuffle between native parallel index tasks.
  * In native parallel indexing, phase 1 tasks store segment files in local storage of middleManagers (or indexer)
- * and phase 2 tasks read those files via HTTP.
+ * and phase 2 tasks read those files over HTTP.
  *
  * The directory where segment files are placed is structured as
  * {@link StorageLocation#path}/supervisorTaskId/startTimeOfSegment/endTimeOfSegment/bucketIdOfSegment.
@@ -100,7 +101,7 @@ public class IntermediaryDataManager
   // but middleManager or indexer could miss the request. This executor is to automatically clean up unused intermediary
   // partitions.
   // This can be null until IntermediaryDataManager is started.
-  @Nullable
+  @MonotonicNonNull
   private ScheduledExecutorService supervisorTaskChecker;
 
   @Inject

diff --git a/...xing/worker/ShuffleDataSegmentPusher.java → ...ker/shuffle/ShuffleDataSegmentPusher.java b/...xing/worker/ShuffleDataSegmentPusher.java → ...ker/shuffle/ShuffleDataSegmentPusher.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.druid.indexing.worker;
+package org.apache.druid.indexing.worker.shuffle;
 
 import org.apache.druid.segment.SegmentUtils;
 import org.apache.druid.segment.loading.DataSegmentPusher;

diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMetrics.java b/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMetrics.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.indexing.worker.shuffle;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.errorprone.annotations.concurrent.GuardedBy;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Shuffle metrics for middleManagers and indexers. This class is thread-safe because shuffle can be performed by
+ * multiple HTTP threads while a monitoring thread periodically emits the snapshot of metrics.
+ *
+ * @see ShuffleResource
+ * @see org.apache.druid.java.util.metrics.MonitorScheduler
+ */
+public class ShuffleMetrics
+{
+  /**
+   * This lock is used to synchronize accesses to the reference to {@link #datasourceMetrics} and the
+   * {@link PerDatasourceShuffleMetrics} values of the map. This means,
+   *
+   * - Any updates on PerDatasourceShuffleMetrics in the map (and thus its key) should be synchronized under this lock.
+   * - Any updates on the reference to datasourceMetrics should be synchronized under this lock.
+   */
+  private final Object lock = new Object();
+
+  /**
+   * A map of (supervisorTaskId) -> {@link PerDatasourceShuffleMetrics}. This map is replaced with an empty map
+   * whenever a snapshot is taken since the map can keep growing over time otherwise. For concurrent access pattern,
+   * see {@link #shuffleRequested} and {@link #snapshotAndReset()}.
+   */
+  @GuardedBy("lock")
+  private Map<String, PerDatasourceShuffleMetrics> datasourceMetrics = new HashMap<>();
+
+  /**
+   * This method is called whenever a new shuffle is requested. Multiple tasks can request shuffle at the same time,
+   * while the monitoring thread takes a snapshot of the metrics. There is a happens-before relationship between
+   * shuffleRequested and {@link #snapshotAndReset()}.
+   */
+  public void shuffleRequested(String supervisorTaskId, long fileLength)
+  {
+    synchronized (lock) {
+      datasourceMetrics.computeIfAbsent(supervisorTaskId, k -> new PerDatasourceShuffleMetrics())
+                       .accumulate(fileLength);
+    }
+  }
+
+  /**
+   * This method is called whenever the monitoring thread takes a snapshot of the current metrics. The
+   * {@link #datasourceMetrics} reference is replaced with a new empty map by this call, so the returned unmodifiable
+   * view holds only the metrics collected since the previous snapshot. There is a happens-before relationship
+   * between snapshotAndReset() and {@link #shuffleRequested}.
+   */
+  public Map<String, PerDatasourceShuffleMetrics> snapshotAndReset()
+  {
+    synchronized (lock) {
+      final Map<String, PerDatasourceShuffleMetrics> snapshot = Collections.unmodifiableMap(datasourceMetrics);
+      datasourceMetrics = new HashMap<>();
+      return snapshot;
+    }
+  }
+
+  /**
+   * This method is visible only for testing. Use {@link #snapshotAndReset()} instead to get the current snapshot.
+   */
+  @VisibleForTesting
+  Map<String, PerDatasourceShuffleMetrics> getDatasourceMetrics()
+  {
+    synchronized (lock) {
+      return datasourceMetrics;
+    }
+  }
+
+  /**
+   * Shuffle metrics accumulated for one supervisorTaskId (see {@link #shuffleRequested}). This class is not
+   * thread-safe and should never be accessed by multiple threads at the same time.
+   */
+  public static class PerDatasourceShuffleMetrics
+  {
+    private long shuffleBytes; // sum of the byte counts passed to accumulate()
+    private int shuffleRequests; // number of accumulate() calls
+
+    private void accumulate(long shuffleBytes) // called by shuffleRequested() while it holds the outer lock
+    {
+      this.shuffleBytes += shuffleBytes;
+      this.shuffleRequests++;
+    }
+
+    public long getShuffleBytes()
+    {
+      return shuffleBytes;
+    }
+
+    public int getShuffleRequests()
+    {
+      return shuffleRequests;
+    }
+  }
+}
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleModule.java b/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleModule.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.indexing.worker.shuffle;
+
+import com.google.inject.Binder;
+import com.google.inject.Module;
+import org.apache.druid.guice.Jerseys;
+import org.apache.druid.guice.LazySingleton;
+import org.apache.druid.server.metrics.MetricsModule;
+
+public class ShuffleModule implements Module // Guice module wiring the shuffle resource, metrics and monitor
+{
+  @Override
+  public void configure(Binder binder)
+  {
+    Jerseys.addResource(binder, ShuffleResource.class); // presumably exposes ShuffleResource over HTTP — confirm
+
+    binder.bind(ShuffleMetrics.class).in(LazySingleton.class); // injected into ShuffleResource and ShuffleMonitor
+    binder.bind(ShuffleMonitor.class).in(LazySingleton.class);
+    MetricsModule.register(binder, ShuffleMonitor.class); // presumably hands the monitor to the scheduler — confirm
+  }
+}
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMonitor.java b/indexing-service/src/main/java/org/apache/druid/indexing/worker/shuffle/ShuffleMonitor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.indexing.worker.shuffle;
+
+import com.google.inject.Inject;
+import org.apache.druid.indexing.worker.shuffle.ShuffleMetrics.PerDatasourceShuffleMetrics;
+import org.apache.druid.java.util.emitter.service.ServiceEmitter;
+import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
+import org.apache.druid.java.util.emitter.service.ServiceMetricEvent.Builder;
+import org.apache.druid.java.util.metrics.AbstractMonitor;
+
+import java.util.Map;
+
+public class ShuffleMonitor extends AbstractMonitor // emits a snapshot of ShuffleMetrics on each doMonitor call
+{
+  private static final String SUPERVISOR_TASK_ID_DIMENSION = "supervisorTaskId";
+  private static final String SHUFFLE_BYTES_KEY = "shuffle/bytes"; // metric for getShuffleBytes()
+  private static final String SHUFFLE_REQUESTS_KEY = "shuffle/requests"; // metric for getShuffleRequests()
+
+  private final ShuffleMetrics shuffleMetrics;
+
+  @Inject
+  public ShuffleMonitor(ShuffleMetrics shuffleMetrics)
+  {
+    this.shuffleMetrics = shuffleMetrics;
+  }
+
+  @Override
+  public boolean doMonitor(ServiceEmitter emitter)
+  {
+    final Map<String, PerDatasourceShuffleMetrics> snapshot = shuffleMetrics.snapshotAndReset(); // resets the metrics
+    snapshot.forEach((supervisorTaskId, perDatasourceShuffleMetrics) -> {
+      final Builder metricBuilder = ServiceMetricEvent
+          .builder()
+          .setDimension(SUPERVISOR_TASK_ID_DIMENSION, supervisorTaskId); // shared by both events built below
+      emitter.emit(metricBuilder.build(SHUFFLE_BYTES_KEY, perDatasourceShuffleMetrics.getShuffleBytes()));
+      emitter.emit(metricBuilder.build(SHUFFLE_REQUESTS_KEY, perDatasourceShuffleMetrics.getShuffleRequests()));
+    });
+
+    return true; // presumably "continue monitoring", as documented on Monitor#monitor — confirm in AbstractMonitor
+  }
+}
diff --git a/...indexing/worker/http/ShuffleResource.java → ...exing/worker/shuffle/ShuffleResource.java b/...indexing/worker/http/ShuffleResource.java → ...exing/worker/shuffle/ShuffleResource.java
@@ -17,12 +17,11 @@
  * under the License.
  */
 
-package org.apache.druid.indexing.worker.http;
+package org.apache.druid.indexing.worker.shuffle;
 
 import com.google.common.io.ByteStreams;
 import com.google.inject.Inject;
 import com.sun.jersey.spi.container.ResourceFilters;
-import org.apache.druid.indexing.worker.IntermediaryDataManager;
 import org.apache.druid.java.util.common.DateTimes;
 import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.java.util.common.logger.Logger;
@@ -60,11 +59,13 @@ public class ShuffleResource
   private static final Logger log = new Logger(ShuffleResource.class);
 
   private final IntermediaryDataManager intermediaryDataManager;
+  private final ShuffleMetrics shuffleMetrics;
 
   @Inject
-  public ShuffleResource(IntermediaryDataManager intermediaryDataManager)
+  public ShuffleResource(IntermediaryDataManager intermediaryDataManager, ShuffleMetrics shuffleMetrics)
   {
     this.intermediaryDataManager = intermediaryDataManager;
+    this.shuffleMetrics = shuffleMetrics;
   }
 
   @GET
@@ -96,6 +97,7 @@ public Response getPartition(
       );
       return Response.status(Status.NOT_FOUND).entity(errorMessage).build();
     } else {
+      shuffleMetrics.shuffleRequested(supervisorTaskId, partitionFile.length());
       return Response.ok(
           (StreamingOutput) output -> {
             try (final FileInputStream fileInputStream = new FileInputStream(partitionFile)) {