Exposing Helpful Anomaly Detection Metadata from Anomaly Strategies (i.e. Anomaly Check Range/Thresholds) through a Backwards-Compatible Function #593

Open: wants to merge 25 commits into base: master.

Commits (25)
5338fe4  added more tests to the anomaly detection with extended results changes (arsenalgunnershubert777, Sep 21, 2024)
6040cef  Add Spark 3.5 support (#514) (jhchee, Feb 8, 2024)
02ed720  fix merge conflicts (arsenalgunnershubert777, Nov 1, 2024)
a8780b7  Feature: Add Row Level Result Treatment Options for Uniqueness and Co… (eycho-am, Feb 15, 2024)
185ce01  Skip SparkTableMetricsRepositoryTest iceberg test when SupportsRowLev… (eycho-am, Feb 21, 2024)
e48f97a  Feature: Add Row Level Result Treatment Options for Miminum and Maxim… (eycho-am, Feb 21, 2024)
efeec97  Add analyzerOption to add filteredRowOutcome for isPrimaryKey Check (… (eycho-am, Feb 23, 2024)
9fa5096  Fix bug in MinLength and MaxLength analyzers where given the NullBeha… (eycho-am, Feb 26, 2024)
c7fa635  [Min/Max] Apply filtered row behavior at the row level evaluation (#543) (rdsharma26, Mar 8, 2024)
abb54bc  [MinLength/MaxLength] Apply filtered row behavior at the row level ev… (rdsharma26, Mar 10, 2024)
c2e862f  fix merge conflicts (arsenalgunnershubert777, Nov 1, 2024)
6538ef3  Fix for satisfies row level results bug (#553) (rdsharma26, Apr 3, 2024)
b69f2b8  New analyzer, RatioOfSums (#552) (scott-gunn, Apr 11, 2024)
efd33f0  Column Count Analyzer and Check (#555) (mentekid, Apr 15, 2024)
a4a8aa6  Update breeze to match spark 3.5 breeze version (#545) (zeotuan, Apr 17, 2024)
572d776  Configurable RetainCompletenessRule (#564) (zeotuan, May 6, 2024)
dc9ba7e  Optional specification of instance name in CustomSQL analyzer metric.… (tylermcdaniel0, May 24, 2024)
2a02afe  Adding Wilson Score Confidence Interval Strategy (#567) (zeotuan, May 24, 2024)
ee26d1c  CustomAggregator (#572) (joshuazexter, Jul 31, 2024)
97f7a3e  fix typo (#574) (bojackli, Aug 29, 2024)
9d92d94  Fix performance of building row-level results (#577) (marcantony, Aug 31, 2024)
0f81982  fix merge conflicts (arsenalgunnershubert777, Nov 1, 2024)
fdebce5  updating anomaly check bounds to not have defaults and require inputs… (arsenalgunnershubert777, Nov 4, 2024)
5da25c4  add accidentally removed import (arsenalgunnershubert777, Nov 4, 2024)
198a41f  update readme to be more clear about the anomalyMetricValue (arsenalgunnershubert777, Nov 8, 2024)
46 changes: 45 additions & 1 deletion src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala
@@ -16,7 +16,7 @@

package com.amazon.deequ

-import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy
+import com.amazon.deequ.anomalydetection.{AnomalyDetectionStrategy, AnomalyDetectionStrategyWithExtendedResults}
import com.amazon.deequ.analyzers.Analyzer
import com.amazon.deequ.analyzers.{State, _}
import com.amazon.deequ.checks.{Check, CheckLevel}
@@ -240,6 +240,24 @@ class VerificationRunBuilderWithRepository(
anomalyDetectionStrategy, analyzer, anomalyCheckConfigOrDefault)
this
}

def addAnomalyCheckWithExtendedResults[S <: State[S]](
anomalyDetectionStrategy: AnomalyDetectionStrategyWithExtendedResults,
analyzer: Analyzer[S, Metric[Double]],
anomalyCheckConfig: Option[AnomalyCheckConfig] = None)
: this.type = {

val anomalyCheckConfigOrDefault = anomalyCheckConfig.getOrElse {

val checkDescription = s"Anomaly check for ${analyzer.toString}"

AnomalyCheckConfig(CheckLevel.Warning, checkDescription)
}

checks :+= VerificationRunBuilderHelper.getAnomalyCheckWithExtendedResults(
metricsRepository.get, anomalyDetectionStrategy, analyzer, anomalyCheckConfigOrDefault)
this
}
}
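For callers, the new builder method mirrors `addAnomalyCheck`. A minimal usage sketch (the DataFrame `df`, the in-memory repository, and the result key are illustrative, not part of this diff):

```scala
import com.amazon.deequ.VerificationSuite
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy
import com.amazon.deequ.repository.ResultKey
import com.amazon.deequ.repository.memory.InMemoryMetricsRepository

val repository = new InMemoryMetricsRepository()

// Same call shape as addAnomalyCheck, but the run result will also carry
// the extended anomaly metadata (check range, metric value, anomaly verdict).
val result = VerificationSuite()
  .onData(df) // df: an illustrative DataFrame
  .useRepository(repository)
  .saveOrAppendResult(ResultKey(System.currentTimeMillis()))
  .addAnomalyCheckWithExtendedResults(
    RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
    Size())
  .run()
```

Because the existing `addAnomalyCheck` path is untouched, current callers keep compiling unchanged, which is the backwards-compatibility claim in the title.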

class VerificationRunBuilderWithSparkSession(
@@ -315,6 +333,32 @@ private[this] object VerificationRunBuilderHelper {
anomalyCheckConfig.beforeDate
)
}

/**
* Build a check using the anomaly detection with extended results methods
*
* @param metricsRepository A metrics repository to get the previous results
* @param anomalyDetectionStrategyWithExtendedResults The anomaly detection strategy with extended results
* @param analyzer The analyzer for the metric to run anomaly detection on
* @param anomalyCheckConfig Configuration settings for the Check
*/
def getAnomalyCheckWithExtendedResults[S <: State[S]](
metricsRepository: MetricsRepository,
anomalyDetectionStrategyWithExtendedResults: AnomalyDetectionStrategyWithExtendedResults,
analyzer: Analyzer[S, Metric[Double]],
anomalyCheckConfig: AnomalyCheckConfig)
: Check = {

Check(anomalyCheckConfig.level, anomalyCheckConfig.description)
.isNewestPointNonAnomalousWithExtendedResults(
metricsRepository,
anomalyDetectionStrategyWithExtendedResults,
analyzer,
anomalyCheckConfig.withTagValues,
anomalyCheckConfig.afterDate,
anomalyCheckConfig.beforeDate
)
}
}

Applicability.scala
@@ -21,7 +21,8 @@ import java.util.Calendar

import com.amazon.deequ.analyzers.{Analyzer, State}
import com.amazon.deequ.checks.Check
-import com.amazon.deequ.constraints.{AnalysisBasedConstraint, Constraint, ConstraintDecorator}
+import com.amazon.deequ.constraints.{AnalysisBasedConstraint, AnomalyExtendedResultsConstraint,
+  Constraint, ConstraintDecorator}
import com.amazon.deequ.metrics.Metric
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
@@ -187,9 +188,13 @@ private[deequ] class Applicability(session: SparkSession) {
case (name, nc: ConstraintDecorator) => name -> nc.inner
case (name, c: Constraint) => name -> c
}
-      .collect { case (name, constraint: AnalysisBasedConstraint[_, _, _]) =>
-        val metric = constraint.analyzer.calculate(data).value
-        name -> metric
+      .collect {
+        case (name, constraint: AnalysisBasedConstraint[_, _, _]) =>
+          val metric = constraint.analyzer.calculate(data).value
+          name -> metric
+        case (name, constraint: AnomalyExtendedResultsConstraint[_, _, _]) =>
+          val metric = constraint.analyzer.calculate(data).value
+          name -> metric
}

val constraintApplicabilities = check.constraints.zip(namedMetrics).map {
AnomalyDetectionStrategy.scala
@@ -30,3 +30,17 @@ trait AnomalyDetectionStrategy {
dataSeries: Vector[Double],
searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)]
}

trait AnomalyDetectionStrategyWithExtendedResults {

/**
* Search for anomalies in a series of data points, returning extended results.
*
* @param dataSeries The data contained in a Vector of Doubles
* @param searchInterval The indices between which anomalies should be detected. [a, b).
* @return The indices of all data points with their corresponding anomaly extended results wrapper
* object.
*/
def detectWithExtendedResults(
dataSeries: Vector[Double],
searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, AnomalyDetectionDataPoint)]
}
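A custom strategy only has to implement this one method. A minimal sketch (FixedThresholdStrategy, its threshold, and its detail message are illustrative and not part of this PR; it assumes the `AnomalyDetectionDataPoint`, `BoundedRange`, and `Bound` shapes used later in this diff):

```scala
// Illustrative only: flags every value above a caller-supplied threshold.
case class FixedThresholdStrategy(threshold: Double)
  extends AnomalyDetectionStrategyWithExtendedResults {

  override def detectWithExtendedResults(
      dataSeries: Vector[Double],
      searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, AnomalyDetectionDataPoint)] = {

    val (start, end) = searchInterval
    dataSeries.zipWithIndex
      .slice(start, end)
      .map { case (value, index) =>
        val isAnomaly = value > threshold
        val detail =
          if (isAnomaly) Some(s"Value $value is above the threshold $threshold.") else None
        // Every point is returned, anomalous or not, together with its check range.
        (index, AnomalyDetectionDataPoint(value, value,
          BoundedRange(
            lowerBound = Bound(Double.MinValue, inclusive = true),
            upperBound = Bound(threshold, inclusive = true)),
          isAnomaly, 1.0, detail))
      }
  }
}
```

The built-in strategies gain this method through `BaseChangeStrategy` and `BatchNormalStrategy` below, so only external strategies need an implementation like this.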
AnomalyDetector.scala
@@ -56,12 +56,8 @@ case class AnomalyDetector(strategy: AnomalyDetectionStrategy) {

val allDataPoints = sortedDataPoints :+ newPoint

-    // Run anomaly
-    val anomalies = detectAnomaliesInHistory(allDataPoints, (newPoint.time, Long.MaxValue))
-      .anomalies
-
-    // Create a Detection result with all anomalies
-    DetectionResult(anomalies)
+    // Run anomaly detection and create a DetectionResult with all anomalies
+    detectAnomaliesInHistory(allDataPoints, (newPoint.time, Long.MaxValue))
}

@@ -100,3 +96,86 @@ case class AnomalyDetector(strategy: AnomalyDetectionStrategy) {
DetectionResult(anomalies.map { case (index, anomaly) => (sortedTimestamps(index), anomaly) })
}
}

case class AnomalyDetectorWithExtendedResults(strategy: AnomalyDetectionStrategyWithExtendedResults) {

/**
* Given a sequence of metrics and a current value, detects if there is an anomaly by using the
* given algorithm and returns extended results.
*
* @param historicalDataPoints Sequence of tuples (Points in time with corresponding Metric).
* @param newPoint The new data point to check for anomalies.
* @return ExtendedDetectionResult containing all data points with their anomaly detection metadata.
*/
def isNewPointAnomalousWithExtendedResults(
historicalDataPoints: Seq[DataPoint[Double]],
newPoint: DataPoint[Double])
: ExtendedDetectionResult = {

require(historicalDataPoints.nonEmpty, "historicalDataPoints must not be empty!")

val sortedDataPoints = historicalDataPoints.sortBy(_.time)

val firstDataPointTime = sortedDataPoints.head.time
val lastDataPointTime = sortedDataPoints.last.time

val newPointTime = newPoint.time

require(lastDataPointTime < newPointTime,
s"Can't decide which range to use for anomaly detection. New data point with time " +
s"$newPointTime is in history range ($firstDataPointTime - $lastDataPointTime)!")

val allDataPoints = sortedDataPoints :+ newPoint

// Run anomaly and create an Extended Detection result with all data points and anomaly details
detectAnomaliesInHistoryWithExtendedResults(allDataPoints, (newPoint.time, Long.MaxValue))
}


/**
* Given a strategy, detects anomalies in a time series after some preprocessing
* and returns extended results.
*
* @param dataSeries Sequence of tuples (Points in time with corresponding value).
* @param searchInterval The interval in which anomalies should be detected. [a, b).
* @return A wrapper object, containing all data points with anomaly extended results.
*/
def detectAnomaliesInHistoryWithExtendedResults(
dataSeries: Seq[DataPoint[Double]],
searchInterval: (Long, Long) = (Long.MinValue, Long.MaxValue))
: ExtendedDetectionResult = {

def findIndexForBound(sortedTimestamps: Seq[Long], boundValue: Long): Int = {
sortedTimestamps.search(boundValue).insertionPoint
}

val (searchStart, searchEnd) = searchInterval

require(searchStart <= searchEnd,
"The first interval element has to be smaller or equal to the last.")

// Remove missing values and sort series by time
val removedMissingValues = dataSeries.filter {
_.metricValue.isDefined
}
val sortedSeries = removedMissingValues.sortBy {
_.time
}
val sortedTimestamps = sortedSeries.map {
_.time
}

// Find indices of lower and upper bound
val lowerBoundIndex = findIndexForBound(sortedTimestamps, searchStart)
val upperBoundIndex = findIndexForBound(sortedTimestamps, searchEnd)

val anomalies = strategy.detectWithExtendedResults(
sortedSeries.flatMap {
_.metricValue
}.toVector, (lowerBoundIndex, upperBoundIndex))

ExtendedDetectionResult(anomalies.map { case (index, anomaly) => (sortedTimestamps(index), anomaly) })
}

}
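A quick sketch of calling the new detector directly (timestamps and values are made up; it assumes `AbsoluteChangeStrategy` picks up `detectWithExtendedResults` through `BaseChangeStrategy`, as shown in the next file):

```scala
import com.amazon.deequ.anomalydetection.{
  AbsoluteChangeStrategy, AnomalyDetectorWithExtendedResults, DataPoint, ExtendedDetectionResult}

val history = (1L to 4L).map(t => DataPoint[Double](t, Some(1.0)))
val newPoint = DataPoint[Double](5L, Some(10.0))

// Unlike isNewPointAnomalous, the extended result keeps every evaluated point,
// including its check range and metric value, not only the anomalous ones.
val result: ExtendedDetectionResult =
  AnomalyDetectorWithExtendedResults(AbsoluteChangeStrategy(maxRateIncrease = Some(2.0)))
    .isNewPointAnomalousWithExtendedResults(history, newPoint)
```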
BaseChangeStrategy.scala
@@ -27,7 +27,7 @@ import breeze.linalg.DenseVector
* Set to 1 it calculates the difference between two consecutive values.
*/
trait BaseChangeStrategy
-  extends AnomalyDetectionStrategy {
+  extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults {

def maxRateDecrease: Option[Double]
def maxRateIncrease: Option[Double]
@@ -67,7 +67,8 @@ trait BaseChangeStrategy
}

/**
- * Search for anomalies in a series of data points.
+ * Search for anomalies in a series of data points. This function uses the
+ * detectWithExtendedResults function and then filters and maps to return only anomaly data point objects.
*
* If there aren't enough data points preceding the searchInterval,
* it may happen that the interval's first elements (depending on the specified order)
@@ -81,6 +82,30 @@
dataSeries: Vector[Double],
searchInterval: (Int, Int))
: Seq[(Int, Anomaly)] = {

detectWithExtendedResults(dataSeries, searchInterval)
.filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly }
.map { case (i, anomDataPoint) =>
(i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail))
}
}

/**
* Search for anomalies in a series of data points, returning extended results.
*
* If there aren't enough data points preceding the searchInterval,
* it may happen that the interval's first elements (depending on the specified order)
* can't be flagged as anomalies.
*
* @param dataSeries The data contained in a Vector of Doubles
* @param searchInterval The indices between which anomalies should be detected. [a, b).
* @return The indices of all anomalies in the interval and their corresponding wrapper object
* with extended results.
*/
override def detectWithExtendedResults(
dataSeries: Vector[Double],
searchInterval: (Int, Int))
: Seq[(Int, AnomalyDetectionDataPoint)] = {
val (start, end) = searchInterval

require(start <= end,
@@ -89,15 +114,25 @@
val startPoint = Seq(start - order, 0).max
val data = diff(DenseVector(dataSeries.slice(startPoint, end): _*), order).data

-    data.zipWithIndex.filter { case (value, _) =>
-      (value < maxRateDecrease.getOrElse(Double.MinValue)
-        || value > maxRateIncrease.getOrElse(Double.MaxValue))
-    }
-      .map { case (change, index) =>
-        (index + startPoint + order, Anomaly(Option(dataSeries(index + startPoint + order)), 1.0,
-          Some(s"[AbsoluteChangeStrategy]: Change of $change is not in bounds [" +
-            s"${maxRateDecrease.getOrElse(Double.MinValue)}, " +
-            s"${maxRateIncrease.getOrElse(Double.MaxValue)}]. Order=$order")))
+    val lowerBound = maxRateDecrease.getOrElse(Double.MinValue)
+    val upperBound = maxRateIncrease.getOrElse(Double.MaxValue)
+
+    data.zipWithIndex.map {
+      case (change, index) =>
+        val outputSequenceIndex = index + startPoint + order
+        val value = dataSeries(outputSequenceIndex)
+        val (detail, isAnomaly) = if (change < lowerBound || change > upperBound) {
+          (Some(s"[AbsoluteChangeStrategy]: Change of $change is not in bounds [" +
+            s"$lowerBound, " +
+            s"$upperBound]. Order=$order"), true)
+        } else {
+          (None, false)
+        }
+        (outputSequenceIndex, AnomalyDetectionDataPoint(value, change,
+          BoundedRange(lowerBound = Bound(lowerBound, inclusive = true),
+            upperBound = Bound(upperBound, inclusive = true)), isAnomaly, 1.0, detail))
    }
}
}
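A small worked example of the two entry points on a change-based strategy (numbers illustrative; `AbsoluteChangeStrategy` is one of the existing subclasses of `BaseChangeStrategy`):

```scala
import com.amazon.deequ.anomalydetection.AbsoluteChangeStrategy

val strategy = AbsoluteChangeStrategy(maxRateDecrease = Some(-2.0), maxRateIncrease = Some(2.0))
val series = Vector(1.0, 2.0, 3.0, 10.0)

// With order = 1 the strategy scores consecutive differences: 1.0, 1.0 and 7.0.
val extended = strategy.detectWithExtendedResults(series, (0, series.length))
// Entries for indices 1, 2 and 3; only index 3 has isAnomaly = true,
// because its change of 7.0 falls outside [-2.0, 2.0].

val anomaliesOnly = strategy.detect(series, (0, series.length))
// Contains only index 3, matching the pre-existing behaviour.
```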
BatchNormalStrategy.scala
@@ -33,7 +33,9 @@ import breeze.stats.meanAndVariance
case class BatchNormalStrategy(
lowerDeviationFactor: Option[Double] = Some(3.0),
upperDeviationFactor: Option[Double] = Some(3.0),
-    includeInterval: Boolean = false) extends AnomalyDetectionStrategy {
+    includeInterval: Boolean = false)
+  extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults
+{

require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined,
"At least one factor has to be specified.")
Expand All @@ -43,7 +45,8 @@ case class BatchNormalStrategy(


/**
- * Search for anomalies in a series of data points.
+ * Search for anomalies in a series of data points. This function uses the
+ * detectWithExtendedResults function and then filters and maps to return only anomaly objects.
*
* @param dataSeries The data contained in a Vector of Doubles
* @param searchInterval The indices between which anomalies should be detected. [a, b).
@@ -53,6 +56,25 @@
dataSeries: Vector[Double],
searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = {

detectWithExtendedResults(dataSeries, searchInterval)
.filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly }
.map { case (i, anomDataPoint) =>
(i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail))
}
}

/**
* Search for anomalies in a series of data points, returning extended results.
*
* @param dataSeries The data contained in a Vector of Doubles
* @param searchInterval The indices between which anomalies should be detected. [a, b).
* @return The indices of all anomalies in the interval and their corresponding wrapper object
* with extended results.
*/
override def detectWithExtendedResults(
dataSeries: Vector[Double],
searchInterval: (Int, Int)): Seq[(Int, AnomalyDetectionDataPoint)] = {

val (searchStart, searchEnd) = searchInterval

require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.")
@@ -83,13 +105,18 @@ case class BatchNormalStrategy(

dataSeries.zipWithIndex
.slice(searchStart, searchEnd)
-      .filter { case (value, _) => value > upperBound || value < lowerBound }
       .map { case (value, index) =>
-
-        val detail = Some(s"[BatchNormalStrategy]: Value $value is not in " +
-          s"bounds [$lowerBound, $upperBound].")
-
-        (index, Anomaly(Option(value), 1.0, detail))
+        val (detail, isAnomaly) = if (value > upperBound || value < lowerBound) {
+          (Some(s"[BatchNormalStrategy]: Value $value is not in " +
+            s"bounds [$lowerBound, $upperBound]."), true)
+        } else {
+          (None, false)
+        }
+        (index, AnomalyDetectionDataPoint(value, value,
+          BoundedRange(lowerBound = Bound(lowerBound, inclusive = true),
+            upperBound = Bound(upperBound, inclusive = true)), isAnomaly, 1.0, detail))
}
}


}
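A usage sketch for the batch-normal variant (numbers illustrative; as in the existing strategy, the mean and deviation bounds are estimated from the points outside the search interval):

```scala
import com.amazon.deequ.anomalydetection.BatchNormalStrategy

val strategy = BatchNormalStrategy(
  lowerDeviationFactor = Some(1.0),
  upperDeviationFactor = Some(1.0))

val series = Vector(1.0, 1.2, 0.8, 1.1, 0.9, 9.0)

// Train on indices 0 to 4, score only the last point.
val results = strategy.detectWithExtendedResults(series, (5, 6))
// One AnomalyDetectionDataPoint for index 5: value 9.0, the computed
// BoundedRange around the training mean, and isAnomaly = true.
```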