-
Notifications
You must be signed in to change notification settings - Fork 539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature: Add Row Level Result Treatment Options for Uniqueness and Completeness #532
Changes from all commits
6328869
62da8dd
2cdd9a5
e13f6ba
5f715ac
e5b7821
8e37b92
6375a32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
package com.amazon.deequ.analyzers | ||
|
||
import com.amazon.deequ.analyzers.Analyzers._ | ||
import com.amazon.deequ.analyzers.FilteredRow.FilteredRow | ||
import com.amazon.deequ.analyzers.NullBehavior.NullBehavior | ||
import com.amazon.deequ.analyzers.runners._ | ||
import com.amazon.deequ.metrics.DoubleMetric | ||
|
@@ -69,7 +70,7 @@ trait Analyzer[S <: State[_], +M <: Metric[_]] extends Serializable { | |
* @param data data frame | ||
* @return | ||
*/ | ||
def computeStateFrom(data: DataFrame): Option[S] | ||
def computeStateFrom(data: DataFrame, filterCondition: Option[String] = None): Option[S] | ||
|
||
/** | ||
* Compute the metric from the state (sufficient statistics) | ||
|
@@ -97,13 +98,14 @@ trait Analyzer[S <: State[_], +M <: Metric[_]] extends Serializable { | |
def calculate( | ||
data: DataFrame, | ||
aggregateWith: Option[StateLoader] = None, | ||
saveStatesWith: Option[StatePersister] = None) | ||
saveStatesWith: Option[StatePersister] = None, | ||
filterCondition: Option[String] = None) | ||
: M = { | ||
|
||
try { | ||
preconditions.foreach { condition => condition(data.schema) } | ||
|
||
val state = computeStateFrom(data) | ||
val state = computeStateFrom(data, filterCondition) | ||
|
||
calculateMetric(state, aggregateWith, saveStatesWith) | ||
} catch { | ||
|
@@ -170,7 +172,6 @@ trait Analyzer[S <: State[_], +M <: Metric[_]] extends Serializable { | |
private[deequ] def copyStateTo(source: StateLoader, target: StatePersister): Unit = { | ||
source.load[S](this).foreach { state => target.persist(this, state) } | ||
} | ||
|
||
} | ||
|
||
/** An analyzer that runs a set of aggregation functions over the data, | ||
|
@@ -184,7 +185,7 @@ trait ScanShareableAnalyzer[S <: State[_], +M <: Metric[_]] extends Analyzer[S, | |
private[deequ] def fromAggregationResult(result: Row, offset: Int): Option[S] | ||
|
||
/** Runs aggregation functions directly, without scan sharing */ | ||
override def computeStateFrom(data: DataFrame): Option[S] = { | ||
override def computeStateFrom(data: DataFrame, where: Option[String] = None): Option[S] = { | ||
val aggregations = aggregationFunctions() | ||
val result = data.agg(aggregations.head, aggregations.tail: _*).collect().head | ||
fromAggregationResult(result, 0) | ||
|
@@ -255,12 +256,18 @@ case class NumMatchesAndCount(numMatches: Long, count: Long, override val fullCo | |
} | ||
} | ||
|
||
case class AnalyzerOptions(nullBehavior: NullBehavior = NullBehavior.Ignore) | ||
case class AnalyzerOptions(nullBehavior: NullBehavior = NullBehavior.Ignore, | ||
filteredRow: FilteredRow = FilteredRow.TRUE) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about |
||
object NullBehavior extends Enumeration { | ||
type NullBehavior = Value | ||
val Ignore, EmptyString, Fail = Value | ||
} | ||
|
||
object FilteredRow extends Enumeration { | ||
type FilteredRow = Value | ||
val NULL, TRUE = Value | ||
} | ||
|
||
/** Base class for analyzers that compute ratios of matching predicates */ | ||
abstract class PredicateMatchingAnalyzer( | ||
name: String, | ||
|
@@ -490,6 +497,18 @@ private[deequ] object Analyzers { | |
conditionalSelectionFromColumns(selection, conditionColumn) | ||
} | ||
|
||
def conditionalSelectionFilteredFromColumns( | ||
selection: Column, | ||
conditionColumn: Option[Column], | ||
filterTreatment: String) | ||
: Column = { | ||
conditionColumn | ||
.map { condition => { | ||
when(not(condition), expr(filterTreatment)).when(condition, selection) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we can remove the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we delegate the |
||
} } | ||
.getOrElse(selection) | ||
} | ||
|
||
private[this] def conditionalSelectionFromColumns( | ||
selection: Column, | ||
conditionColumn: Option[Column]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,19 +20,21 @@ import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested} | |
import org.apache.spark.sql.functions.sum | ||
import org.apache.spark.sql.types.{IntegerType, StructType} | ||
import Analyzers._ | ||
import com.amazon.deequ.analyzers.FilteredRow.FilteredRow | ||
import com.google.common.annotations.VisibleForTesting | ||
import org.apache.spark.sql.DataFrame | ||
import org.apache.spark.sql.functions.col | ||
import org.apache.spark.sql.functions.expr | ||
import org.apache.spark.sql.{Column, Row} | ||
|
||
/** Completeness is the fraction of non-null values in a column of a DataFrame. */ | ||
case class Completeness(column: String, where: Option[String] = None) extends | ||
case class Completeness(column: String, where: Option[String] = None, | ||
analyzerOptions: Option[AnalyzerOptions] = None) extends | ||
StandardScanShareableAnalyzer[NumMatchesAndCount]("Completeness", column) with | ||
FilterableAnalyzer { | ||
|
||
override def fromAggregationResult(result: Row, offset: Int): Option[NumMatchesAndCount] = { | ||
|
||
ifNoNullsIn(result, offset, howMany = 2) { _ => | ||
NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1), Some(criterion)) | ||
NumMatchesAndCount(result.getLong(offset), result.getLong(offset + 1), Some(rowLevelResults)) | ||
} | ||
} | ||
|
||
|
@@ -51,4 +53,16 @@ case class Completeness(column: String, where: Option[String] = None) extends | |
|
||
@VisibleForTesting // required by some tests that compare analyzer results to an expected state | ||
private[deequ] def criterion: Column = conditionalSelection(column, where).isNotNull | ||
|
||
@VisibleForTesting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this annotation? The method is accessible to classes in |
||
private[deequ] def rowLevelResults: Column = { | ||
val whereCondition = where.map { expression => expr(expression)} | ||
conditionalSelectionFilteredFromColumns(col(column).isNotNull, whereCondition, getRowLevelFilterTreatment.toString) | ||
} | ||
|
||
private def getRowLevelFilterTreatment: FilteredRow = { | ||
analyzerOptions | ||
.map { options => options.filteredRow } | ||
.getOrElse(FilteredRow.TRUE) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,15 +32,17 @@ import org.apache.spark.sql.functions.count | |
import org.apache.spark.sql.functions.expr | ||
import org.apache.spark.sql.functions.lit | ||
import org.apache.spark.sql.types.StructType | ||
import org.apache.spark.sql.functions.when | ||
|
||
/** Base class for all analyzers that operate on the frequencies of groups in the data */ | ||
abstract class FrequencyBasedAnalyzer(columnsToGroupOn: Seq[String]) | ||
extends GroupingAnalyzer[FrequenciesAndNumRows, DoubleMetric] { | ||
|
||
override def groupingColumns(): Seq[String] = { columnsToGroupOn } | ||
|
||
override def computeStateFrom(data: DataFrame): Option[FrequenciesAndNumRows] = { | ||
Some(FrequencyBasedAnalyzer.computeFrequencies(data, groupingColumns())) | ||
override def computeStateFrom(data: DataFrame, | ||
filterCondition: Option[String] = None): Option[FrequenciesAndNumRows] = { | ||
Some(FrequencyBasedAnalyzer.computeFrequencies(data, groupingColumns(), filterCondition)) | ||
} | ||
|
||
/** We need at least one grouping column, and all specified columns must exist */ | ||
|
@@ -88,7 +90,15 @@ object FrequencyBasedAnalyzer { | |
.count() | ||
|
||
// Set rows with value count 1 to true, and otherwise false | ||
val fullColumn: Column = count(UNIQUENESS_ID).over(Window.partitionBy(columnsToGroupBy: _*)) | ||
val fullColumn: Column = { | ||
val window = Window.partitionBy(columnsToGroupBy: _*) | ||
where.map { | ||
condition => { | ||
count(when(expr(condition), UNIQUENESS_ID)).over(window) | ||
} | ||
Comment on lines
+96
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we can remove the brackets after `condition =>` |
||
}.getOrElse(count(UNIQUENESS_ID).over(window)) | ||
} | ||
|
||
FrequenciesAndNumRows(frequencies, numRows, Option(fullColumn)) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,13 +17,17 @@ | |
package com.amazon.deequ.analyzers | ||
|
||
import com.amazon.deequ.analyzers.Analyzers.COUNT_COL | ||
import com.amazon.deequ.analyzers.FilteredRow.FilteredRow | ||
import com.amazon.deequ.metrics.DoubleMetric | ||
import org.apache.spark.sql.functions.expr | ||
import org.apache.spark.sql.functions.not | ||
import org.apache.spark.sql.functions.when | ||
import org.apache.spark.sql.{Column, Row} | ||
import org.apache.spark.sql.functions.{col, count, lit, sum} | ||
import org.apache.spark.sql.types.DoubleType | ||
|
||
case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) | ||
case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None, | ||
analyzerOptions: Option[AnalyzerOptions] = None) | ||
extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns) | ||
with FilterableAnalyzer { | ||
|
||
|
@@ -34,11 +38,27 @@ case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None) | |
override def fromAggregationResult(result: Row, offset: Int, fullColumn: Option[Column] = None): DoubleMetric = { | ||
val numUniqueValues = result.getDouble(offset) | ||
val numDistinctValues = result.getLong(offset + 1).toDouble | ||
val fullColumnUniqueness = when((fullColumn.getOrElse(null)).equalTo(1), true).otherwise(false) | ||
toSuccessMetric(numUniqueValues / numDistinctValues, Option(fullColumnUniqueness)) | ||
val conditionColumn = where.map { expression => expr(expression) } | ||
val fullColumnUniqueness = fullColumn.map { | ||
rowLevelColumn => { | ||
conditionColumn.map { | ||
condition => { | ||
when(not(condition), expr(getRowLevelFilterTreatment.toString)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment for |
||
.when(rowLevelColumn.equalTo(1), true).otherwise(false) | ||
} | ||
}.getOrElse(when(rowLevelColumn.equalTo(1), true).otherwise(false)) | ||
} | ||
} | ||
toSuccessMetric(numUniqueValues / numDistinctValues, fullColumnUniqueness) | ||
} | ||
|
||
override def filterCondition: Option[String] = where | ||
|
||
private def getRowLevelFilterTreatment: FilteredRow = { | ||
analyzerOptions | ||
.map { options => options.filteredRow } | ||
.getOrElse(FilteredRow.TRUE) | ||
} | ||
Comment on lines
+57
to
+61
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: This is repeated in a few places, so could go into the base class. |
||
} | ||
|
||
object UniqueValueRatio { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: excess