-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dataframe join_on method #5210
Dataframe join_on method #5210
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -363,6 +363,55 @@ impl DataFrame { | |||||
Ok(DataFrame::new(self.session_state, plan)) | ||||||
} | ||||||
|
||||||
/// Join this DataFrame with another DataFrame using the specified expressions. | ||||||
/// | ||||||
/// Simply a thin wrapper over [`join`](Self::join) where the join keys are not provided, | ||||||
/// and the provided expressions are AND'ed together to form the filter expression. | ||||||
/// | ||||||
/// ``` | ||||||
/// # use datafusion::prelude::*; | ||||||
/// # use datafusion::error::Result; | ||||||
/// # #[tokio::main] | ||||||
/// # async fn main() -> Result<()> { | ||||||
/// let ctx = SessionContext::new(); | ||||||
/// let left = ctx | ||||||
/// .read_csv("tests/data/example.csv", CsvReadOptions::new()) | ||||||
/// .await?; | ||||||
/// let right = ctx | ||||||
/// .read_csv("tests/data/example.csv", CsvReadOptions::new()) | ||||||
/// .await? | ||||||
/// .select(vec![ | ||||||
/// col("a").alias("a2"), | ||||||
/// col("b").alias("b2"), | ||||||
/// col("c").alias("c2"), | ||||||
/// ])?; | ||||||
/// let join_on = left.join_on( | ||||||
/// right, | ||||||
/// JoinType::Inner, | ||||||
/// [col("a").not_eq(col("a2")), col("b").not_eq(col("b2"))], | ||||||
/// )?; | ||||||
/// let batches = join_on.collect().await?; | ||||||
/// # Ok(()) | ||||||
/// # } | ||||||
/// ``` | ||||||
pub fn join_on( | ||||||
self, | ||||||
right: DataFrame, | ||||||
join_type: JoinType, | ||||||
on_exprs: impl IntoIterator<Item = Expr>, | ||||||
) -> Result<DataFrame> { | ||||||
let expr = on_exprs.into_iter().reduce(Expr::and); | ||||||
let plan = LogicalPlanBuilder::from(self.plan) | ||||||
.join( | ||||||
right.plan, | ||||||
join_type, | ||||||
(Vec::<Column>::new(), Vec::<Column>::new()), | ||||||
expr, | ||||||
)? | ||||||
.build()?; | ||||||
Ok(DataFrame::new(self.session_state, plan)) | ||||||
} | ||||||
|
||||||
/// Repartition a DataFrame based on a logical partitioning scheme. | ||||||
/// | ||||||
/// ``` | ||||||
|
@@ -1039,6 +1088,33 @@ mod tests { | |||||
Ok(()) | ||||||
} | ||||||
|
||||||
#[tokio::test] | ||||||
async fn join_on() -> Result<()> { | ||||||
let left = test_table_with_name("a") | ||||||
.await? | ||||||
.select_columns(&["c1", "c2"])?; | ||||||
let right = test_table_with_name("b") | ||||||
.await? | ||||||
.select_columns(&["c1", "c2"])?; | ||||||
let join = left.join_on( | ||||||
right, | ||||||
JoinType::Inner, | ||||||
[ | ||||||
col("a.c1").not_eq(col("b.c1")), | ||||||
col("a.c2").not_eq(col("b.c2")), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be possible to also add an equality predicate here, to demonstrate that such predicates are automatically recognized as equi-join predicates? Perhaps something like
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done as you suggested. It seems they are still considered part of the filter, though this seems to track with the explicit SQL version too. Edit: never mind, there's the |
||||||
], | ||||||
)?; | ||||||
|
||||||
let expected_plan = "Inner Join: Filter: a.c1 != b.c1 AND a.c2 != b.c2\ | ||||||
\n Projection: a.c1, a.c2\ | ||||||
\n TableScan: a\ | ||||||
\n Projection: b.c1, b.c2\ | ||||||
\n TableScan: b"; | ||||||
assert_eq!(expected_plan, format!("{:?}", join.logical_plan())); | ||||||
|
||||||
Ok(()) | ||||||
} | ||||||
|
||||||
#[tokio::test] | ||||||
async fn limit() -> Result<()> { | ||||||
// build query using Table API | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,10 @@ use crate::expr_rewriter::{ | |
normalize_cols, rewrite_sort_cols_by_aggs, | ||
}; | ||
use crate::type_coercion::binary::comparison_coercion; | ||
use crate::utils::{columnize_expr, compare_sort_expr, exprlist_to_fields, from_plan}; | ||
use crate::utils::{ | ||
columnize_expr, compare_sort_expr, ensure_any_column_reference_is_unambiguous, | ||
exprlist_to_fields, from_plan, | ||
}; | ||
use crate::{and, binary_expr, Operator}; | ||
use crate::{ | ||
logical_plan::{ | ||
|
@@ -502,6 +505,25 @@ impl LogicalPlanBuilder { | |
)); | ||
} | ||
|
||
let filter = if let Some(expr) = filter { | ||
// ambiguous check | ||
ensure_any_column_reference_is_unambiguous( | ||
&expr, | ||
&[self.schema(), right.schema()], | ||
)?; | ||
|
||
// normalize all columns in expression | ||
let using_columns = expr.to_columns()?; | ||
let filter = normalize_col_with_schemas( | ||
expr, | ||
&[self.schema(), right.schema()], | ||
&[using_columns], | ||
)?; | ||
Some(filter) | ||
} else { | ||
None | ||
}; | ||
|
||
Comment on lines
+508
to
+526
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related to #4196: this fixes a bug where a DataFrame join could be built with an ambiguous column in the filter expression. Instead of having the check done in both the DataFrame join API and the SQL planner's join module, it is unified by doing the check inside the logical plan builder. This is technically an unrelated fix to the actual issue, so I can extract it into a separate issue if needed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it is fine to include in this PR as long as it also has a test (for the ambiguity check using the DataFrame API). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Test added. |
||
let (left_keys, right_keys): (Vec<Result<Column>>, Vec<Result<Column>>) = | ||
join_keys | ||
.0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍 LGTM