Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Remove dangling table references in unparser #13405

Closed
116 changes: 114 additions & 2 deletions datafusion/sql/src/unparser/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ use core::fmt;

use sqlparser::ast;

use super::rewrite::remove_dangling_expr;

#[derive(Clone)]
pub(super) struct QueryBuilder {
with: Option<ast::With>,
Expand Down Expand Up @@ -238,7 +240,101 @@ impl SelectBuilder {
self.value_table_mode = value;
self
}
pub fn build(&self) -> Result<ast::Select, BuilderError> {
/// Collects every qualifier that an expression in this SELECT may legitimately use.
///
/// The result contains, in order:
/// * for the source relation described by `relation_builder`: its alias if it has one,
///   otherwise the identifiers derived from its table name;
/// * for each join of the last `from` entry: the alias of a table or derived relation,
///   or, for an un-aliased table, the identifiers derived from its name.
///
/// An un-aliased table contributes both its full dotted path (e.g. `catalog.schema.table`)
/// and its last path segment (e.g. `table`), so either spelling counts as valid.
/// Un-aliased derived tables and all other join relation kinds contribute nothing.
fn collect_valid_idents(&self, relation_builder: &RelationBuilder) -> Vec<String> {
    // Pushes the identifiers an un-aliased table can be referenced by: the leaf segment
    // (only when it differs from the full path) followed by the full dotted path.
    fn push_table_idents(all_idents: &mut Vec<String>, name: &ast::ObjectName) {
        let full_ident = name.to_string();
        if let Some(leaf) = name.0.last() {
            let leaf = leaf.to_string();
            if leaf != full_ident {
                all_idents.push(leaf);
            }
        }
        all_idents.push(full_ident);
    }

    let mut all_idents = Vec::new();

    // An alias hides the underlying table name, so it takes precedence over the name.
    if let Some(source_alias) = relation_builder.get_alias() {
        all_idents.push(source_alias);
    } else if let Some(source_name) = relation_builder.get_name() {
        push_table_idents(&mut all_idents, source_name);
    }

    if let Some(twg) = self.from.last() {
        for join in &twg.joins {
            match &join.relation {
                ast::TableFactor::Table {
                    alias: Some(alias), ..
                }
                | ast::TableFactor::Derived {
                    alias: Some(alias), ..
                } => all_idents.push(alias.name.to_string()),
                ast::TableFactor::Table {
                    alias: None, name, ..
                } => push_table_idents(&mut all_idents, name),
                _ => {}
            }
        }
    }

    all_idents
}

/// Strips table qualifiers that do not refer to any relation of this SELECT.
///
/// The set of valid qualifiers comes from [`Self::collect_valid_idents`]. Every expression in
/// the projection (aliased or not), the selection, the sort-by list, the group-by expression
/// list, and the `order_by` of `query` (when a query builder is supplied) is passed through
/// `remove_dangling_expr`, which rewrites the expression kinds it knows how to traverse and
/// returns all others unchanged.
///
/// * `query` - enclosing query builder, if any; only its `order_by` entries are rewritten.
/// * `relation_builder` - the source relation, used to derive the valid qualifiers.
fn remove_dangling_identifiers(
    &mut self,
    query: &mut Option<QueryBuilder>,
    relation_builder: &RelationBuilder,
) {
    let all_idents = self.collect_valid_idents(relation_builder);

    // Projection: an aliased item (`expr AS alias`) can carry a dangling qualifier just like
    // an unaliased one, so both forms are rewritten. Wildcards hold no expression.
    for select_item in self.projection.iter_mut() {
        if let ast::SelectItem::UnnamedExpr(expr)
        | ast::SelectItem::ExprWithAlias { expr, .. } = select_item
        {
            *expr = remove_dangling_expr(expr.clone(), &all_idents);
        }
    }

    // Selection: move the expression out and back in instead of cloning the whole tree.
    self.selection = self
        .selection
        .take()
        .map(|expr| remove_dangling_expr(expr, &all_idents));

    // ORDER BY may live on the enclosing query builder...
    if let Some(query) = query.as_mut() {
        for sort_item in query.order_by.iter_mut() {
            sort_item.expr = remove_dangling_expr(sort_item.expr.clone(), &all_idents);
        }
    }

    // ...or as a sort attached to the select builder itself.
    for sort_item in self.sort_by.iter_mut() {
        *sort_item = remove_dangling_expr(sort_item.clone(), &all_idents);
    }

    // GROUP BY: only an explicit expression list is rewritten.
    if let Some(ast::GroupByExpr::Expressions(group_by, _)) = self.group_by.as_mut() {
        for expr in group_by.iter_mut() {
            *expr = remove_dangling_expr(expr.clone(), &all_idents);
        }
    }
}
pub fn build(
&mut self,
query: &mut Option<QueryBuilder>,
relation_builder: &RelationBuilder,
) -> Result<ast::Select, BuilderError> {
self.remove_dangling_identifiers(query, relation_builder);

Ok(ast::Select {
distinct: self.distinct.clone(),
top: self.top.clone(),
Expand Down Expand Up @@ -307,7 +403,6 @@ impl TableWithJoinsBuilder {
self.relation = Some(value);
self
}

pub fn joins(&mut self, value: Vec<ast::Join>) -> &mut Self {
self.joins = value;
self
Expand Down Expand Up @@ -360,6 +455,23 @@ impl RelationBuilder {
/// Returns `true` when a relation is currently set on this builder (`self.relation` is `Some`).
pub fn has_relation(&self) -> bool {
self.relation.is_some()
}
/// Returns the table name of the relation held by this builder, if there is one.
///
/// Only a `TableFactorBuilder::Table` relation can yield a name; any other relation kind,
/// or no relation at all, yields `None`.
pub fn get_name(&self) -> Option<&ast::ObjectName> {
    if let Some(TableFactorBuilder::Table(table)) = &self.relation {
        table.name.as_ref()
    } else {
        None
    }
}
pub fn get_alias(&self) -> Option<String> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can probably avoid a bunch of copies if you made this return a reference to a &str rather than a String -- if the caller needed the string they can always copy it.

Suggested change
pub fn get_alias(&self) -> Option<String> {
pub fn get_alias(&self) -> Option<&str> {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like Ident doesn't implement anything that would return a &str, so it needs a String intermediary. I'm also not sure what copies you're referring to; I don't make any copies of the values from collect_valid_idents. The return from get_alias also isn't cloned, and is taken ownership of by collect_valid_idents.

match self.relation {
Some(TableFactorBuilder::Table(ref value)) => {
value.alias.as_ref().map(|a| a.name.to_string())
}
Some(TableFactorBuilder::Derived(ref value)) => {
value.alias.as_ref().map(|a| a.name.to_string())
}
_ => None,
}
}
pub fn table(&mut self, value: TableRelationBuilder) -> &mut Self {
self.relation = Some(TableFactorBuilder::Table(value));
self
Expand Down
6 changes: 4 additions & 2 deletions datafusion/sql/src/unparser/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,12 @@ impl Unparser<'_> {
}

let mut twj = select_builder.pop_from().unwrap();
twj.relation(relation_builder);
twj.relation(relation_builder.clone());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why this needs to have a clone now 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because twj.relation() takes ownership of the relation_builder causing it to move, so we can't borrow it again later.

The only thing I use the relation builder for is to retrieve the list of all the identifiers, so I could probably do that before the twj.relation() then just pass those like:

let valid_idents = select_builder.collect_valid_idents();
twj.relation();

which shouldn't require a clone.

select_builder.push_from(twj);

Ok(SetExpr::Select(Box::new(select_builder.build()?)))
Ok(SetExpr::Select(Box::new(
select_builder.build(query, &relation_builder)?,
)))
}

/// Reconstructs a SELECT SQL statement from a logical plan by unprojecting column expressions
Expand Down
137 changes: 136 additions & 1 deletion datafusion/sql/src/unparser/rewrite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use datafusion_common::{
};
use datafusion_expr::{expr::Alias, tree_node::transform_sort_vec};
use datafusion_expr::{Expr, LogicalPlan, Projection, Sort, SortExpr};
use sqlparser::ast::Ident;
use sqlparser::ast::{self, display_separated, Ident};

/// Normalize the schema of a union plan to remove qualifiers from the schema fields and sort expressions.
///
Expand Down Expand Up @@ -363,3 +363,138 @@ impl TreeNodeRewriter for TableAliasRewriter<'_> {
}
}
}

/// Takes an input list of identifiers and a list of identifiers that are available from relations or joins.
/// Removes any table identifiers that are not present in the list of available identifiers, retains original column names.
pub fn remove_dangling_identifiers(idents: &mut Vec<Ident>, available_idents: &[String]) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this code super deeply, but this seems to me like it is treating the symptom (incorrect qualifiers) rather than the root cause.

Specifically, did you look into fixing the code so that it didn't create incorrect identifiers in the first place, rather than trying to modify the created AST after the fact to remove incorrect identifiers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I had taken a look into doing this at the unparser LogicalPlan level but I wasn't making very good progress. It could be my lack of understanding with LogicalPlan, but I think the symptom originates from the parser rather than the unparser.

If you'd be open to merging this still as an AST modifier, perhaps we could gate it behind a dialect option or feature flag as a non-default?

if idents.len() > 1 {
let ident_source = display_separated(
&idents
.clone()
.into_iter()
.take(idents.len() - 1)
.collect::<Vec<Ident>>(),
".",
)
.to_string();
// If the identifier is not present in the list of all identifiers, it refers to a table that does not exist
if !available_idents.contains(&ident_source) {
let Some(last) = idents.last() else {
unreachable!("CompoundIdentifier must have a last element");
};
// Reset the identifiers to only the last element, which is the column name
*idents = vec![last.clone()];
}
}
}

/// Removes dangling table qualifiers from `expr` and returns the rewritten expression.
///
/// A qualifier is dangling when it is not listed in `available_idents`; the decision for a
/// single compound identifier is made by [`remove_dangling_identifiers`].
///
/// The function recurses into the following expression kinds:
/// * binary operations (both operands),
/// * parenthesised (nested) expressions,
/// * compound identifiers, which collapse to a plain identifier when only the column name
///   remains,
/// * function calls, for expression arguments (named or unnamed) of an argument list.
///
/// A function's `filter`, `over` and `within_group` clauses are not traversed, and every
/// other expression kind is returned unchanged.
pub fn remove_dangling_expr(expr: ast::Expr, available_idents: &[String]) -> ast::Expr {
    match expr {
        ast::Expr::BinaryOp { left, op, right } => ast::Expr::BinaryOp {
            left: Box::new(remove_dangling_expr(*left, available_idents)),
            op,
            right: Box::new(remove_dangling_expr(*right, available_idents)),
        },
        ast::Expr::Nested(inner) => {
            ast::Expr::Nested(Box::new(remove_dangling_expr(*inner, available_idents)))
        }
        // `idents` is already owned here, so it can be edited in place without cloning.
        ast::Expr::CompoundIdentifier(mut idents) => {
            remove_dangling_identifiers(&mut idents, available_idents);

            if idents.len() == 1 {
                // Only the column name is left: emit a plain identifier.
                ast::Expr::Identifier(idents.remove(0))
            } else {
                // Still qualified. A (malformed) empty path is passed through untouched
                // rather than aborting the whole unparse.
                ast::Expr::CompoundIdentifier(idents)
            }
        }
        // Mutate the function in place rather than destructuring every field, so this arm
        // does not depend on the exact field set of `ast::Function`.
        ast::Expr::Function(mut func) => {
            if let ast::FunctionArguments::List(list) = &mut func.args {
                for arg in list.args.iter_mut() {
                    if let ast::FunctionArg::Named {
                        arg: ast::FunctionArgExpr::Expr(arg_expr),
                        ..
                    }
                    | ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(arg_expr)) = arg
                    {
                        *arg_expr = remove_dangling_expr(arg_expr.clone(), available_idents);
                    }
                }
            }

            ast::Expr::Function(func)
        }
        other => other,
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Runs `remove_dangling_identifiers` on `table1.table2.column1` against several sets of
    /// available identifiers and checks the resulting identifier path.
    #[test]
    fn test_remove_dangling_identifiers() {
        let ident = |name: &str| Ident::new(name.to_string());

        // (available identifiers, expected identifier path after the call)
        let cases: Vec<(Vec<String>, Vec<Ident>)> = vec![
            // Nothing is available: the qualifier is dropped.
            (vec![], vec![ident("column1")]),
            // The full qualifier is available: the path is kept as is.
            (
                vec!["table1.table2".to_string()],
                vec![ident("table1"), ident("table2"), ident("column1")],
            ),
            // Only a prefix of the qualifier is available: the qualifier is dropped.
            (vec!["table1".to_string()], vec![ident("column1")]),
        ];

        for (available, expected) in cases {
            let mut idents = vec![ident("table1"), ident("table2"), ident("column1")];

            remove_dangling_identifiers(&mut idents, &available);
            assert_eq!(idents, expected);
        }
    }
}
Loading