Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Internal error: PhysicalOptimizer rule 'join_selection' failed, due to generate a different schema, when joining with metadata #8285

Closed
alamb opened this issue Nov 20, 2023 · 2 comments · Fixed by #8286
Assignees
Labels
bug Something isn't working

Comments

@alamb
Copy link
Contributor

alamb commented Nov 20, 2023

Describe the bug

We are seeing the following bug in our user queries:

Internal error: PhysicalOptimizer rule 'join_selection' failed, due to generate a different schema, original schema: Schema { fields: [Field { name: "id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, new schema: Schema { fields: [Field { name: "id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"metadata_key": "the id field"} }], metadata: {} }.

To Reproduce

Run a query like this where the relevant schema has metadata:

query error
SELECT "data"."id"
FROM
  (
    (SELECT "id" FROM "table_with_metadata")
      UNION
    (SELECT "id" FROM "table_with_metadata")
  ) as "data",
  (
    SELECT "id" FROM "table_with_metadata"
  ) as "samples"
WHERE "data"."id" = "samples"."id";
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index b2314f34f3..f5ab8f71aa 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -35,6 +35,7 @@ use datafusion::{
 };
 use datafusion_common::DataFusionError;
 use log::info;
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::Write;
 use std::path::Path;
@@ -57,8 +58,8 @@ impl TestContext {
         }
     }

-    /// Create a SessionContext, configured for the specific test, if
-    /// possible.
+    /// Create a SessionContext, configured for the specific sqllogictest
+    /// test(.slt file) , if possible.
     ///
     /// If `None` is returned (e.g. because some needed feature is not
     /// enabled), the file should be skipped
@@ -67,7 +68,7 @@ impl TestContext {
             // hardcode target partitions so plans are deterministic
             .with_target_partitions(4);

-        let test_ctx = TestContext::new(SessionContext::new_with_config(config));
+        let mut test_ctx = TestContext::new(SessionContext::new_with_config(config));

         let file_name = relative_path.file_name().unwrap().to_str().unwrap();
         match file_name {
@@ -86,10 +87,8 @@ impl TestContext {
             "avro.slt" => {
                 #[cfg(feature = "avro")]
                 {
-                    let mut test_ctx = test_ctx;
                     info!("Registering avro tables");
                     register_avro_tables(&mut test_ctx).await;
-                    return Some(test_ctx);
                 }
                 #[cfg(not(feature = "avro"))]
                 {
@@ -99,10 +98,11 @@ impl TestContext {
             }
             "joins.slt" => {
                 info!("Registering partition table tables");
-
-                let mut test_ctx = test_ctx;
                 register_partition_table(&mut test_ctx).await;
-                return Some(test_ctx);
+            }
+            "metadata.slt" => {
+                info!("Registering metadata table tables");
+                register_metadata_tables(test_ctx.session_ctx()).await;
             }
             _ => {
                 info!("Using default SessionContext");
@@ -299,3 +299,31 @@ fn table_with_many_types() -> Arc<dyn TableProvider> {
     let provider = MemTable::try_new(Arc::new(schema), vec![vec![batch]]).unwrap();
     Arc::new(provider)
 }
+
+/// Registers a table_with_metadata that contains both field level and Table level metadata
+pub async fn register_metadata_tables(ctx: &SessionContext) {
+    let id = Field::new("id", DataType::Int32, true).with_metadata(HashMap::from([(
+        String::from("metadata_key"),
+        String::from("the id field"),
+    )]));
+    let name = Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([(
+        String::from("metadata_key"),
+        String::from("the name field"),
+    )]));
+
+    let batch = RecordBatch::try_new(
+        Arc::new(schema),
+        vec![
+            Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _,
+            Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _,
+        ],
+    )
+    .unwrap();
+
+    ctx.register_batch("table_with_metadata", batch).unwrap();
+}
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
new file mode 100644
index 0000000000..a57eba2e3c
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## Tests for tables that has both metadata on each field as well as metadata on
+## the schema itself.
+##########
+
+query IT
+select * from table_with_metadata;
+----
+1 NULL
+NULL bar
+3 baz
+
+query I
+SELECT (
+  SELECT id FROM table_with_metadata
+  ) UNION (
+  SELECT id FROM table_with_metadata
+  );
+----
+1
+NULL
+3
+
+query error
+SELECT "data"."id"
+FROM
+  (
+    (SELECT "id" FROM "table_with_metadata")
+      UNION
+    (SELECT "id" FROM "table_with_metadata")
+  ) as "data",
+  (
+    SELECT "id" FROM "table_with_metadata"
+  ) as "samples"
+WHERE "data"."id" = "samples"."id";
+----
+DataFusion error: join_selection
+caused by
+Internal error: PhysicalOptimizer rule 'join_selection' failed, due to generate a different schema, original schema: Schema { fields: [Field { name: "id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, new schema: Schema { fields: [Field { name: "id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"metadata_key": "the id field"} }], metadata: {} }.
+This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker

Expected behavior

The query should run successfully

Additional context

No response

@alamb alamb added the bug Something isn't working label Nov 20, 2023
@alamb
Copy link
Contributor Author

alamb commented Nov 20, 2023

My suspicion is that this is a latent bug that is hit now that we have #8126

@alamb
Copy link
Contributor Author

alamb commented Nov 20, 2023

My suspicion is that this is a latent bug that is hit now that we have #8126

I believe my fix above confirms this suspicion

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant